| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 4.0, | |
| "eval_steps": 500, | |
| "global_step": 4156, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004812319538017324, | |
| "grad_norm": 1.9419618977477497, | |
| "learning_rate": 6.009615384615385e-07, | |
| "loss": 0.5357, | |
| "mean_token_accuracy": 0.8665974557399749, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.009624639076034648, | |
| "grad_norm": 1.5458023168394313, | |
| "learning_rate": 1.201923076923077e-06, | |
| "loss": 0.5268, | |
| "mean_token_accuracy": 0.8675418138504029, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.014436958614051972, | |
| "grad_norm": 0.7434707096020993, | |
| "learning_rate": 1.8028846153846153e-06, | |
| "loss": 0.4966, | |
| "mean_token_accuracy": 0.8705106377601624, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.019249278152069296, | |
| "grad_norm": 0.7866351393481213, | |
| "learning_rate": 2.403846153846154e-06, | |
| "loss": 0.4663, | |
| "mean_token_accuracy": 0.8728387534618378, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.02406159769008662, | |
| "grad_norm": 0.4245528856490772, | |
| "learning_rate": 3.0048076923076927e-06, | |
| "loss": 0.4488, | |
| "mean_token_accuracy": 0.8763940989971161, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.028873917228103944, | |
| "grad_norm": 0.32781111140311164, | |
| "learning_rate": 3.6057692307692307e-06, | |
| "loss": 0.4216, | |
| "mean_token_accuracy": 0.8823561608791352, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03368623676612127, | |
| "grad_norm": 0.31949279692835963, | |
| "learning_rate": 4.20673076923077e-06, | |
| "loss": 0.402, | |
| "mean_token_accuracy": 0.886776065826416, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03849855630413859, | |
| "grad_norm": 0.274823992226846, | |
| "learning_rate": 4.807692307692308e-06, | |
| "loss": 0.3827, | |
| "mean_token_accuracy": 0.8909931480884552, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04331087584215592, | |
| "grad_norm": 0.2258462276334408, | |
| "learning_rate": 5.408653846153847e-06, | |
| "loss": 0.3782, | |
| "mean_token_accuracy": 0.8906286716461181, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.04812319538017324, | |
| "grad_norm": 0.2042958344104305, | |
| "learning_rate": 6.0096153846153855e-06, | |
| "loss": 0.3686, | |
| "mean_token_accuracy": 0.8932027518749237, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05293551491819057, | |
| "grad_norm": 0.1916808494096459, | |
| "learning_rate": 6.610576923076923e-06, | |
| "loss": 0.3615, | |
| "mean_token_accuracy": 0.8949079632759094, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.05774783445620789, | |
| "grad_norm": 0.19017708248972615, | |
| "learning_rate": 7.211538461538461e-06, | |
| "loss": 0.3611, | |
| "mean_token_accuracy": 0.8945391714572907, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.06256015399422522, | |
| "grad_norm": 0.19480677601380023, | |
| "learning_rate": 7.8125e-06, | |
| "loss": 0.3582, | |
| "mean_token_accuracy": 0.8952944934368133, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06737247353224254, | |
| "grad_norm": 0.19602952055969808, | |
| "learning_rate": 8.41346153846154e-06, | |
| "loss": 0.3503, | |
| "mean_token_accuracy": 0.8969010174274444, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.07218479307025986, | |
| "grad_norm": 0.18358230530950115, | |
| "learning_rate": 9.014423076923078e-06, | |
| "loss": 0.3552, | |
| "mean_token_accuracy": 0.8956303000450134, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.07699711260827719, | |
| "grad_norm": 0.20794763818226286, | |
| "learning_rate": 9.615384615384616e-06, | |
| "loss": 0.346, | |
| "mean_token_accuracy": 0.8976047098636627, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08180943214629452, | |
| "grad_norm": 0.19335399050885602, | |
| "learning_rate": 1.0216346153846154e-05, | |
| "loss": 0.3397, | |
| "mean_token_accuracy": 0.8996769070625306, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.08662175168431184, | |
| "grad_norm": 0.18605610669108424, | |
| "learning_rate": 1.0817307692307693e-05, | |
| "loss": 0.3453, | |
| "mean_token_accuracy": 0.8974903225898743, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.09143407122232916, | |
| "grad_norm": 0.20447369849678262, | |
| "learning_rate": 1.1418269230769231e-05, | |
| "loss": 0.3393, | |
| "mean_token_accuracy": 0.8994290769100189, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.09624639076034648, | |
| "grad_norm": 0.20334310471599598, | |
| "learning_rate": 1.2019230769230771e-05, | |
| "loss": 0.3388, | |
| "mean_token_accuracy": 0.8990323603153229, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.10105871029836382, | |
| "grad_norm": 0.19881436029829386, | |
| "learning_rate": 1.2620192307692307e-05, | |
| "loss": 0.3388, | |
| "mean_token_accuracy": 0.898909044265747, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.10587102983638114, | |
| "grad_norm": 0.19476317612619345, | |
| "learning_rate": 1.3221153846153847e-05, | |
| "loss": 0.3395, | |
| "mean_token_accuracy": 0.898524421453476, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.11068334937439846, | |
| "grad_norm": 0.19435557042763899, | |
| "learning_rate": 1.3822115384615386e-05, | |
| "loss": 0.3347, | |
| "mean_token_accuracy": 0.899581927061081, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.11549566891241578, | |
| "grad_norm": 0.23325224855003826, | |
| "learning_rate": 1.4423076923076923e-05, | |
| "loss": 0.3317, | |
| "mean_token_accuracy": 0.9008954703807831, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.12030798845043311, | |
| "grad_norm": 0.20039562214955578, | |
| "learning_rate": 1.5024038461538462e-05, | |
| "loss": 0.3316, | |
| "mean_token_accuracy": 0.9005917489528656, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.12512030798845045, | |
| "grad_norm": 0.20930806046810285, | |
| "learning_rate": 1.5625e-05, | |
| "loss": 0.3238, | |
| "mean_token_accuracy": 0.9023682653903962, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12993262752646775, | |
| "grad_norm": 0.21536212756436598, | |
| "learning_rate": 1.6225961538461538e-05, | |
| "loss": 0.3251, | |
| "mean_token_accuracy": 0.9019112467765809, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.1347449470644851, | |
| "grad_norm": 0.23642271742267582, | |
| "learning_rate": 1.682692307692308e-05, | |
| "loss": 0.327, | |
| "mean_token_accuracy": 0.9015033841133118, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1395572666025024, | |
| "grad_norm": 0.21465808428552932, | |
| "learning_rate": 1.7427884615384614e-05, | |
| "loss": 0.3223, | |
| "mean_token_accuracy": 0.9023724615573883, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.14436958614051973, | |
| "grad_norm": 0.20899100670314785, | |
| "learning_rate": 1.8028846153846156e-05, | |
| "loss": 0.3177, | |
| "mean_token_accuracy": 0.9037426590919495, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.14918190567853706, | |
| "grad_norm": 0.21837434793284802, | |
| "learning_rate": 1.8629807692307693e-05, | |
| "loss": 0.3206, | |
| "mean_token_accuracy": 0.9032465398311615, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.15399422521655437, | |
| "grad_norm": 0.21839337731942585, | |
| "learning_rate": 1.923076923076923e-05, | |
| "loss": 0.3161, | |
| "mean_token_accuracy": 0.9043103694915772, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.1588065447545717, | |
| "grad_norm": 0.2318516971176114, | |
| "learning_rate": 1.983173076923077e-05, | |
| "loss": 0.3194, | |
| "mean_token_accuracy": 0.9029073655605316, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.16361886429258904, | |
| "grad_norm": 0.2367924022344465, | |
| "learning_rate": 2.0432692307692307e-05, | |
| "loss": 0.3194, | |
| "mean_token_accuracy": 0.9026305794715881, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.16843118383060635, | |
| "grad_norm": 0.23949609066198496, | |
| "learning_rate": 2.103365384615385e-05, | |
| "loss": 0.3114, | |
| "mean_token_accuracy": 0.9051806688308716, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.17324350336862368, | |
| "grad_norm": 0.21148546656054262, | |
| "learning_rate": 2.1634615384615387e-05, | |
| "loss": 0.3116, | |
| "mean_token_accuracy": 0.9051199972629547, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.17805582290664101, | |
| "grad_norm": 0.2060879719306461, | |
| "learning_rate": 2.223557692307692e-05, | |
| "loss": 0.3098, | |
| "mean_token_accuracy": 0.9056381642818451, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.18286814244465832, | |
| "grad_norm": 0.22995749054558867, | |
| "learning_rate": 2.2836538461538463e-05, | |
| "loss": 0.3064, | |
| "mean_token_accuracy": 0.9059844076633453, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.18768046198267566, | |
| "grad_norm": 0.2529943760836344, | |
| "learning_rate": 2.34375e-05, | |
| "loss": 0.309, | |
| "mean_token_accuracy": 0.9060332119464874, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.19249278152069296, | |
| "grad_norm": 0.22924511849103366, | |
| "learning_rate": 2.4038461538461542e-05, | |
| "loss": 0.3068, | |
| "mean_token_accuracy": 0.9060187816619873, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1973051010587103, | |
| "grad_norm": 0.22157933979497463, | |
| "learning_rate": 2.463942307692308e-05, | |
| "loss": 0.3033, | |
| "mean_token_accuracy": 0.9067791402339935, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.20211742059672763, | |
| "grad_norm": 0.2124158678504148, | |
| "learning_rate": 2.5240384615384614e-05, | |
| "loss": 0.3027, | |
| "mean_token_accuracy": 0.9069958984851837, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.20692974013474494, | |
| "grad_norm": 0.2120949513943335, | |
| "learning_rate": 2.584134615384616e-05, | |
| "loss": 0.3035, | |
| "mean_token_accuracy": 0.9071210026741028, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.21174205967276227, | |
| "grad_norm": 0.22006803152937704, | |
| "learning_rate": 2.6442307692307694e-05, | |
| "loss": 0.2977, | |
| "mean_token_accuracy": 0.9082063376903534, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2165543792107796, | |
| "grad_norm": 0.23505752640023267, | |
| "learning_rate": 2.704326923076923e-05, | |
| "loss": 0.2964, | |
| "mean_token_accuracy": 0.9089667618274688, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.22136669874879691, | |
| "grad_norm": 0.2225033077639756, | |
| "learning_rate": 2.7644230769230773e-05, | |
| "loss": 0.2926, | |
| "mean_token_accuracy": 0.9100411355495452, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.22617901828681425, | |
| "grad_norm": 0.22772871117888155, | |
| "learning_rate": 2.8245192307692307e-05, | |
| "loss": 0.2939, | |
| "mean_token_accuracy": 0.9091685652732849, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.23099133782483156, | |
| "grad_norm": 0.24694973761176517, | |
| "learning_rate": 2.8846153846153845e-05, | |
| "loss": 0.2926, | |
| "mean_token_accuracy": 0.9095185458660126, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2358036573628489, | |
| "grad_norm": 0.2637613691705069, | |
| "learning_rate": 2.9447115384615387e-05, | |
| "loss": 0.2891, | |
| "mean_token_accuracy": 0.9101428985595703, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.24061597690086622, | |
| "grad_norm": 0.2713361047071815, | |
| "learning_rate": 3.0048076923076925e-05, | |
| "loss": 0.2886, | |
| "mean_token_accuracy": 0.9104502618312835, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.24542829643888353, | |
| "grad_norm": 0.2553877523921197, | |
| "learning_rate": 3.064903846153846e-05, | |
| "loss": 0.2859, | |
| "mean_token_accuracy": 0.911614739894867, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.2502406159769009, | |
| "grad_norm": 0.24431375122707405, | |
| "learning_rate": 3.125e-05, | |
| "loss": 0.2854, | |
| "mean_token_accuracy": 0.9113099038600921, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.2550529355149182, | |
| "grad_norm": 0.30751784797969783, | |
| "learning_rate": 3.185096153846154e-05, | |
| "loss": 0.2825, | |
| "mean_token_accuracy": 0.9123991250991821, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.2598652550529355, | |
| "grad_norm": 0.24501943187653763, | |
| "learning_rate": 3.2451923076923077e-05, | |
| "loss": 0.2812, | |
| "mean_token_accuracy": 0.9123341858386993, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.2646775745909528, | |
| "grad_norm": 0.25184725821327375, | |
| "learning_rate": 3.3052884615384615e-05, | |
| "loss": 0.2845, | |
| "mean_token_accuracy": 0.9116441786289216, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.2694898941289702, | |
| "grad_norm": 0.2500241308295983, | |
| "learning_rate": 3.365384615384616e-05, | |
| "loss": 0.2736, | |
| "mean_token_accuracy": 0.9142520189285278, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.2743022136669875, | |
| "grad_norm": 0.23710174110673563, | |
| "learning_rate": 3.42548076923077e-05, | |
| "loss": 0.2757, | |
| "mean_token_accuracy": 0.9137245714664459, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.2791145332050048, | |
| "grad_norm": 0.23148187198017783, | |
| "learning_rate": 3.485576923076923e-05, | |
| "loss": 0.2741, | |
| "mean_token_accuracy": 0.9145014345645904, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.28392685274302215, | |
| "grad_norm": 0.26370964484731635, | |
| "learning_rate": 3.545673076923077e-05, | |
| "loss": 0.2724, | |
| "mean_token_accuracy": 0.9147723019123077, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.28873917228103946, | |
| "grad_norm": 0.21633873707198026, | |
| "learning_rate": 3.605769230769231e-05, | |
| "loss": 0.273, | |
| "mean_token_accuracy": 0.9143404364585876, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.29355149181905676, | |
| "grad_norm": 0.23443037076808915, | |
| "learning_rate": 3.665865384615384e-05, | |
| "loss": 0.2735, | |
| "mean_token_accuracy": 0.9144095242023468, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.2983638113570741, | |
| "grad_norm": 0.23769555202260598, | |
| "learning_rate": 3.725961538461539e-05, | |
| "loss": 0.2645, | |
| "mean_token_accuracy": 0.9169483184814453, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.30317613089509143, | |
| "grad_norm": 0.24097323319080186, | |
| "learning_rate": 3.7860576923076925e-05, | |
| "loss": 0.2655, | |
| "mean_token_accuracy": 0.9172016143798828, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.30798845043310874, | |
| "grad_norm": 0.2260059541937123, | |
| "learning_rate": 3.846153846153846e-05, | |
| "loss": 0.2633, | |
| "mean_token_accuracy": 0.9176544308662414, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.3128007699711261, | |
| "grad_norm": 0.24799131337044003, | |
| "learning_rate": 3.90625e-05, | |
| "loss": 0.2654, | |
| "mean_token_accuracy": 0.9168532133102417, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.3176130895091434, | |
| "grad_norm": 0.24231727826318275, | |
| "learning_rate": 3.966346153846154e-05, | |
| "loss": 0.26, | |
| "mean_token_accuracy": 0.9179026305675506, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3224254090471607, | |
| "grad_norm": 0.2207228184921339, | |
| "learning_rate": 4.0264423076923083e-05, | |
| "loss": 0.2635, | |
| "mean_token_accuracy": 0.9170978605747223, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.3272377285851781, | |
| "grad_norm": 0.21630836192516414, | |
| "learning_rate": 4.0865384615384615e-05, | |
| "loss": 0.2623, | |
| "mean_token_accuracy": 0.9176133811473847, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3320500481231954, | |
| "grad_norm": 0.26861089976837044, | |
| "learning_rate": 4.146634615384616e-05, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9187050104141236, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.3368623676612127, | |
| "grad_norm": 0.23582082268676752, | |
| "learning_rate": 4.20673076923077e-05, | |
| "loss": 0.2615, | |
| "mean_token_accuracy": 0.9176427960395813, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.34167468719923005, | |
| "grad_norm": 0.24694848708005535, | |
| "learning_rate": 4.266826923076923e-05, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9186454594135285, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.34648700673724736, | |
| "grad_norm": 0.23331931221260077, | |
| "learning_rate": 4.326923076923077e-05, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9187827825546264, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.35129932627526467, | |
| "grad_norm": 0.23630300636599155, | |
| "learning_rate": 4.387019230769231e-05, | |
| "loss": 0.2533, | |
| "mean_token_accuracy": 0.9199528455734253, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.35611164581328203, | |
| "grad_norm": 0.20622696018078374, | |
| "learning_rate": 4.447115384615384e-05, | |
| "loss": 0.2459, | |
| "mean_token_accuracy": 0.9218752324581146, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.36092396535129934, | |
| "grad_norm": 0.24595357029780027, | |
| "learning_rate": 4.507211538461539e-05, | |
| "loss": 0.2491, | |
| "mean_token_accuracy": 0.9211262464523315, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.36573628488931664, | |
| "grad_norm": 0.2618254972502607, | |
| "learning_rate": 4.5673076923076925e-05, | |
| "loss": 0.2475, | |
| "mean_token_accuracy": 0.9219158530235291, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.37054860442733395, | |
| "grad_norm": 0.21915443484232988, | |
| "learning_rate": 4.627403846153846e-05, | |
| "loss": 0.248, | |
| "mean_token_accuracy": 0.9213620781898498, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.3753609239653513, | |
| "grad_norm": 0.23933759541395727, | |
| "learning_rate": 4.6875e-05, | |
| "loss": 0.245, | |
| "mean_token_accuracy": 0.9222877621650696, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3801732435033686, | |
| "grad_norm": 0.2387952859840412, | |
| "learning_rate": 4.747596153846154e-05, | |
| "loss": 0.2469, | |
| "mean_token_accuracy": 0.9217739880084992, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.3849855630413859, | |
| "grad_norm": 0.23642623807370478, | |
| "learning_rate": 4.8076923076923084e-05, | |
| "loss": 0.2448, | |
| "mean_token_accuracy": 0.9224328458309173, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.3897978825794033, | |
| "grad_norm": 0.2293352739617301, | |
| "learning_rate": 4.8677884615384615e-05, | |
| "loss": 0.2435, | |
| "mean_token_accuracy": 0.9224577963352203, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.3946102021174206, | |
| "grad_norm": 0.19173353682080843, | |
| "learning_rate": 4.927884615384616e-05, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.923109644651413, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3994225216554379, | |
| "grad_norm": 0.20164383913127248, | |
| "learning_rate": 4.98798076923077e-05, | |
| "loss": 0.2439, | |
| "mean_token_accuracy": 0.9230926752090454, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.40423484119345526, | |
| "grad_norm": 0.20018694222561512, | |
| "learning_rate": 4.9999872992713485e-05, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.9232556998729706, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.40904716073147257, | |
| "grad_norm": 0.1949539861498279, | |
| "learning_rate": 4.9999357028069456e-05, | |
| "loss": 0.2391, | |
| "mean_token_accuracy": 0.9239628553390503, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.4138594802694899, | |
| "grad_norm": 0.1925271480624706, | |
| "learning_rate": 4.9998444177207064e-05, | |
| "loss": 0.2439, | |
| "mean_token_accuracy": 0.9226640999317169, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.41867179980750724, | |
| "grad_norm": 0.2111608513813791, | |
| "learning_rate": 4.9997134456228895e-05, | |
| "loss": 0.2371, | |
| "mean_token_accuracy": 0.9244004487991333, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.42348411934552455, | |
| "grad_norm": 0.20999155433721162, | |
| "learning_rate": 4.999542788823828e-05, | |
| "loss": 0.2401, | |
| "mean_token_accuracy": 0.9236264228820801, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.42829643888354185, | |
| "grad_norm": 0.23287917636614017, | |
| "learning_rate": 4.999332450333892e-05, | |
| "loss": 0.2419, | |
| "mean_token_accuracy": 0.9232407748699188, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.4331087584215592, | |
| "grad_norm": 0.2168828989727269, | |
| "learning_rate": 4.999082433863426e-05, | |
| "loss": 0.2355, | |
| "mean_token_accuracy": 0.9251596629619598, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4379210779595765, | |
| "grad_norm": 0.20690000441380088, | |
| "learning_rate": 4.998792743822695e-05, | |
| "loss": 0.2329, | |
| "mean_token_accuracy": 0.9252615332603454, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.44273339749759383, | |
| "grad_norm": 0.198562794932213, | |
| "learning_rate": 4.998463385321802e-05, | |
| "loss": 0.2328, | |
| "mean_token_accuracy": 0.9258227527141571, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4475457170356112, | |
| "grad_norm": 0.19965813798162313, | |
| "learning_rate": 4.998094364170592e-05, | |
| "loss": 0.2321, | |
| "mean_token_accuracy": 0.9261925756931305, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.4523580365736285, | |
| "grad_norm": 0.19707596816161657, | |
| "learning_rate": 4.997685686878559e-05, | |
| "loss": 0.2322, | |
| "mean_token_accuracy": 0.9263604760169983, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4571703561116458, | |
| "grad_norm": 0.2120036055302031, | |
| "learning_rate": 4.997237360654728e-05, | |
| "loss": 0.2359, | |
| "mean_token_accuracy": 0.9246505975723267, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.4619826756496631, | |
| "grad_norm": 0.1955923914299568, | |
| "learning_rate": 4.9967493934075225e-05, | |
| "loss": 0.2277, | |
| "mean_token_accuracy": 0.9271229326725006, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4667949951876805, | |
| "grad_norm": 0.18845711503685067, | |
| "learning_rate": 4.996221793744633e-05, | |
| "loss": 0.2309, | |
| "mean_token_accuracy": 0.926458764076233, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.4716073147256978, | |
| "grad_norm": 0.1916514807492055, | |
| "learning_rate": 4.9956545709728607e-05, | |
| "loss": 0.2311, | |
| "mean_token_accuracy": 0.9265025436878205, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.4764196342637151, | |
| "grad_norm": 0.1831174754255436, | |
| "learning_rate": 4.995047735097953e-05, | |
| "loss": 0.2264, | |
| "mean_token_accuracy": 0.9274278402328491, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.48123195380173245, | |
| "grad_norm": 0.19118428984130756, | |
| "learning_rate": 4.994401296824429e-05, | |
| "loss": 0.2285, | |
| "mean_token_accuracy": 0.9272461295127868, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.48604427333974976, | |
| "grad_norm": 0.18882992917341843, | |
| "learning_rate": 4.993715267555391e-05, | |
| "loss": 0.2233, | |
| "mean_token_accuracy": 0.9287059664726257, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.49085659287776706, | |
| "grad_norm": 0.1850369085704065, | |
| "learning_rate": 4.9929896593923186e-05, | |
| "loss": 0.2274, | |
| "mean_token_accuracy": 0.9275294721126557, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4956689124157844, | |
| "grad_norm": 0.18326425325736875, | |
| "learning_rate": 4.992224485134863e-05, | |
| "loss": 0.2226, | |
| "mean_token_accuracy": 0.9288658618927002, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.5004812319538018, | |
| "grad_norm": 0.1751980566027487, | |
| "learning_rate": 4.9914197582806145e-05, | |
| "loss": 0.2234, | |
| "mean_token_accuracy": 0.9285242080688476, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5052935514918191, | |
| "grad_norm": 0.1866874844733189, | |
| "learning_rate": 4.990575493024867e-05, | |
| "loss": 0.2188, | |
| "mean_token_accuracy": 0.9300346314907074, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.5101058710298364, | |
| "grad_norm": 0.17980121352178607, | |
| "learning_rate": 4.98969170426037e-05, | |
| "loss": 0.22, | |
| "mean_token_accuracy": 0.9295093834400177, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5149181905678537, | |
| "grad_norm": 0.19194874922176253, | |
| "learning_rate": 4.988768407577059e-05, | |
| "loss": 0.217, | |
| "mean_token_accuracy": 0.9303382456302642, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.519730510105871, | |
| "grad_norm": 0.18151511666914738, | |
| "learning_rate": 4.9878056192617887e-05, | |
| "loss": 0.2193, | |
| "mean_token_accuracy": 0.9298461735248565, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.5245428296438883, | |
| "grad_norm": 0.20277802455653215, | |
| "learning_rate": 4.986803356298041e-05, | |
| "loss": 0.2151, | |
| "mean_token_accuracy": 0.9310078263282776, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.5293551491819056, | |
| "grad_norm": 0.18836634574755295, | |
| "learning_rate": 4.9857616363656254e-05, | |
| "loss": 0.2176, | |
| "mean_token_accuracy": 0.9302059590816498, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.534167468719923, | |
| "grad_norm": 0.170190135757744, | |
| "learning_rate": 4.9846804778403684e-05, | |
| "loss": 0.216, | |
| "mean_token_accuracy": 0.9306394636631012, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.5389797882579404, | |
| "grad_norm": 0.17792241359141722, | |
| "learning_rate": 4.9835598997937886e-05, | |
| "loss": 0.2142, | |
| "mean_token_accuracy": 0.9311121761798858, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.5437921077959577, | |
| "grad_norm": 0.17146397198256894, | |
| "learning_rate": 4.982399921992762e-05, | |
| "loss": 0.2154, | |
| "mean_token_accuracy": 0.9308123111724853, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.548604427333975, | |
| "grad_norm": 0.17134968955445928, | |
| "learning_rate": 4.9812005648991715e-05, | |
| "loss": 0.2151, | |
| "mean_token_accuracy": 0.930792760848999, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.5534167468719923, | |
| "grad_norm": 0.1820481148200458, | |
| "learning_rate": 4.979961849669546e-05, | |
| "loss": 0.2124, | |
| "mean_token_accuracy": 0.9313820898532867, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.5582290664100096, | |
| "grad_norm": 0.16450930819806728, | |
| "learning_rate": 4.978683798154687e-05, | |
| "loss": 0.2142, | |
| "mean_token_accuracy": 0.9314321875572205, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.563041385948027, | |
| "grad_norm": 0.17918708676295672, | |
| "learning_rate": 4.977366432899285e-05, | |
| "loss": 0.2122, | |
| "mean_token_accuracy": 0.9318382501602173, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.5678537054860443, | |
| "grad_norm": 0.16726196420216657, | |
| "learning_rate": 4.9760097771415216e-05, | |
| "loss": 0.2109, | |
| "mean_token_accuracy": 0.9325143158435821, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.5726660250240616, | |
| "grad_norm": 0.1649318455665257, | |
| "learning_rate": 4.974613854812655e-05, | |
| "loss": 0.2091, | |
| "mean_token_accuracy": 0.9329926609992981, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.5774783445620789, | |
| "grad_norm": 0.16928807920577638, | |
| "learning_rate": 4.973178690536606e-05, | |
| "loss": 0.2139, | |
| "mean_token_accuracy": 0.9314971804618836, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5822906641000962, | |
| "grad_norm": 0.1662224760778078, | |
| "learning_rate": 4.9717043096295154e-05, | |
| "loss": 0.2101, | |
| "mean_token_accuracy": 0.9325581789016724, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.5871029836381135, | |
| "grad_norm": 0.1578121845196821, | |
| "learning_rate": 4.9701907380993026e-05, | |
| "loss": 0.2101, | |
| "mean_token_accuracy": 0.9321203470230103, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.591915303176131, | |
| "grad_norm": 0.17012310791047996, | |
| "learning_rate": 4.968638002645206e-05, | |
| "loss": 0.211, | |
| "mean_token_accuracy": 0.9323883295059204, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.5967276227141483, | |
| "grad_norm": 0.17166329526964003, | |
| "learning_rate": 4.96704613065731e-05, | |
| "loss": 0.2069, | |
| "mean_token_accuracy": 0.9333858072757721, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6015399422521656, | |
| "grad_norm": 0.16533395444354704, | |
| "learning_rate": 4.9654151502160626e-05, | |
| "loss": 0.2091, | |
| "mean_token_accuracy": 0.9330121159553528, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.6063522617901829, | |
| "grad_norm": 0.1842878024158759, | |
| "learning_rate": 4.963745090091785e-05, | |
| "loss": 0.2121, | |
| "mean_token_accuracy": 0.9319897174835206, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.6111645813282002, | |
| "grad_norm": 0.17359354952628656, | |
| "learning_rate": 4.962035979744155e-05, | |
| "loss": 0.2082, | |
| "mean_token_accuracy": 0.9331151902675628, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.6159769008662175, | |
| "grad_norm": 0.17499969626825637, | |
| "learning_rate": 4.9602878493216943e-05, | |
| "loss": 0.2066, | |
| "mean_token_accuracy": 0.9333402752876282, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.6207892204042348, | |
| "grad_norm": 0.1628031967511819, | |
| "learning_rate": 4.958500729661232e-05, | |
| "loss": 0.2124, | |
| "mean_token_accuracy": 0.9320643424987793, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.6256015399422522, | |
| "grad_norm": 0.1714543379049269, | |
| "learning_rate": 4.956674652287369e-05, | |
| "loss": 0.2038, | |
| "mean_token_accuracy": 0.9341357469558715, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.6304138594802695, | |
| "grad_norm": 0.16165248911245264, | |
| "learning_rate": 4.9548096494119085e-05, | |
| "loss": 0.2057, | |
| "mean_token_accuracy": 0.9338850259780884, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.6352261790182868, | |
| "grad_norm": 0.1676909877987827, | |
| "learning_rate": 4.9529057539333e-05, | |
| "loss": 0.2086, | |
| "mean_token_accuracy": 0.9329083442687989, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.6400384985563041, | |
| "grad_norm": 0.153516779931267, | |
| "learning_rate": 4.950962999436054e-05, | |
| "loss": 0.2032, | |
| "mean_token_accuracy": 0.934636515378952, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.6448508180943214, | |
| "grad_norm": 0.1594429729423397, | |
| "learning_rate": 4.94898142019015e-05, | |
| "loss": 0.2045, | |
| "mean_token_accuracy": 0.9339752435684204, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.6496631376323387, | |
| "grad_norm": 0.1559834009442479, | |
| "learning_rate": 4.94696105115043e-05, | |
| "loss": 0.2025, | |
| "mean_token_accuracy": 0.9347749710083008, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.6544754571703562, | |
| "grad_norm": 0.16607000813015502, | |
| "learning_rate": 4.944901927955983e-05, | |
| "loss": 0.205, | |
| "mean_token_accuracy": 0.9340804874897003, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.6592877767083735, | |
| "grad_norm": 0.16021887842102764, | |
| "learning_rate": 4.9428040869295214e-05, | |
| "loss": 0.2042, | |
| "mean_token_accuracy": 0.9342449188232422, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.6641000962463908, | |
| "grad_norm": 0.15648166348167702, | |
| "learning_rate": 4.940667565076732e-05, | |
| "loss": 0.2053, | |
| "mean_token_accuracy": 0.9340552270412446, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.6689124157844081, | |
| "grad_norm": 0.14666372443826106, | |
| "learning_rate": 4.9384924000856304e-05, | |
| "loss": 0.2028, | |
| "mean_token_accuracy": 0.9346394658088684, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.6737247353224254, | |
| "grad_norm": 0.15389492092330728, | |
| "learning_rate": 4.936278630325889e-05, | |
| "loss": 0.2045, | |
| "mean_token_accuracy": 0.9342878043651581, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.6785370548604427, | |
| "grad_norm": 0.15102493921127202, | |
| "learning_rate": 4.9340262948481686e-05, | |
| "loss": 0.2013, | |
| "mean_token_accuracy": 0.9352310419082641, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.6833493743984601, | |
| "grad_norm": 0.1583451751639065, | |
| "learning_rate": 4.931735433383421e-05, | |
| "loss": 0.2035, | |
| "mean_token_accuracy": 0.9347138643264771, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6881616939364774, | |
| "grad_norm": 0.16050868810410332, | |
| "learning_rate": 4.929406086342194e-05, | |
| "loss": 0.2004, | |
| "mean_token_accuracy": 0.9357948362827301, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.6929740134744947, | |
| "grad_norm": 0.15529404693999008, | |
| "learning_rate": 4.927038294813919e-05, | |
| "loss": 0.2034, | |
| "mean_token_accuracy": 0.9344488203525543, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.697786333012512, | |
| "grad_norm": 0.15105497756267436, | |
| "learning_rate": 4.9246321005661786e-05, | |
| "loss": 0.2021, | |
| "mean_token_accuracy": 0.9351491272449494, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.7025986525505293, | |
| "grad_norm": 0.15267217938951647, | |
| "learning_rate": 4.922187546043981e-05, | |
| "loss": 0.1994, | |
| "mean_token_accuracy": 0.9354017674922943, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.7074109720885466, | |
| "grad_norm": 0.15754272056522092, | |
| "learning_rate": 4.919704674369001e-05, | |
| "loss": 0.2014, | |
| "mean_token_accuracy": 0.9350604891777039, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.7122232916265641, | |
| "grad_norm": 0.16079938047577652, | |
| "learning_rate": 4.917183529338828e-05, | |
| "loss": 0.1978, | |
| "mean_token_accuracy": 0.9363770544528961, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.7170356111645814, | |
| "grad_norm": 0.1613856968573564, | |
| "learning_rate": 4.914624155426184e-05, | |
| "loss": 0.1981, | |
| "mean_token_accuracy": 0.935772043466568, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.7218479307025987, | |
| "grad_norm": 0.15737711638174187, | |
| "learning_rate": 4.912026597778151e-05, | |
| "loss": 0.1983, | |
| "mean_token_accuracy": 0.93600914478302, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.726660250240616, | |
| "grad_norm": 0.15379404311353828, | |
| "learning_rate": 4.909390902215362e-05, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.9356024980545044, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.7314725697786333, | |
| "grad_norm": 0.16038280905509936, | |
| "learning_rate": 4.9067171152312e-05, | |
| "loss": 0.1997, | |
| "mean_token_accuracy": 0.9355686485767365, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.7362848893166506, | |
| "grad_norm": 0.1486852889908064, | |
| "learning_rate": 4.9040052839909794e-05, | |
| "loss": 0.1965, | |
| "mean_token_accuracy": 0.9363737523555755, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.7410972088546679, | |
| "grad_norm": 0.15518744777139062, | |
| "learning_rate": 4.901255456331108e-05, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.9358155608177186, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.7459095283926853, | |
| "grad_norm": 0.15871926575713205, | |
| "learning_rate": 4.898467680758249e-05, | |
| "loss": 0.1973, | |
| "mean_token_accuracy": 0.9361558556556702, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.7507218479307026, | |
| "grad_norm": 0.15341014535688668, | |
| "learning_rate": 4.895642006448459e-05, | |
| "loss": 0.1998, | |
| "mean_token_accuracy": 0.9353261828422547, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.7555341674687199, | |
| "grad_norm": 0.15354208139412714, | |
| "learning_rate": 4.892778483246329e-05, | |
| "loss": 0.198, | |
| "mean_token_accuracy": 0.936205518245697, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.7603464870067372, | |
| "grad_norm": 0.1676080471427938, | |
| "learning_rate": 4.889877161664096e-05, | |
| "loss": 0.1982, | |
| "mean_token_accuracy": 0.9359531819820404, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.7651588065447545, | |
| "grad_norm": 0.15194121255370366, | |
| "learning_rate": 4.8869380928807584e-05, | |
| "loss": 0.1981, | |
| "mean_token_accuracy": 0.9357672929763794, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.7699711260827719, | |
| "grad_norm": 0.1485622615962154, | |
| "learning_rate": 4.883961328741172e-05, | |
| "loss": 0.1982, | |
| "mean_token_accuracy": 0.9358666718006134, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.7747834456207893, | |
| "grad_norm": 0.14860781416450622, | |
| "learning_rate": 4.8809469217551315e-05, | |
| "loss": 0.1978, | |
| "mean_token_accuracy": 0.9361135065555573, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.7795957651588066, | |
| "grad_norm": 0.1519487375166515, | |
| "learning_rate": 4.87789492509645e-05, | |
| "loss": 0.1953, | |
| "mean_token_accuracy": 0.9365713894367218, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.7844080846968239, | |
| "grad_norm": 0.1452068241845975, | |
| "learning_rate": 4.874805392602019e-05, | |
| "loss": 0.1973, | |
| "mean_token_accuracy": 0.9362602889537811, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 0.7892204042348412, | |
| "grad_norm": 0.14275500950891148, | |
| "learning_rate": 4.871678378770855e-05, | |
| "loss": 0.1983, | |
| "mean_token_accuracy": 0.935696005821228, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.7940327237728585, | |
| "grad_norm": 0.15117930283683687, | |
| "learning_rate": 4.868513938763144e-05, | |
| "loss": 0.1995, | |
| "mean_token_accuracy": 0.9355600476264954, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 0.7988450433108758, | |
| "grad_norm": 0.14561350304265, | |
| "learning_rate": 4.8653121283992645e-05, | |
| "loss": 0.1964, | |
| "mean_token_accuracy": 0.9365583121776581, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.8036573628488932, | |
| "grad_norm": 0.1383640699384444, | |
| "learning_rate": 4.862073004158803e-05, | |
| "loss": 0.1925, | |
| "mean_token_accuracy": 0.9378562211990357, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 0.8084696823869105, | |
| "grad_norm": 0.14614891503019267, | |
| "learning_rate": 4.858796623179561e-05, | |
| "loss": 0.2002, | |
| "mean_token_accuracy": 0.9354116797447205, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.8132820019249278, | |
| "grad_norm": 0.14256550291335196, | |
| "learning_rate": 4.8554830432565435e-05, | |
| "loss": 0.1935, | |
| "mean_token_accuracy": 0.9371684491634369, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 0.8180943214629451, | |
| "grad_norm": 0.14296189384321376, | |
| "learning_rate": 4.8521323228409416e-05, | |
| "loss": 0.1967, | |
| "mean_token_accuracy": 0.9366355955600738, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.8229066410009624, | |
| "grad_norm": 0.15045821796262335, | |
| "learning_rate": 4.8487445210390986e-05, | |
| "loss": 0.1939, | |
| "mean_token_accuracy": 0.93716059923172, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 0.8277189605389798, | |
| "grad_norm": 0.14147262519766843, | |
| "learning_rate": 4.845319697611472e-05, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9367146670818329, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.8325312800769971, | |
| "grad_norm": 0.14803737277242698, | |
| "learning_rate": 4.841857912971576e-05, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9377514243125915, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 0.8373435996150145, | |
| "grad_norm": 0.14753346634230655, | |
| "learning_rate": 4.8383592281849156e-05, | |
| "loss": 0.1953, | |
| "mean_token_accuracy": 0.9369560062885285, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.8421559191530318, | |
| "grad_norm": 0.13509845475626056, | |
| "learning_rate": 4.8348237049679106e-05, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9368936121463776, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.8469682386910491, | |
| "grad_norm": 0.1377463966885082, | |
| "learning_rate": 4.8312514056868085e-05, | |
| "loss": 0.1968, | |
| "mean_token_accuracy": 0.9366705775260925, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.8517805582290664, | |
| "grad_norm": 0.1389380550865362, | |
| "learning_rate": 4.827642393356581e-05, | |
| "loss": 0.1909, | |
| "mean_token_accuracy": 0.9381819784641265, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 0.8565928777670837, | |
| "grad_norm": 0.13729355684585437, | |
| "learning_rate": 4.823996731639814e-05, | |
| "loss": 0.195, | |
| "mean_token_accuracy": 0.9369978666305542, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.861405197305101, | |
| "grad_norm": 0.13879603446263575, | |
| "learning_rate": 4.820314484845585e-05, | |
| "loss": 0.1969, | |
| "mean_token_accuracy": 0.9364095568656922, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 0.8662175168431184, | |
| "grad_norm": 0.13824384386600863, | |
| "learning_rate": 4.816595717928327e-05, | |
| "loss": 0.1934, | |
| "mean_token_accuracy": 0.9371289372444153, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.8710298363811357, | |
| "grad_norm": 0.1430743516273531, | |
| "learning_rate": 4.812840496486687e-05, | |
| "loss": 0.1961, | |
| "mean_token_accuracy": 0.9366717875003815, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 0.875842155919153, | |
| "grad_norm": 0.14720560162962282, | |
| "learning_rate": 4.809048886762363e-05, | |
| "loss": 0.1982, | |
| "mean_token_accuracy": 0.9361905872821807, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.8806544754571703, | |
| "grad_norm": 0.1427229609872396, | |
| "learning_rate": 4.805220955638939e-05, | |
| "loss": 0.1922, | |
| "mean_token_accuracy": 0.9378859341144562, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 0.8854667949951877, | |
| "grad_norm": 0.12756538666968079, | |
| "learning_rate": 4.801356770640707e-05, | |
| "loss": 0.1905, | |
| "mean_token_accuracy": 0.9384245991706848, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.890279114533205, | |
| "grad_norm": 0.1408484018434893, | |
| "learning_rate": 4.797456399931469e-05, | |
| "loss": 0.1925, | |
| "mean_token_accuracy": 0.9376866579055786, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 0.8950914340712224, | |
| "grad_norm": 0.13631559321382822, | |
| "learning_rate": 4.793519912313343e-05, | |
| "loss": 0.1911, | |
| "mean_token_accuracy": 0.9382654786109924, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.8999037536092397, | |
| "grad_norm": 0.13703489406607036, | |
| "learning_rate": 4.789547377225543e-05, | |
| "loss": 0.189, | |
| "mean_token_accuracy": 0.9390109956264496, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 0.904716073147257, | |
| "grad_norm": 0.13866980461707448, | |
| "learning_rate": 4.785538864743157e-05, | |
| "loss": 0.1935, | |
| "mean_token_accuracy": 0.937214457988739, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.9095283926852743, | |
| "grad_norm": 0.13784556167932913, | |
| "learning_rate": 4.781494445575911e-05, | |
| "loss": 0.1938, | |
| "mean_token_accuracy": 0.9374860525131226, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.9143407122232916, | |
| "grad_norm": 0.13343434455586559, | |
| "learning_rate": 4.7774141910669204e-05, | |
| "loss": 0.1902, | |
| "mean_token_accuracy": 0.938514119386673, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.9191530317613089, | |
| "grad_norm": 0.13805323448216772, | |
| "learning_rate": 4.7732981731914326e-05, | |
| "loss": 0.1958, | |
| "mean_token_accuracy": 0.9369194209575653, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 0.9239653512993262, | |
| "grad_norm": 0.14339645168093398, | |
| "learning_rate": 4.769146464555557e-05, | |
| "loss": 0.1916, | |
| "mean_token_accuracy": 0.9379463493824005, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.9287776708373436, | |
| "grad_norm": 0.1364726665654864, | |
| "learning_rate": 4.7649591383949824e-05, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9379682004451751, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 0.933589990375361, | |
| "grad_norm": 0.13485382240339647, | |
| "learning_rate": 4.760736268573689e-05, | |
| "loss": 0.1899, | |
| "mean_token_accuracy": 0.9386738002300262, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.9384023099133783, | |
| "grad_norm": 0.14008745603004136, | |
| "learning_rate": 4.756477929582643e-05, | |
| "loss": 0.1915, | |
| "mean_token_accuracy": 0.9377342879772186, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 0.9432146294513956, | |
| "grad_norm": 0.13901018195257459, | |
| "learning_rate": 4.752184196538482e-05, | |
| "loss": 0.1909, | |
| "mean_token_accuracy": 0.9383879780769349, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.9480269489894129, | |
| "grad_norm": 0.1374476115040315, | |
| "learning_rate": 4.7478551451821905e-05, | |
| "loss": 0.1915, | |
| "mean_token_accuracy": 0.9382719576358796, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 0.9528392685274302, | |
| "grad_norm": 0.13099380855525097, | |
| "learning_rate": 4.7434908518777665e-05, | |
| "loss": 0.1893, | |
| "mean_token_accuracy": 0.9384810984134674, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.9576515880654476, | |
| "grad_norm": 0.1344663386015308, | |
| "learning_rate": 4.7390913936108703e-05, | |
| "loss": 0.1917, | |
| "mean_token_accuracy": 0.938094437122345, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 0.9624639076034649, | |
| "grad_norm": 0.14087858730981573, | |
| "learning_rate": 4.734656847987469e-05, | |
| "loss": 0.1904, | |
| "mean_token_accuracy": 0.9382812976837158, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9672762271414822, | |
| "grad_norm": 0.13274925994570547, | |
| "learning_rate": 4.730187293232465e-05, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9376478910446167, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 0.9720885466794995, | |
| "grad_norm": 0.13860612719788293, | |
| "learning_rate": 4.7256828081883205e-05, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9372950077056885, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.9769008662175168, | |
| "grad_norm": 0.1288393233877837, | |
| "learning_rate": 4.721143472313663e-05, | |
| "loss": 0.1894, | |
| "mean_token_accuracy": 0.9385651528835297, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.9817131857555341, | |
| "grad_norm": 0.13764498535831643, | |
| "learning_rate": 4.7165693656818874e-05, | |
| "loss": 0.1915, | |
| "mean_token_accuracy": 0.937848836183548, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.9865255052935515, | |
| "grad_norm": 0.140439557065953, | |
| "learning_rate": 4.711960568979735e-05, | |
| "loss": 0.1897, | |
| "mean_token_accuracy": 0.9384947657585144, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 0.9913378248315688, | |
| "grad_norm": 0.1378608499542452, | |
| "learning_rate": 4.707317163505882e-05, | |
| "loss": 0.1892, | |
| "mean_token_accuracy": 0.9388699173927307, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.9961501443695862, | |
| "grad_norm": 0.14509518623250522, | |
| "learning_rate": 4.702639231169497e-05, | |
| "loss": 0.1889, | |
| "mean_token_accuracy": 0.9390351891517639, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.0009624639076036, | |
| "grad_norm": 0.14098292464327092, | |
| "learning_rate": 4.6979268544888e-05, | |
| "loss": 0.1883, | |
| "mean_token_accuracy": 0.9389310836791992, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.0057747834456208, | |
| "grad_norm": 0.13352648012607543, | |
| "learning_rate": 4.693180116589603e-05, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9435855567455291, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.0105871029836382, | |
| "grad_norm": 0.14063016763786157, | |
| "learning_rate": 4.6883991012038495e-05, | |
| "loss": 0.1695, | |
| "mean_token_accuracy": 0.9434072017669678, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.0153994225216554, | |
| "grad_norm": 0.13337946686605756, | |
| "learning_rate": 4.6835838926681326e-05, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9447925448417663, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 1.0202117420596728, | |
| "grad_norm": 0.13656347340007405, | |
| "learning_rate": 4.6787345759222066e-05, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.9438008546829224, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.02502406159769, | |
| "grad_norm": 0.14114045287544513, | |
| "learning_rate": 4.6738512365074954e-05, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9438787519931793, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.0298363811357074, | |
| "grad_norm": 0.1327248720426579, | |
| "learning_rate": 4.668933960565575e-05, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9436138510704041, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.0346487006737248, | |
| "grad_norm": 0.13650793945156836, | |
| "learning_rate": 4.6639828348366616e-05, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9441941678524017, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 1.039461020211742, | |
| "grad_norm": 0.13561726847606562, | |
| "learning_rate": 4.658997946658075e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9435689568519592, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.0442733397497594, | |
| "grad_norm": 0.13181581788943922, | |
| "learning_rate": 4.653979383962702e-05, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9436435282230378, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 1.0490856592877766, | |
| "grad_norm": 0.13345034900363856, | |
| "learning_rate": 4.6489272352774456e-05, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9435225188732147, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.053897978825794, | |
| "grad_norm": 0.12425776469782582, | |
| "learning_rate": 4.6438415897216593e-05, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9445132613182068, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.0587102983638113, | |
| "grad_norm": 0.1300484615419265, | |
| "learning_rate": 4.63872253700558e-05, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9445058047771454, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.0635226179018287, | |
| "grad_norm": 0.12877402535672933, | |
| "learning_rate": 4.6335701674287436e-05, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9445120990276337, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 1.068334937439846, | |
| "grad_norm": 0.1291223143463968, | |
| "learning_rate": 4.628384571878389e-05, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9445106565952301, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.0731472569778633, | |
| "grad_norm": 0.1323467397909123, | |
| "learning_rate": 4.62316584182786e-05, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9442681908607483, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 1.0779595765158807, | |
| "grad_norm": 0.13661398229633773, | |
| "learning_rate": 4.6179140693349894e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9441010117530823, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.082771896053898, | |
| "grad_norm": 0.12611245180721717, | |
| "learning_rate": 4.612629347040474e-05, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9434883832931519, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 1.0875842155919153, | |
| "grad_norm": 0.13000751460760943, | |
| "learning_rate": 4.607311768166241e-05, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9441259264945984, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.0923965351299327, | |
| "grad_norm": 0.13320880346839792, | |
| "learning_rate": 4.601961426513808e-05, | |
| "loss": 0.1686, | |
| "mean_token_accuracy": 0.9437747776508332, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 1.09720885466795, | |
| "grad_norm": 0.14134895172260561, | |
| "learning_rate": 4.596578416462619e-05, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9426525175571442, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.1020211742059673, | |
| "grad_norm": 0.1340730006312968, | |
| "learning_rate": 4.591162832968389e-05, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9442629754543305, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 1.1068334937439845, | |
| "grad_norm": 0.13121133703439622, | |
| "learning_rate": 4.585714771561423e-05, | |
| "loss": 0.1687, | |
| "mean_token_accuracy": 0.9438746750354767, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.111645813282002, | |
| "grad_norm": 0.13130299804823994, | |
| "learning_rate": 4.5802343283449335e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9439792096614837, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.1164581328200192, | |
| "grad_norm": 0.13704425192256178, | |
| "learning_rate": 4.574721599993345e-05, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9441709995269776, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.1212704523580366, | |
| "grad_norm": 0.13878563931219687, | |
| "learning_rate": 4.5691766837505875e-05, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9433870613574982, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 1.126082771896054, | |
| "grad_norm": 0.131342416517926, | |
| "learning_rate": 4.563599677428382e-05, | |
| "loss": 0.1704, | |
| "mean_token_accuracy": 0.943337619304657, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.1308950914340712, | |
| "grad_norm": 0.1298744027260647, | |
| "learning_rate": 4.557990679404516e-05, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9448701322078705, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 1.1357074109720886, | |
| "grad_norm": 0.13253523892747635, | |
| "learning_rate": 4.5523497886211064e-05, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9441543757915497, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.1405197305101058, | |
| "grad_norm": 0.13177694395961717, | |
| "learning_rate": 4.5466771045828545e-05, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.943759948015213, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.1453320500481232, | |
| "grad_norm": 0.13082391998973963, | |
| "learning_rate": 4.540972727355292e-05, | |
| "loss": 0.1686, | |
| "mean_token_accuracy": 0.9439559042453766, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.1501443695861404, | |
| "grad_norm": 0.13545538231015544, | |
| "learning_rate": 4.535236757563014e-05, | |
| "loss": 0.1714, | |
| "mean_token_accuracy": 0.9430326640605926, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 1.1549566891241578, | |
| "grad_norm": 0.1256637117971606, | |
| "learning_rate": 4.529469296387908e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9438701272010803, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.1597690086621752, | |
| "grad_norm": 0.12422786551830951, | |
| "learning_rate": 4.52367044556736e-05, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9435747802257538, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 1.1645813282001924, | |
| "grad_norm": 0.13496451970813625, | |
| "learning_rate": 4.517840307392472e-05, | |
| "loss": 0.172, | |
| "mean_token_accuracy": 0.9430041670799255, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.1693936477382099, | |
| "grad_norm": 0.13429364394475088, | |
| "learning_rate": 4.5119789847062496e-05, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.9438342332839966, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.174205967276227, | |
| "grad_norm": 0.13142604554469559, | |
| "learning_rate": 4.506086580901789e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9438115119934082, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.1790182868142445, | |
| "grad_norm": 0.13628313013290896, | |
| "learning_rate": 4.5001631999204535e-05, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9435074925422668, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 1.1838306063522617, | |
| "grad_norm": 0.1302669601108965, | |
| "learning_rate": 4.494208946250042e-05, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9438393712043762, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.188642925890279, | |
| "grad_norm": 0.13078358146219196, | |
| "learning_rate": 4.4882239249229445e-05, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.9433554470539093, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 1.1934552454282965, | |
| "grad_norm": 0.1297342378764966, | |
| "learning_rate": 4.482208241514287e-05, | |
| "loss": 0.1687, | |
| "mean_token_accuracy": 0.9438297212123871, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.1982675649663137, | |
| "grad_norm": 0.12564912731538683, | |
| "learning_rate": 4.4761620021400724e-05, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.943172037601471, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.2030798845043311, | |
| "grad_norm": 0.13146122311848338, | |
| "learning_rate": 4.470085313455308e-05, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9444893717765808, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.2078922040423483, | |
| "grad_norm": 0.1247038997684884, | |
| "learning_rate": 4.463978282652125e-05, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9444051027297974, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.2127045235803657, | |
| "grad_norm": 0.12564755849530815, | |
| "learning_rate": 4.457841017457882e-05, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9438214004039764, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.2175168431183832, | |
| "grad_norm": 0.12878914852074813, | |
| "learning_rate": 4.451673626133272e-05, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9442386627197266, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.2223291626564003, | |
| "grad_norm": 0.13017512219309227, | |
| "learning_rate": 4.445476217470411e-05, | |
| "loss": 0.1699, | |
| "mean_token_accuracy": 0.9435798227787018, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.2271414821944178, | |
| "grad_norm": 0.12739360897332036, | |
| "learning_rate": 4.439248900790915e-05, | |
| "loss": 0.1726, | |
| "mean_token_accuracy": 0.9427407801151275, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.231953801732435, | |
| "grad_norm": 0.12657194463024607, | |
| "learning_rate": 4.432991785943974e-05, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9437889516353607, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.2367661212704524, | |
| "grad_norm": 0.1262944789537952, | |
| "learning_rate": 4.426704983304416e-05, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9434307396411896, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.2415784408084698, | |
| "grad_norm": 0.12444247164013399, | |
| "learning_rate": 4.420388603770758e-05, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9432605504989624, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.246390760346487, | |
| "grad_norm": 0.12047684348761341, | |
| "learning_rate": 4.414042758763251e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.944023609161377, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.2512030798845044, | |
| "grad_norm": 0.12687476698133213, | |
| "learning_rate": 4.407667560221911e-05, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9442380666732788, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.2560153994225216, | |
| "grad_norm": 0.1325946463711835, | |
| "learning_rate": 4.4012631206045505e-05, | |
| "loss": 0.1707, | |
| "mean_token_accuracy": 0.9430667042732239, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.260827718960539, | |
| "grad_norm": 0.12868385508137406, | |
| "learning_rate": 4.3948295528847894e-05, | |
| "loss": 0.1708, | |
| "mean_token_accuracy": 0.9432620346546173, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.2656400384985562, | |
| "grad_norm": 0.11680291123132963, | |
| "learning_rate": 4.388366970550064e-05, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.9433643639087677, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.2704523580365736, | |
| "grad_norm": 0.13072798580717812, | |
| "learning_rate": 4.381875487599628e-05, | |
| "loss": 0.1705, | |
| "mean_token_accuracy": 0.9432330310344696, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.275264677574591, | |
| "grad_norm": 0.12834048816972923, | |
| "learning_rate": 4.375355218542535e-05, | |
| "loss": 0.1691, | |
| "mean_token_accuracy": 0.943941992521286, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.2800769971126083, | |
| "grad_norm": 0.1262326364831885, | |
| "learning_rate": 4.3688062783956235e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9441191196441651, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.2848893166506257, | |
| "grad_norm": 0.12625231235434545, | |
| "learning_rate": 4.362228782681489e-05, | |
| "loss": 0.1711, | |
| "mean_token_accuracy": 0.9431708574295044, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.2897016361886429, | |
| "grad_norm": 0.12222684683375595, | |
| "learning_rate": 4.355622847426443e-05, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.944567060470581, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.2945139557266603, | |
| "grad_norm": 0.1273686741107798, | |
| "learning_rate": 4.348988589158466e-05, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9441350519657135, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.2993262752646775, | |
| "grad_norm": 0.1244049776395378, | |
| "learning_rate": 4.342326124905155e-05, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9428531110286713, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.304138594802695, | |
| "grad_norm": 0.12967703766865415, | |
| "learning_rate": 4.3356355721916566e-05, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9444044053554534, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.3089509143407123, | |
| "grad_norm": 0.12894284768611228, | |
| "learning_rate": 4.328917049038597e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9436497867107392, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.3137632338787295, | |
| "grad_norm": 0.12803456304695687, | |
| "learning_rate": 4.322170673959993e-05, | |
| "loss": 0.1738, | |
| "mean_token_accuracy": 0.9425312340259552, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.318575553416747, | |
| "grad_norm": 0.12490844730637313, | |
| "learning_rate": 4.315396565961172e-05, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9444727003574371, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.3233878729547641, | |
| "grad_norm": 0.12889604880653935, | |
| "learning_rate": 4.3085948445366635e-05, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9439269423484802, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.3282001924927815, | |
| "grad_norm": 0.12828949214668103, | |
| "learning_rate": 4.301765629668097e-05, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9442147970199585, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.3330125120307987, | |
| "grad_norm": 0.12140985171037239, | |
| "learning_rate": 4.294909041822081e-05, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9438628017902374, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.3378248315688162, | |
| "grad_norm": 0.12451991640388707, | |
| "learning_rate": 4.2880252019480815e-05, | |
| "loss": 0.1676, | |
| "mean_token_accuracy": 0.9443076968193054, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.3426371511068336, | |
| "grad_norm": 0.12557170747764212, | |
| "learning_rate": 4.281114231476289e-05, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9434075772762298, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.3474494706448508, | |
| "grad_norm": 0.12669067403953152, | |
| "learning_rate": 4.2741762523154715e-05, | |
| "loss": 0.1706, | |
| "mean_token_accuracy": 0.9434764981269836, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.3522617901828682, | |
| "grad_norm": 0.12310591681232867, | |
| "learning_rate": 4.26721138685083e-05, | |
| "loss": 0.1682, | |
| "mean_token_accuracy": 0.9441127836704254, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 1.3570741097208856, | |
| "grad_norm": 0.11941516802375815, | |
| "learning_rate": 4.2602197579418365e-05, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9436100482940674, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.3618864292589028, | |
| "grad_norm": 0.12521322800607554, | |
| "learning_rate": 4.2532014889200663e-05, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9435548663139344, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 1.36669874879692, | |
| "grad_norm": 0.12934274666710788, | |
| "learning_rate": 4.246156703587024e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9446570336818695, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.3715110683349374, | |
| "grad_norm": 0.11758131071829231, | |
| "learning_rate": 4.2390855262119595e-05, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9441105723381042, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 1.3763233878729548, | |
| "grad_norm": 0.12138932408399884, | |
| "learning_rate": 4.2319880815296743e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9436386108398438, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.381135707410972, | |
| "grad_norm": 0.1295795780877806, | |
| "learning_rate": 4.2248644947383225e-05, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9435955226421356, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.3859480269489894, | |
| "grad_norm": 0.12114292087988099, | |
| "learning_rate": 4.217714891497204e-05, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9442949891090393, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.3907603464870069, | |
| "grad_norm": 0.12616393455966746, | |
| "learning_rate": 4.2105393979245445e-05, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9440375447273255, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 1.395572666025024, | |
| "grad_norm": 0.12265924397876915, | |
| "learning_rate": 4.2033381405952715e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9436265408992768, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.4003849855630413, | |
| "grad_norm": 0.12287306258754672, | |
| "learning_rate": 4.1961112465387846e-05, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9436855256557465, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 1.4051973051010587, | |
| "grad_norm": 0.12429881700326155, | |
| "learning_rate": 4.188858843236711e-05, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9442161798477173, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.410009624639076, | |
| "grad_norm": 0.11723067636795216, | |
| "learning_rate": 4.181581058620658e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9434365093708038, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 1.4148219441770933, | |
| "grad_norm": 0.11878728765010492, | |
| "learning_rate": 4.174278021069958e-05, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.9432783722877502, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.4196342637151107, | |
| "grad_norm": 0.12127739873444575, | |
| "learning_rate": 4.166949859409402e-05, | |
| "loss": 0.1697, | |
| "mean_token_accuracy": 0.9436027526855468, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 1.4244465832531281, | |
| "grad_norm": 0.11901416516650089, | |
| "learning_rate": 4.159596702906965e-05, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9446190297603607, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.4292589027911453, | |
| "grad_norm": 0.12212845756527985, | |
| "learning_rate": 4.152218681271532e-05, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9447129309177399, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 1.4340712223291627, | |
| "grad_norm": 0.11753223946445938, | |
| "learning_rate": 4.1448159246506044e-05, | |
| "loss": 0.165, | |
| "mean_token_accuracy": 0.9450146913528442, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.43888354186718, | |
| "grad_norm": 0.1264239630663289, | |
| "learning_rate": 4.137388563628006e-05, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.945018881559372, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 1.4436958614051973, | |
| "grad_norm": 0.12135246983559846, | |
| "learning_rate": 4.1299367292215805e-05, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9440855264663697, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.4485081809432145, | |
| "grad_norm": 0.1234777176396436, | |
| "learning_rate": 4.122460552880878e-05, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.9443232297897339, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.453320500481232, | |
| "grad_norm": 0.120840543520841, | |
| "learning_rate": 4.1149601664848384e-05, | |
| "loss": 0.1679, | |
| "mean_token_accuracy": 0.9441682994365692, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.4581328200192494, | |
| "grad_norm": 0.12078442187715852, | |
| "learning_rate": 4.107435702339467e-05, | |
| "loss": 0.171, | |
| "mean_token_accuracy": 0.9432439386844635, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 1.4629451395572666, | |
| "grad_norm": 0.12090522475500061, | |
| "learning_rate": 4.099887293175491e-05, | |
| "loss": 0.1678, | |
| "mean_token_accuracy": 0.9439578652381897, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.467757459095284, | |
| "grad_norm": 0.12231814098730041, | |
| "learning_rate": 4.092315072146033e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9445959091186523, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 1.4725697786333012, | |
| "grad_norm": 0.11920357648085111, | |
| "learning_rate": 4.08471917282425e-05, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9439474761486053, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.4773820981713186, | |
| "grad_norm": 0.13353946904737157, | |
| "learning_rate": 4.077099729200982e-05, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9443958938121796, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 1.4821944177093358, | |
| "grad_norm": 0.11931966784782126, | |
| "learning_rate": 4.0694568756823885e-05, | |
| "loss": 0.169, | |
| "mean_token_accuracy": 0.9439583122730255, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.4870067372473532, | |
| "grad_norm": 0.12183602860532652, | |
| "learning_rate": 4.0617907470875775e-05, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9447571635246277, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 1.4918190567853706, | |
| "grad_norm": 0.11811842913317021, | |
| "learning_rate": 4.054101478646226e-05, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9439677178859711, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.4966313763233878, | |
| "grad_norm": 0.1233667516552787, | |
| "learning_rate": 4.0463892059961946e-05, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9439435601234436, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 1.5014436958614052, | |
| "grad_norm": 0.12305317506985826, | |
| "learning_rate": 4.038654065181137e-05, | |
| "loss": 0.1642, | |
| "mean_token_accuracy": 0.9451203525066376, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.5062560153994227, | |
| "grad_norm": 0.12969772414272346, | |
| "learning_rate": 4.0308961926480995e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9447619915008545, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 1.5110683349374399, | |
| "grad_norm": 0.12449601885269666, | |
| "learning_rate": 4.02311572524511e-05, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9444441139698029, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.515880654475457, | |
| "grad_norm": 0.12297625214962239, | |
| "learning_rate": 4.015312800218773e-05, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9443081796169281, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.5206929740134745, | |
| "grad_norm": 0.1165365922555401, | |
| "learning_rate": 4.007487555211838e-05, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.943653005361557, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.525505293551492, | |
| "grad_norm": 0.1161851661030259, | |
| "learning_rate": 3.9996401282607784e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9443969130516052, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 1.530317613089509, | |
| "grad_norm": 0.11904745367387458, | |
| "learning_rate": 3.991770657793354e-05, | |
| "loss": 0.1662, | |
| "mean_token_accuracy": 0.9445495724678039, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.5351299326275265, | |
| "grad_norm": 0.12255703868410023, | |
| "learning_rate": 3.983879282626174e-05, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9440991520881653, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 1.539942252165544, | |
| "grad_norm": 0.12122635628254265, | |
| "learning_rate": 3.975966141962237e-05, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9436755478382111, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.5447545717035611, | |
| "grad_norm": 0.11479122748903152, | |
| "learning_rate": 3.968031375388487e-05, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9446001768112182, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 1.5495668912415783, | |
| "grad_norm": 0.11440077863710703, | |
| "learning_rate": 3.9600751228733476e-05, | |
| "loss": 0.17, | |
| "mean_token_accuracy": 0.9436830937862396, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.5543792107795957, | |
| "grad_norm": 0.1192042764841765, | |
| "learning_rate": 3.952097524764249e-05, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9438039898872376, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 1.5591915303176132, | |
| "grad_norm": 0.13186271048291426, | |
| "learning_rate": 3.944098721785157e-05, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9442033290863037, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.5640038498556303, | |
| "grad_norm": 0.12055324541493671, | |
| "learning_rate": 3.936078855034089e-05, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.9436140656471252, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 1.5688161693936478, | |
| "grad_norm": 0.12265719838028774, | |
| "learning_rate": 3.9280380659806265e-05, | |
| "loss": 0.1701, | |
| "mean_token_accuracy": 0.9433427393436432, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.5736284889316652, | |
| "grad_norm": 0.11973535981251686, | |
| "learning_rate": 3.9199764964634146e-05, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9438383936882019, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 1.5784408084696824, | |
| "grad_norm": 0.11279283406210072, | |
| "learning_rate": 3.911894288687665e-05, | |
| "loss": 0.1716, | |
| "mean_token_accuracy": 0.9434796929359436, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.5832531280076996, | |
| "grad_norm": 0.12286896974013906, | |
| "learning_rate": 3.9037915852226474e-05, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9440526366233826, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.588065447545717, | |
| "grad_norm": 0.11742900310257637, | |
| "learning_rate": 3.895668528999172e-05, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9446743130683899, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.5928777670837344, | |
| "grad_norm": 0.1219979117827572, | |
| "learning_rate": 3.8875252633070716e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9441424012184143, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 1.5976900866217516, | |
| "grad_norm": 0.12051375250499675, | |
| "learning_rate": 3.879361931792668e-05, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9435826122760773, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.602502406159769, | |
| "grad_norm": 0.12226709284381505, | |
| "learning_rate": 3.8711786784562444e-05, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9452749371528626, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 1.6073147256977864, | |
| "grad_norm": 0.11946999897354527, | |
| "learning_rate": 3.8629756476495024e-05, | |
| "loss": 0.168, | |
| "mean_token_accuracy": 0.9442229270935059, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.6121270452358036, | |
| "grad_norm": 0.11742174946728753, | |
| "learning_rate": 3.854752984073017e-05, | |
| "loss": 0.1686, | |
| "mean_token_accuracy": 0.944132536649704, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 1.6169393647738208, | |
| "grad_norm": 0.1236895644977533, | |
| "learning_rate": 3.84651083277368e-05, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9436050176620483, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.6217516843118385, | |
| "grad_norm": 0.12462091225944294, | |
| "learning_rate": 3.838249339142148e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9442900955677033, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 1.6265640038498557, | |
| "grad_norm": 0.11525803519589935, | |
| "learning_rate": 3.8299686489102726e-05, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.944085818529129, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.6313763233878729, | |
| "grad_norm": 0.11836011035594363, | |
| "learning_rate": 3.821668908148533e-05, | |
| "loss": 0.1694, | |
| "mean_token_accuracy": 0.9436315476894379, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 1.6361886429258903, | |
| "grad_norm": 0.11194098259995407, | |
| "learning_rate": 3.813350263263453e-05, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9439301788806915, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.6410009624639077, | |
| "grad_norm": 0.1214705214423612, | |
| "learning_rate": 3.8050128609950296e-05, | |
| "loss": 0.167, | |
| "mean_token_accuracy": 0.9442422211170196, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 1.645813282001925, | |
| "grad_norm": 0.12109861573508446, | |
| "learning_rate": 3.7966568484141335e-05, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9443001866340637, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.650625601539942, | |
| "grad_norm": 0.12548159813616247, | |
| "learning_rate": 3.7882823729199204e-05, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9443616211414337, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.6554379210779597, | |
| "grad_norm": 0.120479729408816, | |
| "learning_rate": 3.77988958223723e-05, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9438112080097198, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.660250240615977, | |
| "grad_norm": 0.11518812866063087, | |
| "learning_rate": 3.771478624413981e-05, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9452588975429534, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 1.6650625601539941, | |
| "grad_norm": 0.11808464477482111, | |
| "learning_rate": 3.763049647818556e-05, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9444884181022644, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.6698748796920115, | |
| "grad_norm": 0.11003684714800051, | |
| "learning_rate": 3.7546028011371884e-05, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9448000669479371, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 1.674687199230029, | |
| "grad_norm": 0.11840372732983077, | |
| "learning_rate": 3.7461382333713374e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9443848192691803, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.6794995187680462, | |
| "grad_norm": 0.11569349634452888, | |
| "learning_rate": 3.737656093835062e-05, | |
| "loss": 0.1702, | |
| "mean_token_accuracy": 0.9434504210948944, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 1.6843118383060636, | |
| "grad_norm": 0.11692012055812281, | |
| "learning_rate": 3.729156532152384e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9436829447746277, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.689124157844081, | |
| "grad_norm": 0.1152755549059481, | |
| "learning_rate": 3.7206396982546475e-05, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9449601650238038, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 1.6939364773820982, | |
| "grad_norm": 0.1271194090732628, | |
| "learning_rate": 3.712105742377882e-05, | |
| "loss": 0.1673, | |
| "mean_token_accuracy": 0.9443909406661988, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.6987487969201154, | |
| "grad_norm": 0.11475504315196915, | |
| "learning_rate": 3.703554815060141e-05, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9446333408355713, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 1.7035611164581328, | |
| "grad_norm": 0.11508516547758801, | |
| "learning_rate": 3.6949870671388565e-05, | |
| "loss": 0.1681, | |
| "mean_token_accuracy": 0.943927937746048, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.7083734359961502, | |
| "grad_norm": 0.11939620663986114, | |
| "learning_rate": 3.686402649748174e-05, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.9443952858448028, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 1.7131857555341674, | |
| "grad_norm": 0.10894111975889467, | |
| "learning_rate": 3.677801714316283e-05, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.945495343208313, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.7179980750721848, | |
| "grad_norm": 0.11934649455435464, | |
| "learning_rate": 3.6691844125627536e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.944483506679535, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.7228103946102022, | |
| "grad_norm": 0.12376796320967025, | |
| "learning_rate": 3.6605508964958543e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9446521162986755, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.7276227141482194, | |
| "grad_norm": 0.11455955246894524, | |
| "learning_rate": 3.6519013184098724e-05, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9446718871593476, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 1.7324350336862366, | |
| "grad_norm": 0.11350368248548637, | |
| "learning_rate": 3.643235830882427e-05, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9445142209529876, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.737247353224254, | |
| "grad_norm": 0.12520393083753745, | |
| "learning_rate": 3.634554586771778e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9445159196853637, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 1.7420596727622715, | |
| "grad_norm": 0.11808014043965162, | |
| "learning_rate": 3.625857739214131e-05, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9445820391178131, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.7468719923002887, | |
| "grad_norm": 0.11185845605472387, | |
| "learning_rate": 3.6171454416209336e-05, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.944850093126297, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 1.751684311838306, | |
| "grad_norm": 0.11510087741484404, | |
| "learning_rate": 3.608417847676171e-05, | |
| "loss": 0.1698, | |
| "mean_token_accuracy": 0.9437756836414337, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.7564966313763235, | |
| "grad_norm": 0.1114369723357324, | |
| "learning_rate": 3.599675111333654e-05, | |
| "loss": 0.1653, | |
| "mean_token_accuracy": 0.9452514588832855, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 1.7613089509143407, | |
| "grad_norm": 0.12142754659774699, | |
| "learning_rate": 3.590917386814304e-05, | |
| "loss": 0.1674, | |
| "mean_token_accuracy": 0.9443466603755951, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.766121270452358, | |
| "grad_norm": 0.12192098353534307, | |
| "learning_rate": 3.5821448286034305e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9443329930305481, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 1.7709335899903753, | |
| "grad_norm": 0.11518767998875729, | |
| "learning_rate": 3.5733575914480105e-05, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9445607185363769, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.7757459095283927, | |
| "grad_norm": 0.12451143136112122, | |
| "learning_rate": 3.564555830353955e-05, | |
| "loss": 0.1665, | |
| "mean_token_accuracy": 0.9446374893188476, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 1.78055822906641, | |
| "grad_norm": 0.11126999647192666, | |
| "learning_rate": 3.555739700583374e-05, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9448657035827637, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.7853705486044273, | |
| "grad_norm": 0.11048648548185011, | |
| "learning_rate": 3.54690935765184e-05, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9448877513408661, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.7901828681424448, | |
| "grad_norm": 0.10911318717390217, | |
| "learning_rate": 3.5380649573256475e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.944835102558136, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.794995187680462, | |
| "grad_norm": 0.11236133148988718, | |
| "learning_rate": 3.529206655619057e-05, | |
| "loss": 0.1664, | |
| "mean_token_accuracy": 0.9446624279022217, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 1.7998075072184792, | |
| "grad_norm": 0.10611214349837418, | |
| "learning_rate": 3.5203346087915516e-05, | |
| "loss": 0.1662, | |
| "mean_token_accuracy": 0.9446195423603058, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.8046198267564968, | |
| "grad_norm": 0.11596860139995219, | |
| "learning_rate": 3.511448973345074e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9444796085357666, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 1.809432146294514, | |
| "grad_norm": 0.10930896500603306, | |
| "learning_rate": 3.502549906021272e-05, | |
| "loss": 0.1648, | |
| "mean_token_accuracy": 0.9451210200786591, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.8142444658325312, | |
| "grad_norm": 0.1107024545834962, | |
| "learning_rate": 3.493637563798726e-05, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9445821046829224, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 1.8190567853705486, | |
| "grad_norm": 0.10983401471457628, | |
| "learning_rate": 3.4847121038901877e-05, | |
| "loss": 0.1644, | |
| "mean_token_accuracy": 0.945207679271698, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.823869104908566, | |
| "grad_norm": 0.11588642596111846, | |
| "learning_rate": 3.475773683739802e-05, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9451716244220734, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 1.8286814244465832, | |
| "grad_norm": 0.10971146715099377, | |
| "learning_rate": 3.46682246102033e-05, | |
| "loss": 0.1671, | |
| "mean_token_accuracy": 0.9444344699382782, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.8334937439846006, | |
| "grad_norm": 0.11663557542225617, | |
| "learning_rate": 3.4578585936303715e-05, | |
| "loss": 0.1669, | |
| "mean_token_accuracy": 0.9444554805755615, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 1.838306063522618, | |
| "grad_norm": 0.11089022162439167, | |
| "learning_rate": 3.4488822396915744e-05, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9458965837955475, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.8431183830606352, | |
| "grad_norm": 0.11401451374040672, | |
| "learning_rate": 3.439893557545849e-05, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9449984192848205, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 1.8479307025986524, | |
| "grad_norm": 0.11231646790004433, | |
| "learning_rate": 3.430892705752574e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9442723631858826, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.8527430221366699, | |
| "grad_norm": 0.1285044812904872, | |
| "learning_rate": 3.421879843085799e-05, | |
| "loss": 0.1679, | |
| "mean_token_accuracy": 0.9442314386367798, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.8575553416746873, | |
| "grad_norm": 0.11386614068239174, | |
| "learning_rate": 3.4128551285314446e-05, | |
| "loss": 0.1652, | |
| "mean_token_accuracy": 0.9446959733963013, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.8623676612127045, | |
| "grad_norm": 0.1091977757085664, | |
| "learning_rate": 3.403818721284496e-05, | |
| "loss": 0.1655, | |
| "mean_token_accuracy": 0.9448766052722931, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 1.867179980750722, | |
| "grad_norm": 0.11446565127578058, | |
| "learning_rate": 3.394770780746197e-05, | |
| "loss": 0.1658, | |
| "mean_token_accuracy": 0.9449751198291778, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.8719923002887393, | |
| "grad_norm": 0.11937508003370997, | |
| "learning_rate": 3.385711466521239e-05, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9442361891269684, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 1.8768046198267565, | |
| "grad_norm": 0.11722536277062336, | |
| "learning_rate": 3.376640938414942e-05, | |
| "loss": 0.1668, | |
| "mean_token_accuracy": 0.9444027721881867, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.8816169393647737, | |
| "grad_norm": 0.11586936450715188, | |
| "learning_rate": 3.3675593564304375e-05, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.944935929775238, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 1.8864292589027911, | |
| "grad_norm": 0.11314135073226646, | |
| "learning_rate": 3.358466880765849e-05, | |
| "loss": 0.1685, | |
| "mean_token_accuracy": 0.9441147804260254, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.8912415784408085, | |
| "grad_norm": 0.11026912567729032, | |
| "learning_rate": 3.349363671811458e-05, | |
| "loss": 0.1661, | |
| "mean_token_accuracy": 0.9442423999309539, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 1.8960538979788257, | |
| "grad_norm": 0.10876863525671715, | |
| "learning_rate": 3.340249890146887e-05, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.945069283246994, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.9008662175168431, | |
| "grad_norm": 0.1096943148433995, | |
| "learning_rate": 3.331125696538254e-05, | |
| "loss": 0.1642, | |
| "mean_token_accuracy": 0.9451614439487457, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 1.9056785370548606, | |
| "grad_norm": 0.10876585563534898, | |
| "learning_rate": 3.3219912519353464e-05, | |
| "loss": 0.164, | |
| "mean_token_accuracy": 0.9452253878116608, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.9104908565928778, | |
| "grad_norm": 0.11260774105345743, | |
| "learning_rate": 3.312846717468774e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9448507964611054, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 1.915303176130895, | |
| "grad_norm": 0.10669819797390323, | |
| "learning_rate": 3.3036922544471365e-05, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9453028261661529, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.9201154956689124, | |
| "grad_norm": 0.119215402513875, | |
| "learning_rate": 3.294528024354168e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.94445042014122, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 1.9249278152069298, | |
| "grad_norm": 0.11681227846316306, | |
| "learning_rate": 3.285354188845892e-05, | |
| "loss": 0.1633, | |
| "mean_token_accuracy": 0.945355623960495, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.929740134744947, | |
| "grad_norm": 0.11525776366425255, | |
| "learning_rate": 3.276170909747774e-05, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.9459958136081695, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 1.9345524542829644, | |
| "grad_norm": 0.11372041546524679, | |
| "learning_rate": 3.266978349051861e-05, | |
| "loss": 0.1696, | |
| "mean_token_accuracy": 0.9437148630619049, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.9393647738209818, | |
| "grad_norm": 0.11075885587789694, | |
| "learning_rate": 3.257776668913927e-05, | |
| "loss": 0.1667, | |
| "mean_token_accuracy": 0.9443390965461731, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 1.944177093358999, | |
| "grad_norm": 0.11605852104454013, | |
| "learning_rate": 3.248566031650611e-05, | |
| "loss": 0.1663, | |
| "mean_token_accuracy": 0.944392466545105, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.9489894128970162, | |
| "grad_norm": 0.116556530528732, | |
| "learning_rate": 3.2393465997365566e-05, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9447161555290222, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 1.9538017324350336, | |
| "grad_norm": 0.11514799635856185, | |
| "learning_rate": 3.230118535801543e-05, | |
| "loss": 0.1675, | |
| "mean_token_accuracy": 0.9445320844650269, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.958614051973051, | |
| "grad_norm": 0.12089850815755274, | |
| "learning_rate": 3.220882002627617e-05, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9446768760681152, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 1.9634263715110682, | |
| "grad_norm": 0.1113011067340702, | |
| "learning_rate": 3.2116371631462226e-05, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.9445036470890045, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.9682386910490857, | |
| "grad_norm": 0.10605433365354906, | |
| "learning_rate": 3.202384180435326e-05, | |
| "loss": 0.1645, | |
| "mean_token_accuracy": 0.9452289998531341, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 1.973051010587103, | |
| "grad_norm": 0.10750431601721744, | |
| "learning_rate": 3.193123217716538e-05, | |
| "loss": 0.1642, | |
| "mean_token_accuracy": 0.945152896642685, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.9778633301251203, | |
| "grad_norm": 0.10653069267106692, | |
| "learning_rate": 3.183854438352239e-05, | |
| "loss": 0.1649, | |
| "mean_token_accuracy": 0.9449185371398926, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 1.9826756496631375, | |
| "grad_norm": 0.1134357628891207, | |
| "learning_rate": 3.1745780058426885e-05, | |
| "loss": 0.1666, | |
| "mean_token_accuracy": 0.9449750900268554, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.9874879692011551, | |
| "grad_norm": 0.1154033064870012, | |
| "learning_rate": 3.165294083823152e-05, | |
| "loss": 0.1647, | |
| "mean_token_accuracy": 0.9452174723148346, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 1.9923002887391723, | |
| "grad_norm": 0.11275165982081663, | |
| "learning_rate": 3.156002836061008e-05, | |
| "loss": 0.166, | |
| "mean_token_accuracy": 0.9443567633628845, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.9971126082771895, | |
| "grad_norm": 0.10756809687507227, | |
| "learning_rate": 3.1467044264528595e-05, | |
| "loss": 0.1625, | |
| "mean_token_accuracy": 0.9458874106407166, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 2.001924927815207, | |
| "grad_norm": 0.11995518101376783, | |
| "learning_rate": 3.137399019021642e-05, | |
| "loss": 0.1515, | |
| "mean_token_accuracy": 0.9487068593502045, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 2.0067372473532243, | |
| "grad_norm": 0.13126034959811075, | |
| "learning_rate": 3.128086777913736e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9544493913650512, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 2.0115495668912415, | |
| "grad_norm": 0.12749044278087585, | |
| "learning_rate": 3.118767867396063e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9546238481998444, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 2.0163618864292587, | |
| "grad_norm": 0.1203501305881084, | |
| "learning_rate": 3.1094424518531944e-05, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9548863470554352, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 2.0211742059672764, | |
| "grad_norm": 0.11995393096689302, | |
| "learning_rate": 3.1001106957844494e-05, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.954364675283432, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.0259865255052936, | |
| "grad_norm": 0.11741824190303445, | |
| "learning_rate": 3.090772763800994e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9547434866428375, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 2.0307988450433108, | |
| "grad_norm": 0.12145334367360447, | |
| "learning_rate": 3.081428820622935e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9551793098449707, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 2.0356111645813284, | |
| "grad_norm": 0.12062129976196787, | |
| "learning_rate": 3.072079031076416e-05, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9553466737270355, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 2.0404234841193456, | |
| "grad_norm": 0.12138707446174458, | |
| "learning_rate": 3.062723560090714e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9546328842639923, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 2.045235803657363, | |
| "grad_norm": 0.13067793450163243, | |
| "learning_rate": 3.053362572695319e-05, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.954385656118393, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 2.05004812319538, | |
| "grad_norm": 0.12252436597993463, | |
| "learning_rate": 3.0439962340170362e-05, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9546407461166382, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 2.0548604427333976, | |
| "grad_norm": 0.11818275944352087, | |
| "learning_rate": 3.0346247092770636e-05, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9543164849281311, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 2.059672762271415, | |
| "grad_norm": 0.12553899967672977, | |
| "learning_rate": 3.0252481637880807e-05, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9537067830562591, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 2.064485081809432, | |
| "grad_norm": 0.11783222687235054, | |
| "learning_rate": 3.0158667629513344e-05, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.9547268807888031, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 2.0692974013474497, | |
| "grad_norm": 0.11841606151360683, | |
| "learning_rate": 3.0064806722537163e-05, | |
| "loss": 0.129, | |
| "mean_token_accuracy": 0.9556107759475708, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 2.074109720885467, | |
| "grad_norm": 0.1247427528305096, | |
| "learning_rate": 2.9970900572648514e-05, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.953337824344635, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 2.078922040423484, | |
| "grad_norm": 0.13814870657022443, | |
| "learning_rate": 2.9876950836341676e-05, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9548674285411834, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 2.0837343599615012, | |
| "grad_norm": 0.12018419603985044, | |
| "learning_rate": 2.978295917087982e-05, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9541543126106262, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 2.088546679499519, | |
| "grad_norm": 0.11631435176719744, | |
| "learning_rate": 2.9688927234265735e-05, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9555166900157929, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 2.093358999037536, | |
| "grad_norm": 0.12403202973683218, | |
| "learning_rate": 2.9594856685212574e-05, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9536327719688416, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 2.0981713185755533, | |
| "grad_norm": 0.12539378204104595, | |
| "learning_rate": 2.950074918311464e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9545833170413971, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 2.102983638113571, | |
| "grad_norm": 0.12614523878434605, | |
| "learning_rate": 2.940660638801806e-05, | |
| "loss": 0.1338, | |
| "mean_token_accuracy": 0.9542303681373596, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 2.107795957651588, | |
| "grad_norm": 0.12505796700518418, | |
| "learning_rate": 2.9312429960591524e-05, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.9551637351512909, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 2.1126082771896053, | |
| "grad_norm": 0.12733399173088358, | |
| "learning_rate": 2.9218221562097005e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9546559393405915, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 2.1174205967276225, | |
| "grad_norm": 0.11864892364792179, | |
| "learning_rate": 2.9123982854360438e-05, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.954675143957138, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.12223291626564, | |
| "grad_norm": 0.12061236159389063, | |
| "learning_rate": 2.902971549974241e-05, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9549073457717896, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 2.1270452358036573, | |
| "grad_norm": 0.1140867992691015, | |
| "learning_rate": 2.8935421161108833e-05, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9544546484947205, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 2.1318575553416745, | |
| "grad_norm": 0.12105327381783884, | |
| "learning_rate": 2.884110150180162e-05, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9547511756420135, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 2.136669874879692, | |
| "grad_norm": 0.12717876151832444, | |
| "learning_rate": 2.874675818560933e-05, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9547945916652679, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 2.1414821944177094, | |
| "grad_norm": 0.12426466051142133, | |
| "learning_rate": 2.8652392876737825e-05, | |
| "loss": 0.1336, | |
| "mean_token_accuracy": 0.9541352748870849, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 2.1462945139557266, | |
| "grad_norm": 0.11867330598443876, | |
| "learning_rate": 2.8558007239780932e-05, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9549016654491425, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 2.1511068334937438, | |
| "grad_norm": 0.12291203501683998, | |
| "learning_rate": 2.846360293969106e-05, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9538123309612274, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 2.1559191530317614, | |
| "grad_norm": 0.12513626732710528, | |
| "learning_rate": 2.836918164174981e-05, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.9533581078052521, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 2.1607314725697786, | |
| "grad_norm": 0.11973737934728648, | |
| "learning_rate": 2.827474501153864e-05, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.9549562215805054, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 2.165543792107796, | |
| "grad_norm": 0.12149824800768187, | |
| "learning_rate": 2.818029471490947e-05, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9550965666770935, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.1703561116458134, | |
| "grad_norm": 0.12103926256332564, | |
| "learning_rate": 2.8085832417955305e-05, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9543343603610992, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 2.1751684311838306, | |
| "grad_norm": 0.12072169041620741, | |
| "learning_rate": 2.7991359786980797e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9544179022312165, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 2.179980750721848, | |
| "grad_norm": 0.12251282879748909, | |
| "learning_rate": 2.7896878488472927e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9548408627510071, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 2.1847930702598655, | |
| "grad_norm": 0.11789609733615358, | |
| "learning_rate": 2.7802390189071563e-05, | |
| "loss": 0.1336, | |
| "mean_token_accuracy": 0.9543514966964721, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 2.1896053897978827, | |
| "grad_norm": 0.12307329297713238, | |
| "learning_rate": 2.770789655554005e-05, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9544746696949005, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 2.1944177093359, | |
| "grad_norm": 0.12315082015293093, | |
| "learning_rate": 2.7613399254735827e-05, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9548600077629089, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 2.199230028873917, | |
| "grad_norm": 0.12374492854826523, | |
| "learning_rate": 2.751889995358106e-05, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9542760193347931, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 2.2040423484119347, | |
| "grad_norm": 0.12977646870439685, | |
| "learning_rate": 2.7424400319033155e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9544100821018219, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 2.208854667949952, | |
| "grad_norm": 0.12285135892142031, | |
| "learning_rate": 2.7329902018055425e-05, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9539024710655213, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 2.213666987487969, | |
| "grad_norm": 0.1254327075340909, | |
| "learning_rate": 2.7235406717587658e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9546830892562866, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.2184793070259867, | |
| "grad_norm": 0.11797249195080454, | |
| "learning_rate": 2.71409160845167e-05, | |
| "loss": 0.132, | |
| "mean_token_accuracy": 0.9544696629047393, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 2.223291626564004, | |
| "grad_norm": 0.12371441389507953, | |
| "learning_rate": 2.704643178564707e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9546551823616027, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 2.228103946102021, | |
| "grad_norm": 0.12142821449106762, | |
| "learning_rate": 2.695195548767157e-05, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9541204988956451, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 2.2329162656400383, | |
| "grad_norm": 0.1220023124678011, | |
| "learning_rate": 2.6857488857141837e-05, | |
| "loss": 0.1346, | |
| "mean_token_accuracy": 0.9537499129772187, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 2.237728585178056, | |
| "grad_norm": 0.12251837181483602, | |
| "learning_rate": 2.6763033560439005e-05, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9538609445095062, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 2.242540904716073, | |
| "grad_norm": 0.12521614366145867, | |
| "learning_rate": 2.6668591263744246e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9543049156665802, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 2.2473532242540903, | |
| "grad_norm": 0.12236642476577748, | |
| "learning_rate": 2.6574163633009445e-05, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9554842472076416, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 2.252165543792108, | |
| "grad_norm": 0.1201187449624429, | |
| "learning_rate": 2.6479752333927776e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9545388877391815, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 2.256977863330125, | |
| "grad_norm": 0.11859918661445157, | |
| "learning_rate": 2.6385359031904307e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9541272759437561, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 2.2617901828681424, | |
| "grad_norm": 0.11988864935433534, | |
| "learning_rate": 2.629098539202665e-05, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.953696221113205, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 2.26660250240616, | |
| "grad_norm": 0.1228157380029682, | |
| "learning_rate": 2.6196633079035593e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9543487191200256, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 2.271414821944177, | |
| "grad_norm": 0.12501355645936355, | |
| "learning_rate": 2.6102303757295692e-05, | |
| "loss": 0.1354, | |
| "mean_token_accuracy": 0.9536598801612854, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 2.2762271414821944, | |
| "grad_norm": 0.12161791196507869, | |
| "learning_rate": 2.600799909076596e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9543259024620057, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 2.2810394610202116, | |
| "grad_norm": 0.12237945594083449, | |
| "learning_rate": 2.5913720742970495e-05, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9544833540916443, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.2858517805582292, | |
| "grad_norm": 0.1185880721412727, | |
| "learning_rate": 2.5819470376969117e-05, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9542553067207337, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 2.2906641000962464, | |
| "grad_norm": 0.11983605172847067, | |
| "learning_rate": 2.5725249655328066e-05, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9553394496440888, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.2954764196342636, | |
| "grad_norm": 0.11809567235718993, | |
| "learning_rate": 2.5631060240090665e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9548580408096313, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 2.300288739172281, | |
| "grad_norm": 0.12533309263229675, | |
| "learning_rate": 2.553690379274798e-05, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9553009092807769, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.3051010587102985, | |
| "grad_norm": 0.12024247123243283, | |
| "learning_rate": 2.5442781974209524e-05, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9538061857223511, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 2.3099133782483157, | |
| "grad_norm": 0.12215267551637267, | |
| "learning_rate": 2.5348696444773984e-05, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9542751133441925, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.314725697786333, | |
| "grad_norm": 0.12017720162702422, | |
| "learning_rate": 2.525464886409989e-05, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.954441887140274, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 2.3195380173243505, | |
| "grad_norm": 0.12439356484344923, | |
| "learning_rate": 2.5160640891176368e-05, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9547025561332703, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.3243503368623677, | |
| "grad_norm": 0.1197568786206684, | |
| "learning_rate": 2.5066674184293872e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9541660666465759, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 2.329162656400385, | |
| "grad_norm": 0.1304024226672097, | |
| "learning_rate": 2.4972750401014917e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9548667967319489, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.3339749759384025, | |
| "grad_norm": 0.11948583613983592, | |
| "learning_rate": 2.4878871198144866e-05, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9540335357189178, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 2.3387872954764197, | |
| "grad_norm": 0.11940412537821125, | |
| "learning_rate": 2.4785038231702662e-05, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9552293717861176, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.343599615014437, | |
| "grad_norm": 0.11983395261469829, | |
| "learning_rate": 2.469125315689167e-05, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9548083424568177, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 2.348411934552454, | |
| "grad_norm": 0.11871931678738833, | |
| "learning_rate": 2.4597517628070434e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9544838011264801, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.3532242540904718, | |
| "grad_norm": 0.1156881640910327, | |
| "learning_rate": 2.450383329872352e-05, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9555284321308136, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 2.358036573628489, | |
| "grad_norm": 0.12492566645639688, | |
| "learning_rate": 2.4410201821432344e-05, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9538214087486268, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.362848893166506, | |
| "grad_norm": 0.1238103003195217, | |
| "learning_rate": 2.431662484784601e-05, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.9543547987937927, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 2.3676612127045233, | |
| "grad_norm": 0.11922211094290269, | |
| "learning_rate": 2.4223104028652187e-05, | |
| "loss": 0.1332, | |
| "mean_token_accuracy": 0.9542674422264099, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.372473532242541, | |
| "grad_norm": 0.12221924952360166, | |
| "learning_rate": 2.4129641013547976e-05, | |
| "loss": 0.1344, | |
| "mean_token_accuracy": 0.9539968192577362, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 2.377285851780558, | |
| "grad_norm": 0.12074665033607256, | |
| "learning_rate": 2.4036237451210853e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9551043808460236, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.3820981713185754, | |
| "grad_norm": 0.12578342435533066, | |
| "learning_rate": 2.3942894989269506e-05, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9543434858322144, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 2.386910490856593, | |
| "grad_norm": 0.11981499544239471, | |
| "learning_rate": 2.3849615274274846e-05, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9547614216804504, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.39172281039461, | |
| "grad_norm": 0.11663272374038669, | |
| "learning_rate": 2.3756399951670945e-05, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.955213874578476, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 2.3965351299326274, | |
| "grad_norm": 0.1256512952494838, | |
| "learning_rate": 2.366325066576597e-05, | |
| "loss": 0.135, | |
| "mean_token_accuracy": 0.9534781336784363, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.401347449470645, | |
| "grad_norm": 0.11923923169952254, | |
| "learning_rate": 2.3570169059703228e-05, | |
| "loss": 0.1347, | |
| "mean_token_accuracy": 0.9539281606674195, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 2.4061597690086622, | |
| "grad_norm": 0.12212883758639682, | |
| "learning_rate": 2.3477156775432167e-05, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.9542208135128021, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.4109720885466794, | |
| "grad_norm": 0.1185404971188421, | |
| "learning_rate": 2.3384215453679388e-05, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.9548238515853882, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 2.4157844080846966, | |
| "grad_norm": 0.12058775798481192, | |
| "learning_rate": 2.3291346733919728e-05, | |
| "loss": 0.1357, | |
| "mean_token_accuracy": 0.9534320294857025, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.4205967276227143, | |
| "grad_norm": 0.12607534374083526, | |
| "learning_rate": 2.319855225434734e-05, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9555575132369996, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 2.4254090471607315, | |
| "grad_norm": 0.12491353176028583, | |
| "learning_rate": 2.3105833651846776e-05, | |
| "loss": 0.1293, | |
| "mean_token_accuracy": 0.9553836643695831, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.4302213666987487, | |
| "grad_norm": 0.12110440871837368, | |
| "learning_rate": 2.301319256196414e-05, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9548416078090668, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 2.4350336862367663, | |
| "grad_norm": 0.1192368917950234, | |
| "learning_rate": 2.2920630618878203e-05, | |
| "loss": 0.1347, | |
| "mean_token_accuracy": 0.9538196146488189, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.4398460057747835, | |
| "grad_norm": 0.12679556793985397, | |
| "learning_rate": 2.2828149455371607e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9546487092971802, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 2.4446583253128007, | |
| "grad_norm": 0.12398988764990418, | |
| "learning_rate": 2.273575070280204e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9545963108539581, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.449470644850818, | |
| "grad_norm": 0.11817459353486662, | |
| "learning_rate": 2.2643435991073485e-05, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9560497164726257, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 2.4542829643888355, | |
| "grad_norm": 0.11719967304458964, | |
| "learning_rate": 2.2551206948607466e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9542776644229889, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.4590952839268527, | |
| "grad_norm": 0.11968805967183985, | |
| "learning_rate": 2.245906520231426e-05, | |
| "loss": 0.1327, | |
| "mean_token_accuracy": 0.9545498311519622, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 2.46390760346487, | |
| "grad_norm": 0.13515626783628001, | |
| "learning_rate": 2.23670123775643e-05, | |
| "loss": 0.1343, | |
| "mean_token_accuracy": 0.954009610414505, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.4687199230028876, | |
| "grad_norm": 0.11891683688757336, | |
| "learning_rate": 2.227505009815943e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9545702993869781, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 2.4735322425409048, | |
| "grad_norm": 0.11895716330205122, | |
| "learning_rate": 2.218317998630428e-05, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9552351653575897, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.478344562078922, | |
| "grad_norm": 0.12229913088965033, | |
| "learning_rate": 2.209140366257767e-05, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9549697756767273, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 2.4831568816169396, | |
| "grad_norm": 0.11984226996590863, | |
| "learning_rate": 2.1999722745904006e-05, | |
| "loss": 0.1302, | |
| "mean_token_accuracy": 0.9551658987998962, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.487969201154957, | |
| "grad_norm": 0.1213679229681783, | |
| "learning_rate": 2.1908138853524694e-05, | |
| "loss": 0.131, | |
| "mean_token_accuracy": 0.9548961997032166, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 2.492781520692974, | |
| "grad_norm": 0.1191811385186817, | |
| "learning_rate": 2.181665360096969e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.954352217912674, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.497593840230991, | |
| "grad_norm": 0.12673082212489745, | |
| "learning_rate": 2.1725268602028914e-05, | |
| "loss": 0.1323, | |
| "mean_token_accuracy": 0.9546513199806214, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 2.502406159769009, | |
| "grad_norm": 0.1153806459155593, | |
| "learning_rate": 2.1633985468723837e-05, | |
| "loss": 0.1342, | |
| "mean_token_accuracy": 0.9541236937046051, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.507218479307026, | |
| "grad_norm": 0.11890978449870844, | |
| "learning_rate": 2.1542805811279043e-05, | |
| "loss": 0.1321, | |
| "mean_token_accuracy": 0.9546476840972901, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 2.512030798845043, | |
| "grad_norm": 0.12518430159186708, | |
| "learning_rate": 2.1451731238093797e-05, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9550606966018677, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.5168431183830604, | |
| "grad_norm": 0.11994883336111889, | |
| "learning_rate": 2.1360763355713698e-05, | |
| "loss": 0.1337, | |
| "mean_token_accuracy": 0.9542861402034759, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 2.521655437921078, | |
| "grad_norm": 0.12069231025941671, | |
| "learning_rate": 2.126990376880233e-05, | |
| "loss": 0.1349, | |
| "mean_token_accuracy": 0.9535523414611816, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.5264677574590952, | |
| "grad_norm": 0.12174746467176746, | |
| "learning_rate": 2.1179154080112938e-05, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9554234504699707, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 2.5312800769971124, | |
| "grad_norm": 0.12177976395856156, | |
| "learning_rate": 2.108851589046022e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.954600042104721, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.53609239653513, | |
| "grad_norm": 0.11481111490917931, | |
| "learning_rate": 2.0997990798691985e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9550839960575104, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 2.5409047160731473, | |
| "grad_norm": 0.1213261291958557, | |
| "learning_rate": 2.0907580401661043e-05, | |
| "loss": 0.1348, | |
| "mean_token_accuracy": 0.9538499236106872, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.5457170356111645, | |
| "grad_norm": 0.12150988254524177, | |
| "learning_rate": 2.0817286294196995e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9548084199428558, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 2.550529355149182, | |
| "grad_norm": 0.12319521032962352, | |
| "learning_rate": 2.072711006907812e-05, | |
| "loss": 0.1338, | |
| "mean_token_accuracy": 0.954076099395752, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.5553416746871993, | |
| "grad_norm": 0.116593792548146, | |
| "learning_rate": 2.0637053317003237e-05, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9548590064048768, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 2.5601539942252165, | |
| "grad_norm": 0.1159157325715896, | |
| "learning_rate": 2.054711762656369e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9546443939208984, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.564966313763234, | |
| "grad_norm": 0.11494826323790959, | |
| "learning_rate": 2.045730458421531e-05, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9559885859489441, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 2.5697786333012513, | |
| "grad_norm": 0.12303007277935378, | |
| "learning_rate": 2.0367615774250414e-05, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9548058807849884, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.5745909528392685, | |
| "grad_norm": 0.1179943878551545, | |
| "learning_rate": 2.02780527787699e-05, | |
| "loss": 0.1312, | |
| "mean_token_accuracy": 0.9550620734691619, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 2.5794032723772857, | |
| "grad_norm": 0.11603865591696458, | |
| "learning_rate": 2.0188617177655296e-05, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9550534367561341, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.584215591915303, | |
| "grad_norm": 0.11771963641203456, | |
| "learning_rate": 2.0099310548540895e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9550515651702881, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 2.5890279114533206, | |
| "grad_norm": 0.12094444658718573, | |
| "learning_rate": 2.0010134466785962e-05, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9544416069984436, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.5938402309913378, | |
| "grad_norm": 0.11944102441666446, | |
| "learning_rate": 1.992109050544691e-05, | |
| "loss": 0.1333, | |
| "mean_token_accuracy": 0.9542948365211487, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 2.598652550529355, | |
| "grad_norm": 0.12457573247179644, | |
| "learning_rate": 1.9832180235249552e-05, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9547983348369599, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.6034648700673726, | |
| "grad_norm": 0.12417443201330976, | |
| "learning_rate": 1.9743405224561423e-05, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9556341528892517, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 2.60827718960539, | |
| "grad_norm": 0.12698239548171042, | |
| "learning_rate": 1.965476703936409e-05, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9548296809196473, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.613089509143407, | |
| "grad_norm": 0.11986249667382091, | |
| "learning_rate": 1.9566267243225517e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9545009911060334, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 2.6179018286814246, | |
| "grad_norm": 0.11786538819009267, | |
| "learning_rate": 1.9477907397272514e-05, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9544251382350921, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.622714148219442, | |
| "grad_norm": 0.11630194583469736, | |
| "learning_rate": 1.9389689060163197e-05, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9547665357589722, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 2.627526467757459, | |
| "grad_norm": 0.12116313019151212, | |
| "learning_rate": 1.930161378805944e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9545979201793671, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.6323387872954767, | |
| "grad_norm": 0.12109833343844571, | |
| "learning_rate": 1.921368313459953e-05, | |
| "loss": 0.1326, | |
| "mean_token_accuracy": 0.9542627811431885, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 2.637151106833494, | |
| "grad_norm": 0.12350690739307885, | |
| "learning_rate": 1.912589865087062e-05, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9553773999214172, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.641963426371511, | |
| "grad_norm": 0.11774736413362642, | |
| "learning_rate": 1.9038261885381507e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9551102817058563, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 2.6467757459095282, | |
| "grad_norm": 0.12012526176245106, | |
| "learning_rate": 1.8950774384035235e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.954530930519104, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.651588065447546, | |
| "grad_norm": 0.11913471446894865, | |
| "learning_rate": 1.8863437690101826e-05, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9552291572093964, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 2.656400384985563, | |
| "grad_norm": 0.11498496701684938, | |
| "learning_rate": 1.8776253344191096e-05, | |
| "loss": 0.1303, | |
| "mean_token_accuracy": 0.9551547944545746, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.6612127045235803, | |
| "grad_norm": 0.12241409474268447, | |
| "learning_rate": 1.8689222884225467e-05, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9549719274044037, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 2.6660250240615975, | |
| "grad_norm": 0.12406911336736134, | |
| "learning_rate": 1.86023478454128e-05, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.9548561632633209, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.670837343599615, | |
| "grad_norm": 0.11836599415476366, | |
| "learning_rate": 1.851562976021936e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9545220673084259, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 2.6756496631376323, | |
| "grad_norm": 0.12782652731593716, | |
| "learning_rate": 1.842907015834278e-05, | |
| "loss": 0.1274, | |
| "mean_token_accuracy": 0.956011027097702, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.6804619826756495, | |
| "grad_norm": 0.12796705831424773, | |
| "learning_rate": 1.8342670566685043e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.954570335149765, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 2.685274302213667, | |
| "grad_norm": 0.12212136034586325, | |
| "learning_rate": 1.825643250932557e-05, | |
| "loss": 0.133, | |
| "mean_token_accuracy": 0.9543835699558259, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.6900866217516843, | |
| "grad_norm": 0.1163613281386537, | |
| "learning_rate": 1.8170357507494363e-05, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9552183389663697, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 2.6948989412897015, | |
| "grad_norm": 0.12438927943038487, | |
| "learning_rate": 1.8084447079545096e-05, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.9555165588855743, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.699711260827719, | |
| "grad_norm": 0.12095904467852418, | |
| "learning_rate": 1.799870274092842e-05, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9562630355358124, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 2.7045235803657364, | |
| "grad_norm": 0.11740817698868464, | |
| "learning_rate": 1.791312600416517e-05, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9548931837081909, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.7093358999037536, | |
| "grad_norm": 0.11576958719402669, | |
| "learning_rate": 1.78277183788197e-05, | |
| "loss": 0.1309, | |
| "mean_token_accuracy": 0.9549652457237243, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 2.714148219441771, | |
| "grad_norm": 0.12081456376946226, | |
| "learning_rate": 1.774248137147325e-05, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9547723591327667, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.7189605389797884, | |
| "grad_norm": 0.11918991844055905, | |
| "learning_rate": 1.7657416485697408e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.954517936706543, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 2.7237728585178056, | |
| "grad_norm": 0.11400005611412717, | |
| "learning_rate": 1.7572525222027515e-05, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9553654193878174, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.728585178055823, | |
| "grad_norm": 0.11304322067029585, | |
| "learning_rate": 1.7487809077936277e-05, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9552641093730927, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 2.73339749759384, | |
| "grad_norm": 0.11301365559101982, | |
| "learning_rate": 1.7403269547807285e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9547658503055573, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.7382098171318576, | |
| "grad_norm": 0.1289578081000946, | |
| "learning_rate": 1.7318908122908668e-05, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.9542883634567261, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 2.743022136669875, | |
| "grad_norm": 0.1211101774849991, | |
| "learning_rate": 1.7234726291366826e-05, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9551473438739777, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.747834456207892, | |
| "grad_norm": 0.11540967878921071, | |
| "learning_rate": 1.715072553814014e-05, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9542284786701203, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 2.7526467757459097, | |
| "grad_norm": 0.11928698419009154, | |
| "learning_rate": 1.7066907344992782e-05, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9548710584640503, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.757459095283927, | |
| "grad_norm": 0.11854321731885047, | |
| "learning_rate": 1.69832731904686e-05, | |
| "loss": 0.1319, | |
| "mean_token_accuracy": 0.9545396983623504, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 2.762271414821944, | |
| "grad_norm": 0.12016173759646688, | |
| "learning_rate": 1.6899824549865007e-05, | |
| "loss": 0.1335, | |
| "mean_token_accuracy": 0.9543656527996063, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.7670837343599617, | |
| "grad_norm": 0.12323213427325587, | |
| "learning_rate": 1.6816562895206967e-05, | |
| "loss": 0.1314, | |
| "mean_token_accuracy": 0.954924327135086, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 2.771896053897979, | |
| "grad_norm": 0.11513684204157092, | |
| "learning_rate": 1.6733489695221056e-05, | |
| "loss": 0.1288, | |
| "mean_token_accuracy": 0.9557023704051971, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.776708373435996, | |
| "grad_norm": 0.11850590308838514, | |
| "learning_rate": 1.6650606415309506e-05, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9554703533649445, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 2.7815206929740137, | |
| "grad_norm": 0.11405829079595443, | |
| "learning_rate": 1.6567914517524384e-05, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9548174798488617, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.786333012512031, | |
| "grad_norm": 0.12349923492866052, | |
| "learning_rate": 1.6485415460541806e-05, | |
| "loss": 0.127, | |
| "mean_token_accuracy": 0.9565874934196472, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 2.791145332050048, | |
| "grad_norm": 0.12093558495792264, | |
| "learning_rate": 1.6403110699636193e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9547248542308807, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.7959576515880653, | |
| "grad_norm": 0.1213540234718498, | |
| "learning_rate": 1.6321001686654592e-05, | |
| "loss": 0.1343, | |
| "mean_token_accuracy": 0.9541695833206176, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 2.8007699711260825, | |
| "grad_norm": 0.12328272236609271, | |
| "learning_rate": 1.6239089869991106e-05, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9555430829524993, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.8055822906641, | |
| "grad_norm": 0.12468836310253578, | |
| "learning_rate": 1.6157376694561287e-05, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9552483201026917, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 2.8103946102021173, | |
| "grad_norm": 0.1232970732215795, | |
| "learning_rate": 1.6075863601776687e-05, | |
| "loss": 0.1306, | |
| "mean_token_accuracy": 0.9553005278110505, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.8152069297401345, | |
| "grad_norm": 0.11658647271805182, | |
| "learning_rate": 1.599455202951944e-05, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.95576052069664, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 2.820019249278152, | |
| "grad_norm": 0.12097179899490057, | |
| "learning_rate": 1.5913443412116847e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9550281763076782, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.8248315688161694, | |
| "grad_norm": 0.12495077615468701, | |
| "learning_rate": 1.5832539180316128e-05, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.9550132036209107, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 2.8296438883541866, | |
| "grad_norm": 0.11888535072349493, | |
| "learning_rate": 1.5751840761259172e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9551308155059814, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.834456207892204, | |
| "grad_norm": 0.122716580499259, | |
| "learning_rate": 1.5671349578457327e-05, | |
| "loss": 0.1308, | |
| "mean_token_accuracy": 0.9550053656101227, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 2.8392685274302214, | |
| "grad_norm": 0.12076581715834696, | |
| "learning_rate": 1.559106705176634e-05, | |
| "loss": 0.1298, | |
| "mean_token_accuracy": 0.955363392829895, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.8440808469682386, | |
| "grad_norm": 0.11663337272165107, | |
| "learning_rate": 1.5510994597361263e-05, | |
| "loss": 0.1313, | |
| "mean_token_accuracy": 0.9548046112060546, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 2.8488931665062562, | |
| "grad_norm": 0.11910310423944706, | |
| "learning_rate": 1.543113362771152e-05, | |
| "loss": 0.1315, | |
| "mean_token_accuracy": 0.9550003468990326, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.8537054860442734, | |
| "grad_norm": 0.1220170312626907, | |
| "learning_rate": 1.5351485551555955e-05, | |
| "loss": 0.1304, | |
| "mean_token_accuracy": 0.955298799276352, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 2.8585178055822906, | |
| "grad_norm": 0.1213926832580786, | |
| "learning_rate": 1.5272051773877996e-05, | |
| "loss": 0.1302, | |
| "mean_token_accuracy": 0.9553127706050872, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.863330125120308, | |
| "grad_norm": 0.11524798419356376, | |
| "learning_rate": 1.519283369588086e-05, | |
| "loss": 0.1255, | |
| "mean_token_accuracy": 0.9566778540611267, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 2.8681424446583255, | |
| "grad_norm": 0.12415689999988921, | |
| "learning_rate": 1.5113832714962867e-05, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9551591515541077, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.8729547641963427, | |
| "grad_norm": 0.12207041079872989, | |
| "learning_rate": 1.5035050224692746e-05, | |
| "loss": 0.1311, | |
| "mean_token_accuracy": 0.955256050825119, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 2.87776708373436, | |
| "grad_norm": 0.12011719633558507, | |
| "learning_rate": 1.4956487614785076e-05, | |
| "loss": 0.1294, | |
| "mean_token_accuracy": 0.9556836128234864, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.882579403272377, | |
| "grad_norm": 0.117683592777178, | |
| "learning_rate": 1.4878146271075793e-05, | |
| "loss": 0.1292, | |
| "mean_token_accuracy": 0.9556024372577667, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 2.8873917228103947, | |
| "grad_norm": 0.11754513539176431, | |
| "learning_rate": 1.4800027575497699e-05, | |
| "loss": 0.1296, | |
| "mean_token_accuracy": 0.9553449332714081, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.892204042348412, | |
| "grad_norm": 0.11765635495599021, | |
| "learning_rate": 1.4722132906056102e-05, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9554186820983886, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 2.897016361886429, | |
| "grad_norm": 0.11794482192224673, | |
| "learning_rate": 1.4644463636804546e-05, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9553541541099548, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.9018286814244467, | |
| "grad_norm": 0.12339637491154125, | |
| "learning_rate": 1.4567021137820506e-05, | |
| "loss": 0.1291, | |
| "mean_token_accuracy": 0.9557243764400483, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 2.906641000962464, | |
| "grad_norm": 0.12052468676544233, | |
| "learning_rate": 1.4489806775181261e-05, | |
| "loss": 0.1305, | |
| "mean_token_accuracy": 0.9551128268241882, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.911453320500481, | |
| "grad_norm": 0.11728982379711779, | |
| "learning_rate": 1.4412821910939814e-05, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9554867327213288, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 2.9162656400384988, | |
| "grad_norm": 0.12074144590804294, | |
| "learning_rate": 1.4336067903100809e-05, | |
| "loss": 0.1283, | |
| "mean_token_accuracy": 0.9561033308506012, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.921077959576516, | |
| "grad_norm": 0.11856500442809872, | |
| "learning_rate": 1.4259546105596616e-05, | |
| "loss": 0.1322, | |
| "mean_token_accuracy": 0.9547417223453522, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 2.925890279114533, | |
| "grad_norm": 0.12090286265287456, | |
| "learning_rate": 1.4183257868263463e-05, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9557977437973022, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.930702598652551, | |
| "grad_norm": 0.12084014482195214, | |
| "learning_rate": 1.4107204536817581e-05, | |
| "loss": 0.1278, | |
| "mean_token_accuracy": 0.9559150397777557, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 2.935514918190568, | |
| "grad_norm": 0.12988839766790128, | |
| "learning_rate": 1.403138745283148e-05, | |
| "loss": 0.1397, | |
| "mean_token_accuracy": 0.953763622045517, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.940327237728585, | |
| "grad_norm": 0.12807973603559855, | |
| "learning_rate": 1.395580795371031e-05, | |
| "loss": 0.1281, | |
| "mean_token_accuracy": 0.9558646023273468, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 2.9451395572666024, | |
| "grad_norm": 0.13171881809754798, | |
| "learning_rate": 1.3880467372668227e-05, | |
| "loss": 0.1318, | |
| "mean_token_accuracy": 0.9546797037124634, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.9499518768046196, | |
| "grad_norm": 0.1481482959862089, | |
| "learning_rate": 1.3805367038704928e-05, | |
| "loss": 0.1324, | |
| "mean_token_accuracy": 0.9545677185058594, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 2.954764196342637, | |
| "grad_norm": 0.11590529541262443, | |
| "learning_rate": 1.3730508276582155e-05, | |
| "loss": 0.1292, | |
| "mean_token_accuracy": 0.9555728912353516, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.9595765158806544, | |
| "grad_norm": 0.12052575933661766, | |
| "learning_rate": 1.3655892406800347e-05, | |
| "loss": 0.1331, | |
| "mean_token_accuracy": 0.9543036758899689, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 2.9643888354186716, | |
| "grad_norm": 0.12124225661711782, | |
| "learning_rate": 1.3581520745575368e-05, | |
| "loss": 0.1277, | |
| "mean_token_accuracy": 0.9562006831169129, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.9692011549566892, | |
| "grad_norm": 0.12324032197982851, | |
| "learning_rate": 1.3507394604815254e-05, | |
| "loss": 0.1287, | |
| "mean_token_accuracy": 0.9557781279087066, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 2.9740134744947064, | |
| "grad_norm": 0.1273950770018207, | |
| "learning_rate": 1.3433515292097081e-05, | |
| "loss": 0.1299, | |
| "mean_token_accuracy": 0.9553958296775817, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.9788257940327236, | |
| "grad_norm": 0.12124892063330928, | |
| "learning_rate": 1.3359884110643936e-05, | |
| "loss": 0.129, | |
| "mean_token_accuracy": 0.9557174980640412, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 2.9836381135707413, | |
| "grad_norm": 0.12359870287619534, | |
| "learning_rate": 1.3286502359301862e-05, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9559661984443665, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.9884504331087585, | |
| "grad_norm": 0.16559046313377282, | |
| "learning_rate": 1.3213371332516983e-05, | |
| "loss": 0.1297, | |
| "mean_token_accuracy": 0.9555854201316833, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 2.9932627526467757, | |
| "grad_norm": 0.11919465919446796, | |
| "learning_rate": 1.314049232031271e-05, | |
| "loss": 0.1301, | |
| "mean_token_accuracy": 0.9553444862365723, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.9980750721847933, | |
| "grad_norm": 0.1186023842064123, | |
| "learning_rate": 1.3067866608266898e-05, | |
| "loss": 0.1275, | |
| "mean_token_accuracy": 0.9560285627841949, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 3.0028873917228105, | |
| "grad_norm": 0.12434952847761321, | |
| "learning_rate": 1.2995495477489228e-05, | |
| "loss": 0.109, | |
| "mean_token_accuracy": 0.9625612318515777, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 3.0076997112608277, | |
| "grad_norm": 0.14706432465535094, | |
| "learning_rate": 1.2923380204598617e-05, | |
| "loss": 0.0931, | |
| "mean_token_accuracy": 0.9678871333599091, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 3.012512030798845, | |
| "grad_norm": 0.15409333659384583, | |
| "learning_rate": 1.2851522061700655e-05, | |
| "loss": 0.0937, | |
| "mean_token_accuracy": 0.9677077949047088, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 3.0173243503368625, | |
| "grad_norm": 0.1312873405063952, | |
| "learning_rate": 1.2779922316365172e-05, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.9688529968261719, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 3.0221366698748797, | |
| "grad_norm": 0.14917215599996234, | |
| "learning_rate": 1.2708582231603939e-05, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9682607412338257, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 3.026948989412897, | |
| "grad_norm": 0.13842392334899215, | |
| "learning_rate": 1.26375030658483e-05, | |
| "loss": 0.0932, | |
| "mean_token_accuracy": 0.9676888287067413, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 3.0317613089509146, | |
| "grad_norm": 0.13785720750369992, | |
| "learning_rate": 1.256668607292704e-05, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9686121463775634, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 3.0365736284889318, | |
| "grad_norm": 0.14343146440304305, | |
| "learning_rate": 1.2496132502044227e-05, | |
| "loss": 0.0938, | |
| "mean_token_accuracy": 0.9678133964538574, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 3.041385948026949, | |
| "grad_norm": 0.1289873593840911, | |
| "learning_rate": 1.24258435977572e-05, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9683366477489471, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 3.046198267564966, | |
| "grad_norm": 0.1375301741023587, | |
| "learning_rate": 1.235582059995462e-05, | |
| "loss": 0.15, | |
| "mean_token_accuracy": 0.9617392539978027, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 3.051010587102984, | |
| "grad_norm": 0.17873391511676393, | |
| "learning_rate": 1.228606474383457e-05, | |
| "loss": 0.0911, | |
| "mean_token_accuracy": 0.9686980247497559, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 3.055822906641001, | |
| "grad_norm": 0.13437179954892056, | |
| "learning_rate": 1.221657725988278e-05, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9687510788440704, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 3.060635226179018, | |
| "grad_norm": 0.132715658708249, | |
| "learning_rate": 1.2147359373850947e-05, | |
| "loss": 0.0933, | |
| "mean_token_accuracy": 0.9676034927368165, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 3.065447545717036, | |
| "grad_norm": 0.1363781033463036, | |
| "learning_rate": 1.2078412306735071e-05, | |
| "loss": 0.0929, | |
| "mean_token_accuracy": 0.9677293360233307, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 3.070259865255053, | |
| "grad_norm": 0.18327206646265473, | |
| "learning_rate": 1.2009737274753931e-05, | |
| "loss": 0.0909, | |
| "mean_token_accuracy": 0.968484491109848, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 3.07507218479307, | |
| "grad_norm": 0.13465891639972638, | |
| "learning_rate": 1.1941335489327658e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9680214643478393, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 3.0798845043310874, | |
| "grad_norm": 0.13627149330922256, | |
| "learning_rate": 1.1873208157056323e-05, | |
| "loss": 0.0923, | |
| "mean_token_accuracy": 0.9681505382061004, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.084696823869105, | |
| "grad_norm": 0.13228454803325826, | |
| "learning_rate": 1.1805356479698673e-05, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9687126517295838, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 3.0895091434071222, | |
| "grad_norm": 0.12925810695025527, | |
| "learning_rate": 1.1737781654150954e-05, | |
| "loss": 0.0888, | |
| "mean_token_accuracy": 0.9692953944206237, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 3.0943214629451394, | |
| "grad_norm": 0.13298345353337704, | |
| "learning_rate": 1.1670484872425757e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9681942939758301, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 3.099133782483157, | |
| "grad_norm": 0.13079572079571744, | |
| "learning_rate": 1.1603467321631007e-05, | |
| "loss": 0.0897, | |
| "mean_token_accuracy": 0.9688230633735657, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 3.1039461020211743, | |
| "grad_norm": 0.1396780678443276, | |
| "learning_rate": 1.1536730183949042e-05, | |
| "loss": 0.094, | |
| "mean_token_accuracy": 0.9673632800579071, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 3.1087584215591915, | |
| "grad_norm": 0.13356595168748497, | |
| "learning_rate": 1.147027463661573e-05, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.9683010756969452, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 3.1135707410972087, | |
| "grad_norm": 0.13083812488404448, | |
| "learning_rate": 1.1404101851899715e-05, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.9681405127048492, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 3.1183830606352263, | |
| "grad_norm": 0.13408282803700197, | |
| "learning_rate": 1.1338212997081758e-05, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9686260998249054, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 3.1231953801732435, | |
| "grad_norm": 0.1328576682054844, | |
| "learning_rate": 1.1272609234434107e-05, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9677883803844451, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 3.1280076997112607, | |
| "grad_norm": 0.13708789631366744, | |
| "learning_rate": 1.1207291721200013e-05, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9682895362377166, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 3.1328200192492783, | |
| "grad_norm": 0.13495765462602108, | |
| "learning_rate": 1.1142261609573349e-05, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9683321177959442, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 3.1376323387872955, | |
| "grad_norm": 0.13393491094816032, | |
| "learning_rate": 1.1077520046678202e-05, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9682754933834076, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 3.1424446583253127, | |
| "grad_norm": 0.13427081636155144, | |
| "learning_rate": 1.1013068174548749e-05, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9681111812591553, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 3.14725697786333, | |
| "grad_norm": 0.13751924031532214, | |
| "learning_rate": 1.0948907130109013e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9682234287261963, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 3.1520692974013476, | |
| "grad_norm": 0.13692106248674146, | |
| "learning_rate": 1.0885038045152857e-05, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9683238744735718, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 3.1568816169393648, | |
| "grad_norm": 0.13616884278766314, | |
| "learning_rate": 1.0821462046324024e-05, | |
| "loss": 0.0922, | |
| "mean_token_accuracy": 0.9678296506404876, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 3.161693936477382, | |
| "grad_norm": 0.14300439959195926, | |
| "learning_rate": 1.0758180255096239e-05, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9688584566116333, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 3.1665062560153996, | |
| "grad_norm": 0.13656339919755447, | |
| "learning_rate": 1.069519378775343e-05, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9679355382919311, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 3.171318575553417, | |
| "grad_norm": 0.13464780469940738, | |
| "learning_rate": 1.0632503755370057e-05, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.9687452852725983, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 3.176130895091434, | |
| "grad_norm": 0.13764356429072314, | |
| "learning_rate": 1.0570111263791497e-05, | |
| "loss": 0.0918, | |
| "mean_token_accuracy": 0.9681958973407745, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.180943214629451, | |
| "grad_norm": 0.13967572646701193, | |
| "learning_rate": 1.0508017413614524e-05, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686243951320648, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 3.185755534167469, | |
| "grad_norm": 0.13723543857891365, | |
| "learning_rate": 1.0446223300167937e-05, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9686541378498077, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 3.190567853705486, | |
| "grad_norm": 0.13473698825726957, | |
| "learning_rate": 1.0384730013493189e-05, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9682977855205536, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 3.195380173243503, | |
| "grad_norm": 0.13892047356535198, | |
| "learning_rate": 1.0323538638325184e-05, | |
| "loss": 0.0927, | |
| "mean_token_accuracy": 0.967754465341568, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 3.200192492781521, | |
| "grad_norm": 0.1365119482916851, | |
| "learning_rate": 1.0262650254073156e-05, | |
| "loss": 0.0909, | |
| "mean_token_accuracy": 0.9685104787349701, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 3.205004812319538, | |
| "grad_norm": 0.13540915330796455, | |
| "learning_rate": 1.02020659348016e-05, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9688210964202881, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 3.2098171318575552, | |
| "grad_norm": 0.13629962837781337, | |
| "learning_rate": 1.0141786749211325e-05, | |
| "loss": 0.0911, | |
| "mean_token_accuracy": 0.9684777975082397, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 3.214629451395573, | |
| "grad_norm": 0.1471752938559655, | |
| "learning_rate": 1.0081813760620646e-05, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.9682195365428925, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 3.21944177093359, | |
| "grad_norm": 0.13303987478993942, | |
| "learning_rate": 1.002214802694657e-05, | |
| "loss": 0.0915, | |
| "mean_token_accuracy": 0.9684331357479096, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 3.2242540904716073, | |
| "grad_norm": 0.14347460038561782, | |
| "learning_rate": 9.962790600686167e-06, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9682085990905762, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 3.2290664100096245, | |
| "grad_norm": 0.13333845775714026, | |
| "learning_rate": 9.90374252889801e-06, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.968821543455124, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 3.233878729547642, | |
| "grad_norm": 0.13568644460994048, | |
| "learning_rate": 9.845004853183676e-06, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.9684585392475128, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 3.2386910490856593, | |
| "grad_norm": 0.13284650087254843, | |
| "learning_rate": 9.7865786096694e-06, | |
| "loss": 0.0886, | |
| "mean_token_accuracy": 0.9692508637905121, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 3.2435033686236765, | |
| "grad_norm": 0.13621426057647495, | |
| "learning_rate": 9.728464828987776e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686996936798096, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 3.248315688161694, | |
| "grad_norm": 0.13646334499064428, | |
| "learning_rate": 9.67066453625959e-06, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.968849265575409, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 3.2531280076997113, | |
| "grad_norm": 0.13624746746959862, | |
| "learning_rate": 9.613178751075752e-06, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.968310970067978, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 3.2579403272377285, | |
| "grad_norm": 0.1395356762449855, | |
| "learning_rate": 9.556008487479274e-06, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9682559728622436, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 3.2627526467757457, | |
| "grad_norm": 0.13382347293173258, | |
| "learning_rate": 9.499154753947397e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9691741287708282, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 3.2675649663137634, | |
| "grad_norm": 0.135644673574654, | |
| "learning_rate": 9.442618553373834e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.968697601556778, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 3.2723772858517806, | |
| "grad_norm": 0.13907697854966813, | |
| "learning_rate": 9.38640088305102e-06, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9683789730072021, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.2771896053897978, | |
| "grad_norm": 0.1346504519525067, | |
| "learning_rate": 9.33050273465256e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9683473885059357, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 3.2820019249278154, | |
| "grad_norm": 0.134340725161769, | |
| "learning_rate": 9.274925094215747e-06, | |
| "loss": 0.0907, | |
| "mean_token_accuracy": 0.968653804063797, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 3.2868142444658326, | |
| "grad_norm": 0.13948117931371806, | |
| "learning_rate": 9.219668942124124e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9683266997337341, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 3.29162656400385, | |
| "grad_norm": 0.13169986506984857, | |
| "learning_rate": 9.164735253090212e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9685522735118866, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 3.2964388835418674, | |
| "grad_norm": 0.13358211631944877, | |
| "learning_rate": 9.110124996138344e-06, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9687002718448638, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 3.3012512030798846, | |
| "grad_norm": 0.13326680593802695, | |
| "learning_rate": 9.055839134587527e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9689774870872497, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 3.306063522617902, | |
| "grad_norm": 0.12975003937153856, | |
| "learning_rate": 9.001878626034466e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9689989626407624, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 3.310875842155919, | |
| "grad_norm": 0.13463975701309033, | |
| "learning_rate": 8.948244422336691e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9687969565391541, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 3.3156881616939367, | |
| "grad_norm": 0.13544902263064948, | |
| "learning_rate": 8.894937469595733e-06, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9683208525180816, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 3.320500481231954, | |
| "grad_norm": 0.1339648265704724, | |
| "learning_rate": 8.841958708140458e-06, | |
| "loss": 0.0924, | |
| "mean_token_accuracy": 0.9678017616271972, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 3.325312800769971, | |
| "grad_norm": 0.13821582032724328, | |
| "learning_rate": 8.789309072510478e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9691683113574981, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 3.3301251203079882, | |
| "grad_norm": 0.1310796417774881, | |
| "learning_rate": 8.736989491439655e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9686806917190551, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 3.334937439846006, | |
| "grad_norm": 0.1369032777357672, | |
| "learning_rate": 8.685000887839728e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9683325886726379, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 3.339749759384023, | |
| "grad_norm": 0.13739882212292243, | |
| "learning_rate": 8.633344178784021e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9685930073261261, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 3.3445620789220403, | |
| "grad_norm": 0.14034358070622463, | |
| "learning_rate": 8.58202027549128e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.968478548526764, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 3.349374398460058, | |
| "grad_norm": 0.13191072297755266, | |
| "learning_rate": 8.531030083309604e-06, | |
| "loss": 0.0897, | |
| "mean_token_accuracy": 0.9689924597740174, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 3.354186717998075, | |
| "grad_norm": 0.1408296781588095, | |
| "learning_rate": 8.480374501700447e-06, | |
| "loss": 0.0919, | |
| "mean_token_accuracy": 0.9680519282817841, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 3.3589990375360923, | |
| "grad_norm": 0.1350070826906196, | |
| "learning_rate": 8.430054424222775e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9688198208808899, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 3.36381135707411, | |
| "grad_norm": 0.1413619568327864, | |
| "learning_rate": 8.380070738517304e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686203837394715, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 3.368623676612127, | |
| "grad_norm": 0.13260205437674083, | |
| "learning_rate": 8.330424326290828e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9688274085521698, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.3734359961501443, | |
| "grad_norm": 0.143913106272494, | |
| "learning_rate": 8.281116063300668e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9688499093055725, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 3.3782483156881615, | |
| "grad_norm": 0.1394116426687455, | |
| "learning_rate": 8.23214681933925e-06, | |
| "loss": 0.0909, | |
| "mean_token_accuracy": 0.9682900190353394, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 3.383060635226179, | |
| "grad_norm": 0.1357305045404523, | |
| "learning_rate": 8.18351745821872e-06, | |
| "loss": 0.0931, | |
| "mean_token_accuracy": 0.9677324056625366, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 3.3878729547641964, | |
| "grad_norm": 0.1387599656151157, | |
| "learning_rate": 8.135228837755729e-06, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.968377536535263, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 3.3926852743022136, | |
| "grad_norm": 0.12956244653090163, | |
| "learning_rate": 8.087281809756324e-06, | |
| "loss": 0.0888, | |
| "mean_token_accuracy": 0.9691174328327179, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 3.3974975938402308, | |
| "grad_norm": 0.14002846207915792, | |
| "learning_rate": 8.039677220000863e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9685690402984619, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 3.4023099133782484, | |
| "grad_norm": 0.13948876655094175, | |
| "learning_rate": 7.992415908229153e-06, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9679752647876739, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 3.4071222329162656, | |
| "grad_norm": 0.13331743167738266, | |
| "learning_rate": 7.945498708125612e-06, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9688387513160706, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 3.411934552454283, | |
| "grad_norm": 0.13639449654556354, | |
| "learning_rate": 7.898926447304563e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9682780504226685, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 3.4167468719923004, | |
| "grad_norm": 0.13339582179589207, | |
| "learning_rate": 7.852699947295628e-06, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.9684050261974335, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 3.4215591915303176, | |
| "grad_norm": 0.14296042802518497, | |
| "learning_rate": 7.806820023529265e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.968423455953598, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 3.426371511068335, | |
| "grad_norm": 0.13825497930396133, | |
| "learning_rate": 7.761287485322353e-06, | |
| "loss": 0.0889, | |
| "mean_token_accuracy": 0.9692943871021271, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 3.4311838306063525, | |
| "grad_norm": 0.1393461728825337, | |
| "learning_rate": 7.716103135863928e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9693212032318115, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 3.4359961501443697, | |
| "grad_norm": 0.13468833492353335, | |
| "learning_rate": 7.67126777220101e-06, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9695115029811859, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 3.440808469682387, | |
| "grad_norm": 0.13665504829234054, | |
| "learning_rate": 7.626782185224558e-06, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9684857487678528, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 3.445620789220404, | |
| "grad_norm": 0.13096296052633197, | |
| "learning_rate": 7.582647159655494e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9691302180290222, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 3.4504331087584217, | |
| "grad_norm": 0.13521248818905815, | |
| "learning_rate": 7.538863474030898e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9686833798885346, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 3.455245428296439, | |
| "grad_norm": 0.1370495626790798, | |
| "learning_rate": 7.495431900690224e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.96921107172966, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 3.460057747834456, | |
| "grad_norm": 0.1364263767538315, | |
| "learning_rate": 7.452353205761725e-06, | |
| "loss": 0.091, | |
| "mean_token_accuracy": 0.9683542311191559, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 3.4648700673724737, | |
| "grad_norm": 0.13991109366982188, | |
| "learning_rate": 7.409628149148906e-06, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9695057034492492, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.469682386910491, | |
| "grad_norm": 0.1361528981450403, | |
| "learning_rate": 7.367257484517127e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9688239395618439, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 3.474494706448508, | |
| "grad_norm": 0.14039390393058748, | |
| "learning_rate": 7.325241959280328e-06, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9691827893257141, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 3.4793070259865253, | |
| "grad_norm": 0.13496727455761756, | |
| "learning_rate": 7.283582314587814e-06, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9682125985622406, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 3.484119345524543, | |
| "grad_norm": 0.1327869707380875, | |
| "learning_rate": 7.242279285311196e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9686414361000061, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 3.48893166506256, | |
| "grad_norm": 0.13667811803270496, | |
| "learning_rate": 7.2013336000314375e-06, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9694166958332062, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 3.4937439846005773, | |
| "grad_norm": 0.142568787704858, | |
| "learning_rate": 7.160745981025986e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686079502105713, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 3.498556304138595, | |
| "grad_norm": 0.13484002904977735, | |
| "learning_rate": 7.120517144256036e-06, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.968810212612152, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 3.503368623676612, | |
| "grad_norm": 0.13501489720823745, | |
| "learning_rate": 7.080647799353912e-06, | |
| "loss": 0.0913, | |
| "mean_token_accuracy": 0.9684961140155792, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 3.5081809432146294, | |
| "grad_norm": 0.13860068018792182, | |
| "learning_rate": 7.041138649610532e-06, | |
| "loss": 0.0887, | |
| "mean_token_accuracy": 0.9693265676498413, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 3.512993262752647, | |
| "grad_norm": 0.1377188663141405, | |
| "learning_rate": 7.001990391963011e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9683993935585022, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 3.517805582290664, | |
| "grad_norm": 0.13399081463747997, | |
| "learning_rate": 6.963203716982375e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9685521602630616, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 3.5226179018286814, | |
| "grad_norm": 0.13854624455921152, | |
| "learning_rate": 6.924779308861361e-06, | |
| "loss": 0.0897, | |
| "mean_token_accuracy": 0.9688117682933808, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 3.5274302213666986, | |
| "grad_norm": 0.14148875052679258, | |
| "learning_rate": 6.886717845402358e-06, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9681609988212585, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 3.5322425409047162, | |
| "grad_norm": 0.13471570321431886, | |
| "learning_rate": 6.849019998005471e-06, | |
| "loss": 0.0928, | |
| "mean_token_accuracy": 0.96789670586586, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 3.5370548604427334, | |
| "grad_norm": 0.13838875237047302, | |
| "learning_rate": 6.811686431656621e-06, | |
| "loss": 0.0909, | |
| "mean_token_accuracy": 0.9685190081596374, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 3.5418671799807506, | |
| "grad_norm": 0.13505140342109126, | |
| "learning_rate": 6.774717804915876e-06, | |
| "loss": 0.0906, | |
| "mean_token_accuracy": 0.9685005247592926, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 3.546679499518768, | |
| "grad_norm": 0.14859908671158864, | |
| "learning_rate": 6.738114769905806e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9690281331539154, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 3.5514918190567855, | |
| "grad_norm": 0.13916249581778348, | |
| "learning_rate": 6.70187797229998e-06, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9683689653873444, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 3.5563041385948027, | |
| "grad_norm": 0.1375088431602144, | |
| "learning_rate": 6.666008051311573e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9686783015727997, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 3.56111645813282, | |
| "grad_norm": 0.13836813431335634, | |
| "learning_rate": 6.63050563968211e-06, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.968965369462967, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.5659287776708375, | |
| "grad_norm": 0.1360083978726538, | |
| "learning_rate": 6.59537136367028e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9689577162265778, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 3.5707410972088547, | |
| "grad_norm": 0.1364252051726429, | |
| "learning_rate": 6.560605843040896e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9684828460216522, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 3.575553416746872, | |
| "grad_norm": 0.13753479765396984, | |
| "learning_rate": 6.526209691053982e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9685666382312774, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 3.5803657362848895, | |
| "grad_norm": 0.1399839620065608, | |
| "learning_rate": 6.492183514453923e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9690784811973572, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 3.5851780558229067, | |
| "grad_norm": 0.1367698935543579, | |
| "learning_rate": 6.458527913458785e-06, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.9688356578350067, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 3.589990375360924, | |
| "grad_norm": 0.14113648414137153, | |
| "learning_rate": 6.425243481749724e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9691427707672119, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 3.5948026948989416, | |
| "grad_norm": 0.1427482144634009, | |
| "learning_rate": 6.392330806460499e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9689215421676636, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 3.5996150144369587, | |
| "grad_norm": 0.1355943563608752, | |
| "learning_rate": 6.359790468167145e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9685070931911468, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 3.604427333974976, | |
| "grad_norm": 0.13456315333584198, | |
| "learning_rate": 6.327623040877694e-06, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9689961552619935, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 3.609239653512993, | |
| "grad_norm": 0.135895962451549, | |
| "learning_rate": 6.295829092022071e-06, | |
| "loss": 0.0908, | |
| "mean_token_accuracy": 0.9683876216411591, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.6140519730510103, | |
| "grad_norm": 0.13369924830536217, | |
| "learning_rate": 6.264409182442095e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9689429521560669, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 3.618864292589028, | |
| "grad_norm": 0.1443969878995605, | |
| "learning_rate": 6.233363866381562e-06, | |
| "loss": 0.0886, | |
| "mean_token_accuracy": 0.9694486260414124, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 3.623676612127045, | |
| "grad_norm": 0.1374835632502456, | |
| "learning_rate": 6.202693691476475e-06, | |
| "loss": 0.0889, | |
| "mean_token_accuracy": 0.9690564334392547, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 3.6284889316650624, | |
| "grad_norm": 0.1322486595622557, | |
| "learning_rate": 6.172399198745402e-06, | |
| "loss": 0.0875, | |
| "mean_token_accuracy": 0.9697753429412842, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 3.63330125120308, | |
| "grad_norm": 0.1282503297553987, | |
| "learning_rate": 6.14248092257991e-06, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.9694970846176147, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 3.638113570741097, | |
| "grad_norm": 0.13776363066804362, | |
| "learning_rate": 6.112939390735136e-06, | |
| "loss": 0.0917, | |
| "mean_token_accuracy": 0.9680108070373535, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 3.6429258902791144, | |
| "grad_norm": 0.13564641557278212, | |
| "learning_rate": 6.083775124320508e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9691525280475617, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 3.647738209817132, | |
| "grad_norm": 0.13179746986006852, | |
| "learning_rate": 6.0549886377905196e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9690011382102967, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 3.6525505293551492, | |
| "grad_norm": 0.14091565358006583, | |
| "learning_rate": 6.026580438935671e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.969417268037796, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 3.6573628488931664, | |
| "grad_norm": 0.14910262532320515, | |
| "learning_rate": 5.9985510288735166e-06, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9689526975154876, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.662175168431184, | |
| "grad_norm": 0.13611419175627348, | |
| "learning_rate": 5.970900902039801e-06, | |
| "loss": 0.0881, | |
| "mean_token_accuracy": 0.9694447040557861, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 3.6669874879692013, | |
| "grad_norm": 0.1332456259673305, | |
| "learning_rate": 5.94363054617977e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9692049920558929, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 3.6717998075072185, | |
| "grad_norm": 0.13901127023379464, | |
| "learning_rate": 5.91674044233954e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9688822150230407, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 3.6766121270452357, | |
| "grad_norm": 0.14140074770998537, | |
| "learning_rate": 5.8902310648576335e-06, | |
| "loss": 0.0879, | |
| "mean_token_accuracy": 0.9696286439895629, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 3.681424446583253, | |
| "grad_norm": 0.13641153863848185, | |
| "learning_rate": 5.8641028813565865e-06, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.9690168917179107, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 3.6862367661212705, | |
| "grad_norm": 0.1374018549361659, | |
| "learning_rate": 5.838356352734728e-06, | |
| "loss": 0.0922, | |
| "mean_token_accuracy": 0.9680605947971344, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 3.6910490856592877, | |
| "grad_norm": 0.1370134651080559, | |
| "learning_rate": 5.812991933158031e-06, | |
| "loss": 0.0875, | |
| "mean_token_accuracy": 0.9695887923240661, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 3.695861405197305, | |
| "grad_norm": 0.1377411356872754, | |
| "learning_rate": 5.788010070052104e-06, | |
| "loss": 0.0912, | |
| "mean_token_accuracy": 0.9684119820594788, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 3.7006737247353225, | |
| "grad_norm": 0.14349732442513227, | |
| "learning_rate": 5.763411204094308e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9688425540924073, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 3.7054860442733397, | |
| "grad_norm": 0.13265495672692082, | |
| "learning_rate": 5.739195769205967e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9691309094429016, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 3.710298363811357, | |
| "grad_norm": 0.14600493809544218, | |
| "learning_rate": 5.715364192544725e-06, | |
| "loss": 0.0883, | |
| "mean_token_accuracy": 0.9695326447486877, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 3.7151106833493746, | |
| "grad_norm": 0.13875356655043747, | |
| "learning_rate": 5.691916894497016e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9689690589904785, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 3.7199230028873917, | |
| "grad_norm": 0.13496075423672457, | |
| "learning_rate": 5.668854288670632e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9691781044006348, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 3.724735322425409, | |
| "grad_norm": 0.13839694515743828, | |
| "learning_rate": 5.646176781887437e-06, | |
| "loss": 0.0902, | |
| "mean_token_accuracy": 0.9687245488166809, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 3.7295476419634266, | |
| "grad_norm": 0.13977918074193177, | |
| "learning_rate": 5.6238847741761995e-06, | |
| "loss": 0.0881, | |
| "mean_token_accuracy": 0.9694719612598419, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 3.734359961501444, | |
| "grad_norm": 0.13700290555555267, | |
| "learning_rate": 5.6019786587655105e-06, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9687611639499665, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 3.739172281039461, | |
| "grad_norm": 0.1384415272354067, | |
| "learning_rate": 5.580458822076873e-06, | |
| "loss": 0.0886, | |
| "mean_token_accuracy": 0.96945241689682, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 3.7439846005774786, | |
| "grad_norm": 0.13429678265470868, | |
| "learning_rate": 5.559325643717874e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.968583631515503, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 3.748796920115496, | |
| "grad_norm": 0.14030339353124033, | |
| "learning_rate": 5.538579496475484e-06, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9686710894107818, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 3.753609239653513, | |
| "grad_norm": 0.13991681878390033, | |
| "learning_rate": 5.518220746309499e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9689141511917114, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.75842155919153, | |
| "grad_norm": 0.13228798810082415, | |
| "learning_rate": 5.498249752346055e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9687845289707184, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 3.7632338787295474, | |
| "grad_norm": 0.13555462975026847, | |
| "learning_rate": 5.4786668668713255e-06, | |
| "loss": 0.0884, | |
| "mean_token_accuracy": 0.9694622159004211, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 3.768046198267565, | |
| "grad_norm": 0.13834369747634273, | |
| "learning_rate": 5.459472435325288e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9688325703144074, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 3.7728585178055822, | |
| "grad_norm": 0.1362996900602539, | |
| "learning_rate": 5.440666796295631e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.969083023071289, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 3.7776708373435994, | |
| "grad_norm": 0.1396964932833639, | |
| "learning_rate": 5.422250281511786e-06, | |
| "loss": 0.0887, | |
| "mean_token_accuracy": 0.969150710105896, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 3.782483156881617, | |
| "grad_norm": 0.13823240172068574, | |
| "learning_rate": 5.404223215839082e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9690535187721252, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 3.7872954764196343, | |
| "grad_norm": 0.1349286852059626, | |
| "learning_rate": 5.386585917273001e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686072170734406, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 3.7921077959576515, | |
| "grad_norm": 0.13601502138614252, | |
| "learning_rate": 5.3693386969335745e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9689940690994263, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 3.796920115495669, | |
| "grad_norm": 0.13764427328960968, | |
| "learning_rate": 5.352481859059902e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9688514411449433, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 3.8017324350336863, | |
| "grad_norm": 0.1378012250050132, | |
| "learning_rate": 5.336015701004775e-06, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9688797056674957, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 3.8065447545717035, | |
| "grad_norm": 0.13740906428497687, | |
| "learning_rate": 5.3199405132294345e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.968880695104599, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 3.811357074109721, | |
| "grad_norm": 0.13609547289970403, | |
| "learning_rate": 5.304256579298454e-06, | |
| "loss": 0.0884, | |
| "mean_token_accuracy": 0.9692340552806854, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 3.8161693936477383, | |
| "grad_norm": 0.1341601536740279, | |
| "learning_rate": 5.288964175874724e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9690073788166046, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 3.8209817131857555, | |
| "grad_norm": 0.13789279249489372, | |
| "learning_rate": 5.274063572714582e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9693139493465424, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 3.8257940327237727, | |
| "grad_norm": 0.13731394210460737, | |
| "learning_rate": 5.2595550326630565e-06, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9687747836112977, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 3.83060635226179, | |
| "grad_norm": 0.1359478343857469, | |
| "learning_rate": 5.245438811649216e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9691655397415161, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 3.8354186717998076, | |
| "grad_norm": 0.13425270492448074, | |
| "learning_rate": 5.231715158681672e-06, | |
| "loss": 0.0881, | |
| "mean_token_accuracy": 0.9695821583271027, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 3.8402309913378248, | |
| "grad_norm": 0.13943260615825612, | |
| "learning_rate": 5.218384315844173e-06, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9689464092254638, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 3.845043310875842, | |
| "grad_norm": 0.13863014654325273, | |
| "learning_rate": 5.205446518291341e-06, | |
| "loss": 0.0903, | |
| "mean_token_accuracy": 0.9687318921089172, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 3.8498556304138596, | |
| "grad_norm": 0.14119475124085532, | |
| "learning_rate": 5.1929019942445224e-06, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9691673576831817, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.854667949951877, | |
| "grad_norm": 0.13803282143201645, | |
| "learning_rate": 5.180750964987762e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9693030953407288, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 3.859480269489894, | |
| "grad_norm": 0.134817242606034, | |
| "learning_rate": 5.1689936448638984e-06, | |
| "loss": 0.0896, | |
| "mean_token_accuracy": 0.9689365267753601, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 3.8642925890279116, | |
| "grad_norm": 0.13539207671811226, | |
| "learning_rate": 5.1576302412707815e-06, | |
| "loss": 0.0891, | |
| "mean_token_accuracy": 0.9692870140075683, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 3.869104908565929, | |
| "grad_norm": 0.13940674506199696, | |
| "learning_rate": 5.146660954657621e-06, | |
| "loss": 0.0874, | |
| "mean_token_accuracy": 0.9696322739124298, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 3.873917228103946, | |
| "grad_norm": 0.13748538901705848, | |
| "learning_rate": 5.1360859785214415e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9693687736988068, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 3.8787295476419636, | |
| "grad_norm": 0.13614377849694345, | |
| "learning_rate": 5.125905499403678e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9691945254802704, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 3.883541867179981, | |
| "grad_norm": 0.1338741619132659, | |
| "learning_rate": 5.116119696886876e-06, | |
| "loss": 0.0894, | |
| "mean_token_accuracy": 0.9690902769565582, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 3.888354186717998, | |
| "grad_norm": 0.13574488027975073, | |
| "learning_rate": 5.106728743591529e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9686701238155365, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 3.8931665062560152, | |
| "grad_norm": 0.13848318609403668, | |
| "learning_rate": 5.097732805173042e-06, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9692742109298706, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 3.897978825794033, | |
| "grad_norm": 0.13586746043022693, | |
| "learning_rate": 5.089132040318785e-06, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9688207983970643, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.90279114533205, | |
| "grad_norm": 0.1374908289293142, | |
| "learning_rate": 5.080926600745323e-06, | |
| "loss": 0.0873, | |
| "mean_token_accuracy": 0.9698093652725219, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 3.9076034648700673, | |
| "grad_norm": 0.1355725331395753, | |
| "learning_rate": 5.073116631195715e-06, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.9695097863674164, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 3.9124157844080845, | |
| "grad_norm": 0.140761419405981, | |
| "learning_rate": 5.0657022694369844e-06, | |
| "loss": 0.0905, | |
| "mean_token_accuracy": 0.9686882495880127, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 3.917228103946102, | |
| "grad_norm": 0.13901638419636575, | |
| "learning_rate": 5.058683646257663e-06, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.969620281457901, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 3.9220404234841193, | |
| "grad_norm": 0.13784564869908433, | |
| "learning_rate": 5.052060885465503e-06, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9691882967948914, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 3.9268527430221365, | |
| "grad_norm": 0.13840132801395807, | |
| "learning_rate": 5.045834103885289e-06, | |
| "loss": 0.0895, | |
| "mean_token_accuracy": 0.9690020859241486, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 3.931665062560154, | |
| "grad_norm": 0.13764463918702646, | |
| "learning_rate": 5.040003411356773e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9692375659942627, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 3.9364773820981713, | |
| "grad_norm": 0.1322073791746964, | |
| "learning_rate": 5.034568910732737e-06, | |
| "loss": 0.0899, | |
| "mean_token_accuracy": 0.9688105344772339, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 3.9412897016361885, | |
| "grad_norm": 0.14020475905571375, | |
| "learning_rate": 5.029530697877181e-06, | |
| "loss": 0.089, | |
| "mean_token_accuracy": 0.9692331671714782, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 3.946102021174206, | |
| "grad_norm": 0.14170916726691946, | |
| "learning_rate": 5.02488886166364e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9686473906040192, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.9509143407122234, | |
| "grad_norm": 0.131660053900958, | |
| "learning_rate": 5.020643483973598e-06, | |
| "loss": 0.0875, | |
| "mean_token_accuracy": 0.969598114490509, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 3.9557266602502406, | |
| "grad_norm": 0.14500516603141905, | |
| "learning_rate": 5.016794639695054e-06, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9688828349113464, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 3.960538979788258, | |
| "grad_norm": 0.1415408119884466, | |
| "learning_rate": 5.013342396721207e-06, | |
| "loss": 0.0883, | |
| "mean_token_accuracy": 0.9692238152027131, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 3.9653512993262754, | |
| "grad_norm": 0.13860286948706352, | |
| "learning_rate": 5.010286815949247e-06, | |
| "loss": 0.0914, | |
| "mean_token_accuracy": 0.9684812486171722, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.9701636188642926, | |
| "grad_norm": 0.13816538102802578, | |
| "learning_rate": 5.007627951279292e-06, | |
| "loss": 0.0901, | |
| "mean_token_accuracy": 0.9687812924385071, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 3.97497593840231, | |
| "grad_norm": 0.13608368363986517, | |
| "learning_rate": 5.00536584961342e-06, | |
| "loss": 0.0898, | |
| "mean_token_accuracy": 0.9689698159694672, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 3.979788257940327, | |
| "grad_norm": 0.13489390360358144, | |
| "learning_rate": 5.003500550854863e-06, | |
| "loss": 0.0873, | |
| "mean_token_accuracy": 0.9698358178138733, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 3.9846005774783446, | |
| "grad_norm": 0.1399827539100902, | |
| "learning_rate": 5.00203208790729e-06, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9687283575534821, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.989412897016362, | |
| "grad_norm": 0.13697439177706988, | |
| "learning_rate": 5.000960486674224e-06, | |
| "loss": 0.0873, | |
| "mean_token_accuracy": 0.9697397172451019, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 3.994225216554379, | |
| "grad_norm": 0.1395836169277757, | |
| "learning_rate": 5.0002857660585965e-06, | |
| "loss": 0.0882, | |
| "mean_token_accuracy": 0.9696287274360657, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 3.9990375360923966, | |
| "grad_norm": 0.13828729027376532, | |
| "learning_rate": 5.000007937962408e-06, | |
| "loss": 0.0889, | |
| "mean_token_accuracy": 0.969226461648941, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "mean_token_accuracy": 0.9694786071777344, | |
| "step": 4156, | |
| "total_flos": 2162610407735296.0, | |
| "train_loss": 0.15963262656956367, | |
| "train_runtime": 23750.3378, | |
| "train_samples_per_second": 2.799, | |
| "train_steps_per_second": 0.175 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 4156, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2162610407735296.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |