| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.992481203007519, | |
| "eval_steps": 500, | |
| "global_step": 1860, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.010741138560687433, | |
| "grad_norm": 2.6824158480436027, | |
| "learning_rate": 1.3440860215053765e-06, | |
| "loss": 0.8294, | |
| "mean_token_accuracy": 0.8010891914367676, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.021482277121374866, | |
| "grad_norm": 1.0834186154450132, | |
| "learning_rate": 2.688172043010753e-06, | |
| "loss": 0.7976, | |
| "mean_token_accuracy": 0.8042729198932648, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0322234156820623, | |
| "grad_norm": 0.9912101287518572, | |
| "learning_rate": 4.032258064516129e-06, | |
| "loss": 0.7318, | |
| "mean_token_accuracy": 0.8116350173950195, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.04296455424274973, | |
| "grad_norm": 0.6161547535500218, | |
| "learning_rate": 5.376344086021506e-06, | |
| "loss": 0.6796, | |
| "mean_token_accuracy": 0.8214974880218506, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05370569280343716, | |
| "grad_norm": 0.4711983922639431, | |
| "learning_rate": 6.720430107526882e-06, | |
| "loss": 0.6403, | |
| "mean_token_accuracy": 0.8289329469203949, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0644468313641246, | |
| "grad_norm": 0.3514340436445771, | |
| "learning_rate": 8.064516129032258e-06, | |
| "loss": 0.6101, | |
| "mean_token_accuracy": 0.8344561219215393, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07518796992481203, | |
| "grad_norm": 0.2900558861500113, | |
| "learning_rate": 9.408602150537635e-06, | |
| "loss": 0.5849, | |
| "mean_token_accuracy": 0.8396502792835235, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.08592910848549946, | |
| "grad_norm": 0.2722947727971047, | |
| "learning_rate": 1.0752688172043012e-05, | |
| "loss": 0.5701, | |
| "mean_token_accuracy": 0.8420377433300018, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0966702470461869, | |
| "grad_norm": 0.25248070544882645, | |
| "learning_rate": 1.2096774193548388e-05, | |
| "loss": 0.561, | |
| "mean_token_accuracy": 0.8443691551685333, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.10741138560687433, | |
| "grad_norm": 0.2504332745775819, | |
| "learning_rate": 1.3440860215053763e-05, | |
| "loss": 0.5601, | |
| "mean_token_accuracy": 0.8441641569137573, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.11815252416756176, | |
| "grad_norm": 0.21685484456472007, | |
| "learning_rate": 1.4784946236559142e-05, | |
| "loss": 0.5455, | |
| "mean_token_accuracy": 0.8471231937408448, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.1288936627282492, | |
| "grad_norm": 0.23513981149675298, | |
| "learning_rate": 1.6129032258064517e-05, | |
| "loss": 0.5486, | |
| "mean_token_accuracy": 0.8462919056415558, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.13963480128893663, | |
| "grad_norm": 0.21971215723488632, | |
| "learning_rate": 1.7473118279569895e-05, | |
| "loss": 0.5372, | |
| "mean_token_accuracy": 0.8488749146461487, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.15037593984962405, | |
| "grad_norm": 0.22582010917696982, | |
| "learning_rate": 1.881720430107527e-05, | |
| "loss": 0.5341, | |
| "mean_token_accuracy": 0.8489724159240722, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1611170784103115, | |
| "grad_norm": 0.2505238494065726, | |
| "learning_rate": 2.0161290322580645e-05, | |
| "loss": 0.5288, | |
| "mean_token_accuracy": 0.8500843226909638, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.17185821697099893, | |
| "grad_norm": 0.2485546682065235, | |
| "learning_rate": 2.1505376344086024e-05, | |
| "loss": 0.5265, | |
| "mean_token_accuracy": 0.8504622042179107, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18259935553168635, | |
| "grad_norm": 0.25134861732181085, | |
| "learning_rate": 2.28494623655914e-05, | |
| "loss": 0.5245, | |
| "mean_token_accuracy": 0.8512703776359558, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1933404940923738, | |
| "grad_norm": 0.2607421207193637, | |
| "learning_rate": 2.4193548387096777e-05, | |
| "loss": 0.5225, | |
| "mean_token_accuracy": 0.8512581944465637, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20408163265306123, | |
| "grad_norm": 0.2571937237076843, | |
| "learning_rate": 2.5537634408602152e-05, | |
| "loss": 0.5169, | |
| "mean_token_accuracy": 0.8526618123054505, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.21482277121374865, | |
| "grad_norm": 0.2559454741361629, | |
| "learning_rate": 2.6881720430107527e-05, | |
| "loss": 0.5087, | |
| "mean_token_accuracy": 0.8544329702854156, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.22556390977443608, | |
| "grad_norm": 0.25657620243689094, | |
| "learning_rate": 2.822580645161291e-05, | |
| "loss": 0.5069, | |
| "mean_token_accuracy": 0.8545464932918548, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.23630504833512353, | |
| "grad_norm": 0.3084326429216429, | |
| "learning_rate": 2.9569892473118284e-05, | |
| "loss": 0.5109, | |
| "mean_token_accuracy": 0.8538104116916656, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.24704618689581095, | |
| "grad_norm": 0.2964885334930525, | |
| "learning_rate": 3.091397849462366e-05, | |
| "loss": 0.5026, | |
| "mean_token_accuracy": 0.8555706679821015, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2577873254564984, | |
| "grad_norm": 0.2640055744535602, | |
| "learning_rate": 3.2258064516129034e-05, | |
| "loss": 0.4952, | |
| "mean_token_accuracy": 0.8576966226100922, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.26852846401718583, | |
| "grad_norm": 0.28061492437295604, | |
| "learning_rate": 3.360215053763441e-05, | |
| "loss": 0.4983, | |
| "mean_token_accuracy": 0.8568866074085235, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.27926960257787325, | |
| "grad_norm": 0.3222080670739919, | |
| "learning_rate": 3.494623655913979e-05, | |
| "loss": 0.4919, | |
| "mean_token_accuracy": 0.8582496762275695, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2900107411385607, | |
| "grad_norm": 0.3018861867966521, | |
| "learning_rate": 3.6290322580645165e-05, | |
| "loss": 0.4921, | |
| "mean_token_accuracy": 0.858267605304718, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 0.27298497353963225, | |
| "learning_rate": 3.763440860215054e-05, | |
| "loss": 0.4897, | |
| "mean_token_accuracy": 0.858799421787262, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.31149301825993553, | |
| "grad_norm": 0.29189277480966186, | |
| "learning_rate": 3.8978494623655915e-05, | |
| "loss": 0.4831, | |
| "mean_token_accuracy": 0.8604558348655701, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.322234156820623, | |
| "grad_norm": 0.28012276855965057, | |
| "learning_rate": 4.032258064516129e-05, | |
| "loss": 0.4834, | |
| "mean_token_accuracy": 0.8607946753501892, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.33297529538131043, | |
| "grad_norm": 0.2822021421564993, | |
| "learning_rate": 4.166666666666667e-05, | |
| "loss": 0.4822, | |
| "mean_token_accuracy": 0.8607180714607239, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.34371643394199786, | |
| "grad_norm": 0.2669043120039336, | |
| "learning_rate": 4.301075268817205e-05, | |
| "loss": 0.4709, | |
| "mean_token_accuracy": 0.8635617375373841, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.3544575725026853, | |
| "grad_norm": 0.26430063130872034, | |
| "learning_rate": 4.435483870967742e-05, | |
| "loss": 0.4759, | |
| "mean_token_accuracy": 0.8624868154525757, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3651987110633727, | |
| "grad_norm": 0.2768300795347462, | |
| "learning_rate": 4.56989247311828e-05, | |
| "loss": 0.4698, | |
| "mean_token_accuracy": 0.863774424791336, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.37593984962406013, | |
| "grad_norm": 0.27300710251352905, | |
| "learning_rate": 4.704301075268818e-05, | |
| "loss": 0.4688, | |
| "mean_token_accuracy": 0.8640853643417359, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.3866809881847476, | |
| "grad_norm": 0.28130219154214986, | |
| "learning_rate": 4.8387096774193554e-05, | |
| "loss": 0.4616, | |
| "mean_token_accuracy": 0.8659515857696534, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.39742212674543503, | |
| "grad_norm": 0.28040903261236555, | |
| "learning_rate": 4.973118279569893e-05, | |
| "loss": 0.4652, | |
| "mean_token_accuracy": 0.8656746566295623, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.40816326530612246, | |
| "grad_norm": 0.32637783754316196, | |
| "learning_rate": 4.999936604372673e-05, | |
| "loss": 0.4584, | |
| "mean_token_accuracy": 0.8662971913814544, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4189044038668099, | |
| "grad_norm": 0.3235247316768069, | |
| "learning_rate": 4.9996790657593474e-05, | |
| "loss": 0.4652, | |
| "mean_token_accuracy": 0.865262484550476, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.4296455424274973, | |
| "grad_norm": 0.2756975255703871, | |
| "learning_rate": 4.999223444591954e-05, | |
| "loss": 0.4533, | |
| "mean_token_accuracy": 0.8687061607837677, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.44038668098818473, | |
| "grad_norm": 0.26466440633632593, | |
| "learning_rate": 4.998569780987594e-05, | |
| "loss": 0.4521, | |
| "mean_token_accuracy": 0.8684524893760681, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.45112781954887216, | |
| "grad_norm": 0.25138863961089425, | |
| "learning_rate": 4.997718132500857e-05, | |
| "loss": 0.4456, | |
| "mean_token_accuracy": 0.8701819539070129, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.46186895810955964, | |
| "grad_norm": 0.3025611470224811, | |
| "learning_rate": 4.9966685741187544e-05, | |
| "loss": 0.447, | |
| "mean_token_accuracy": 0.8699068784713745, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.47261009667024706, | |
| "grad_norm": 0.24615962175136596, | |
| "learning_rate": 4.995421198254114e-05, | |
| "loss": 0.4445, | |
| "mean_token_accuracy": 0.8706246316432953, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.4833512352309345, | |
| "grad_norm": 0.23780094613136366, | |
| "learning_rate": 4.9939761147374455e-05, | |
| "loss": 0.444, | |
| "mean_token_accuracy": 0.8709352612495422, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4940923737916219, | |
| "grad_norm": 0.26418243428675386, | |
| "learning_rate": 4.992333450807268e-05, | |
| "loss": 0.4428, | |
| "mean_token_accuracy": 0.8712534010410309, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.5048335123523093, | |
| "grad_norm": 0.2452687330812135, | |
| "learning_rate": 4.990493351098908e-05, | |
| "loss": 0.4375, | |
| "mean_token_accuracy": 0.8728318750858307, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.5155746509129968, | |
| "grad_norm": 0.2688160648750715, | |
| "learning_rate": 4.9884559776317644e-05, | |
| "loss": 0.4353, | |
| "mean_token_accuracy": 0.8730437099933624, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 0.25960118051112435, | |
| "learning_rate": 4.986221509795043e-05, | |
| "loss": 0.4317, | |
| "mean_token_accuracy": 0.8739780306816101, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.5370569280343717, | |
| "grad_norm": 0.23341024093650933, | |
| "learning_rate": 4.98379014433196e-05, | |
| "loss": 0.4352, | |
| "mean_token_accuracy": 0.8733076274394989, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.547798066595059, | |
| "grad_norm": 0.25741008352215955, | |
| "learning_rate": 4.981162095322421e-05, | |
| "loss": 0.4324, | |
| "mean_token_accuracy": 0.8738310694694519, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5585392051557465, | |
| "grad_norm": 0.23274342659284017, | |
| "learning_rate": 4.9783375941641696e-05, | |
| "loss": 0.4321, | |
| "mean_token_accuracy": 0.8742413520812988, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.569280343716434, | |
| "grad_norm": 0.2451922230157493, | |
| "learning_rate": 4.9753168895524136e-05, | |
| "loss": 0.4202, | |
| "mean_token_accuracy": 0.8772394955158234, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.5800214822771214, | |
| "grad_norm": 0.2681975618828881, | |
| "learning_rate": 4.9721002474579285e-05, | |
| "loss": 0.4265, | |
| "mean_token_accuracy": 0.8758379638195037, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5907626208378088, | |
| "grad_norm": 0.22840035689897775, | |
| "learning_rate": 4.968687951103638e-05, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.8775071561336517, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 0.22300755601220718, | |
| "learning_rate": 4.965080300939675e-05, | |
| "loss": 0.4153, | |
| "mean_token_accuracy": 0.8784702062606812, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6122448979591837, | |
| "grad_norm": 0.22676783176605783, | |
| "learning_rate": 4.961277614616931e-05, | |
| "loss": 0.4168, | |
| "mean_token_accuracy": 0.8779775381088257, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.6229860365198711, | |
| "grad_norm": 0.24574274186354764, | |
| "learning_rate": 4.957280226959083e-05, | |
| "loss": 0.4119, | |
| "mean_token_accuracy": 0.8798301517963409, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6337271750805585, | |
| "grad_norm": 0.2281072685520932, | |
| "learning_rate": 4.953088489933117e-05, | |
| "loss": 0.4176, | |
| "mean_token_accuracy": 0.878108823299408, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.644468313641246, | |
| "grad_norm": 0.2606268344040068, | |
| "learning_rate": 4.948702772618332e-05, | |
| "loss": 0.4114, | |
| "mean_token_accuracy": 0.879868882894516, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6552094522019334, | |
| "grad_norm": 0.2192902541038699, | |
| "learning_rate": 4.944123461173849e-05, | |
| "loss": 0.4141, | |
| "mean_token_accuracy": 0.879179573059082, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6659505907626209, | |
| "grad_norm": 0.21550855803478997, | |
| "learning_rate": 4.9393509588046036e-05, | |
| "loss": 0.4053, | |
| "mean_token_accuracy": 0.8814833164215088, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6766917293233082, | |
| "grad_norm": 0.23830421980148422, | |
| "learning_rate": 4.934385685725851e-05, | |
| "loss": 0.4068, | |
| "mean_token_accuracy": 0.8807245373725892, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6874328678839957, | |
| "grad_norm": 0.22141238716961, | |
| "learning_rate": 4.9292280791261595e-05, | |
| "loss": 0.4023, | |
| "mean_token_accuracy": 0.8820916056632996, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6981740064446831, | |
| "grad_norm": 0.23798938808653466, | |
| "learning_rate": 4.9238785931289225e-05, | |
| "loss": 0.4042, | |
| "mean_token_accuracy": 0.882178908586502, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.7089151450053706, | |
| "grad_norm": 0.22152782163874513, | |
| "learning_rate": 4.918337698752367e-05, | |
| "loss": 0.4038, | |
| "mean_token_accuracy": 0.8820820569992065, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.719656283566058, | |
| "grad_norm": 0.2238393672437065, | |
| "learning_rate": 4.912605883868088e-05, | |
| "loss": 0.4094, | |
| "mean_token_accuracy": 0.8803297877311707, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.7303974221267454, | |
| "grad_norm": 0.2251835579056735, | |
| "learning_rate": 4.906683653158086e-05, | |
| "loss": 0.4022, | |
| "mean_token_accuracy": 0.8820242047309875, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7411385606874329, | |
| "grad_norm": 0.21096516273893903, | |
| "learning_rate": 4.9005715280703295e-05, | |
| "loss": 0.3963, | |
| "mean_token_accuracy": 0.8838990330696106, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.7518796992481203, | |
| "grad_norm": 0.20550443098708907, | |
| "learning_rate": 4.8942700467728505e-05, | |
| "loss": 0.3955, | |
| "mean_token_accuracy": 0.8842245638370514, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7626208378088077, | |
| "grad_norm": 0.2058867389466749, | |
| "learning_rate": 4.88777976410635e-05, | |
| "loss": 0.3995, | |
| "mean_token_accuracy": 0.8830176711082458, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7733619763694952, | |
| "grad_norm": 0.20958669116131587, | |
| "learning_rate": 4.8811012515353456e-05, | |
| "loss": 0.3911, | |
| "mean_token_accuracy": 0.8853914678096771, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7841031149301826, | |
| "grad_norm": 0.20397609182823062, | |
| "learning_rate": 4.874235097097861e-05, | |
| "loss": 0.393, | |
| "mean_token_accuracy": 0.8846873760223388, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7948442534908701, | |
| "grad_norm": 0.21645535614809533, | |
| "learning_rate": 4.8671819053536415e-05, | |
| "loss": 0.3922, | |
| "mean_token_accuracy": 0.8847495734691619, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.8055853920515574, | |
| "grad_norm": 0.22258952481615085, | |
| "learning_rate": 4.859942297330932e-05, | |
| "loss": 0.3982, | |
| "mean_token_accuracy": 0.8832435965538025, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.8163265306122449, | |
| "grad_norm": 0.2024612867389681, | |
| "learning_rate": 4.8525169104717846e-05, | |
| "loss": 0.3903, | |
| "mean_token_accuracy": 0.8853883922100068, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8270676691729323, | |
| "grad_norm": 0.20556087856635372, | |
| "learning_rate": 4.844906398575944e-05, | |
| "loss": 0.3964, | |
| "mean_token_accuracy": 0.8837718069553375, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.8378088077336198, | |
| "grad_norm": 0.20809549331239957, | |
| "learning_rate": 4.8371114317432726e-05, | |
| "loss": 0.3941, | |
| "mean_token_accuracy": 0.8842520952224732, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8485499462943072, | |
| "grad_norm": 0.21820552680801697, | |
| "learning_rate": 4.8291326963147524e-05, | |
| "loss": 0.3891, | |
| "mean_token_accuracy": 0.8858624398708344, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8592910848549946, | |
| "grad_norm": 0.20709264624327767, | |
| "learning_rate": 4.820970894812053e-05, | |
| "loss": 0.3845, | |
| "mean_token_accuracy": 0.886957323551178, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8700322234156821, | |
| "grad_norm": 0.21155796049345174, | |
| "learning_rate": 4.812626745875673e-05, | |
| "loss": 0.3909, | |
| "mean_token_accuracy": 0.8852347731590271, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8807733619763695, | |
| "grad_norm": 0.20230194258239817, | |
| "learning_rate": 4.804100984201667e-05, | |
| "loss": 0.3888, | |
| "mean_token_accuracy": 0.8856496810913086, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8915145005370569, | |
| "grad_norm": 0.1914371442320018, | |
| "learning_rate": 4.795394360476955e-05, | |
| "loss": 0.3927, | |
| "mean_token_accuracy": 0.885220056772232, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 0.21955921021321853, | |
| "learning_rate": 4.7865076413132234e-05, | |
| "loss": 0.3862, | |
| "mean_token_accuracy": 0.8869829177856445, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.9129967776584318, | |
| "grad_norm": 0.19993088700133185, | |
| "learning_rate": 4.777441609179428e-05, | |
| "loss": 0.389, | |
| "mean_token_accuracy": 0.8861649572849274, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.9237379162191193, | |
| "grad_norm": 0.20214442771764315, | |
| "learning_rate": 4.768197062332898e-05, | |
| "loss": 0.3805, | |
| "mean_token_accuracy": 0.8884122192859649, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9344790547798066, | |
| "grad_norm": 0.1936799045011743, | |
| "learning_rate": 4.758774814749046e-05, | |
| "loss": 0.3825, | |
| "mean_token_accuracy": 0.8876857936382294, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.9452201933404941, | |
| "grad_norm": 0.19325903425845148, | |
| "learning_rate": 4.749175696049706e-05, | |
| "loss": 0.3826, | |
| "mean_token_accuracy": 0.8881516516208648, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9559613319011815, | |
| "grad_norm": 0.19255187762230458, | |
| "learning_rate": 4.739400551430077e-05, | |
| "loss": 0.3811, | |
| "mean_token_accuracy": 0.8880790531635284, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.966702470461869, | |
| "grad_norm": 0.19450067956842618, | |
| "learning_rate": 4.7294502415843105e-05, | |
| "loss": 0.3783, | |
| "mean_token_accuracy": 0.8890111207962036, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9774436090225563, | |
| "grad_norm": 0.20174438790639918, | |
| "learning_rate": 4.719325642629722e-05, | |
| "loss": 0.378, | |
| "mean_token_accuracy": 0.8890378654003144, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.9881847475832438, | |
| "grad_norm": 0.17832896478111976, | |
| "learning_rate": 4.7090276460296555e-05, | |
| "loss": 0.3843, | |
| "mean_token_accuracy": 0.8872815728187561, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9989258861439313, | |
| "grad_norm": 0.1913931630832869, | |
| "learning_rate": 4.6985571585149876e-05, | |
| "loss": 0.3796, | |
| "mean_token_accuracy": 0.8887166023254395, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.0085929108485499, | |
| "grad_norm": 0.20263869484120534, | |
| "learning_rate": 4.687915102004286e-05, | |
| "loss": 0.3614, | |
| "mean_token_accuracy": 0.8926012317339579, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.0193340494092373, | |
| "grad_norm": 0.19678722825673817, | |
| "learning_rate": 4.677102413522645e-05, | |
| "loss": 0.3495, | |
| "mean_token_accuracy": 0.8955722391605377, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.0300751879699248, | |
| "grad_norm": 0.20376503491728473, | |
| "learning_rate": 4.666120045119174e-05, | |
| "loss": 0.3507, | |
| "mean_token_accuracy": 0.8951772391796112, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0408163265306123, | |
| "grad_norm": 0.2019062903436488, | |
| "learning_rate": 4.654968963783171e-05, | |
| "loss": 0.3531, | |
| "mean_token_accuracy": 0.8947476446628571, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.0515574650912998, | |
| "grad_norm": 0.18722603018624961, | |
| "learning_rate": 4.643650151358983e-05, | |
| "loss": 0.3526, | |
| "mean_token_accuracy": 0.894485878944397, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.062298603651987, | |
| "grad_norm": 0.19481656873843595, | |
| "learning_rate": 4.632164604459553e-05, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.8964617013931274, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.0730397422126745, | |
| "grad_norm": 0.18585853331072713, | |
| "learning_rate": 4.620513334378669e-05, | |
| "loss": 0.3512, | |
| "mean_token_accuracy": 0.8950131058692932, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.083780880773362, | |
| "grad_norm": 0.1930388596228489, | |
| "learning_rate": 4.608697367001921e-05, | |
| "loss": 0.3479, | |
| "mean_token_accuracy": 0.895933198928833, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.0945220193340495, | |
| "grad_norm": 0.1978189680563173, | |
| "learning_rate": 4.596717742716372e-05, | |
| "loss": 0.3532, | |
| "mean_token_accuracy": 0.8942179441452026, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 0.2198969141563894, | |
| "learning_rate": 4.584575516318954e-05, | |
| "loss": 0.3492, | |
| "mean_token_accuracy": 0.8957188785076141, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.1160042964554242, | |
| "grad_norm": 0.19175977623621587, | |
| "learning_rate": 4.5722717569235924e-05, | |
| "loss": 0.3553, | |
| "mean_token_accuracy": 0.8938140749931336, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.1267454350161117, | |
| "grad_norm": 0.1995625771811619, | |
| "learning_rate": 4.559807547867071e-05, | |
| "loss": 0.3493, | |
| "mean_token_accuracy": 0.8954446971416473, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.1374865735767992, | |
| "grad_norm": 0.1915734911527379, | |
| "learning_rate": 4.5471839866136475e-05, | |
| "loss": 0.3491, | |
| "mean_token_accuracy": 0.8957653522491456, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.1482277121374866, | |
| "grad_norm": 0.19836797519712018, | |
| "learning_rate": 4.5344021846584205e-05, | |
| "loss": 0.3539, | |
| "mean_token_accuracy": 0.8943828701972961, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.158968850698174, | |
| "grad_norm": 0.18808462761740152, | |
| "learning_rate": 4.521463267429464e-05, | |
| "loss": 0.3497, | |
| "mean_token_accuracy": 0.8953365862369538, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1697099892588614, | |
| "grad_norm": 0.19280122016496182, | |
| "learning_rate": 4.508368374188731e-05, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.8953313529491425, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.1804511278195489, | |
| "grad_norm": 0.19677371481260625, | |
| "learning_rate": 4.4951186579317504e-05, | |
| "loss": 0.3528, | |
| "mean_token_accuracy": 0.8949146151542664, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1911922663802363, | |
| "grad_norm": 0.18538032977972374, | |
| "learning_rate": 4.481715285286098e-05, | |
| "loss": 0.3541, | |
| "mean_token_accuracy": 0.8939870595932007, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.2019334049409238, | |
| "grad_norm": 0.18481539602601102, | |
| "learning_rate": 4.46815943640868e-05, | |
| "loss": 0.3553, | |
| "mean_token_accuracy": 0.8940768420696259, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.212674543501611, | |
| "grad_norm": 0.1861386211911988, | |
| "learning_rate": 4.454452304881821e-05, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.8959418594837188, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.2234156820622986, | |
| "grad_norm": 0.18228266310501318, | |
| "learning_rate": 4.440595097608168e-05, | |
| "loss": 0.3467, | |
| "mean_token_accuracy": 0.8962770164012909, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.234156820622986, | |
| "grad_norm": 0.1841361210717962, | |
| "learning_rate": 4.426589034704428e-05, | |
| "loss": 0.3536, | |
| "mean_token_accuracy": 0.8943024933338165, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.2448979591836735, | |
| "grad_norm": 0.17281724579297167, | |
| "learning_rate": 4.412435349393931e-05, | |
| "loss": 0.3509, | |
| "mean_token_accuracy": 0.8950875043869019, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.255639097744361, | |
| "grad_norm": 0.1772300668593227, | |
| "learning_rate": 4.398135287898052e-05, | |
| "loss": 0.3485, | |
| "mean_token_accuracy": 0.8955003321170807, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.2663802363050483, | |
| "grad_norm": 0.17772581177798846, | |
| "learning_rate": 4.383690109326477e-05, | |
| "loss": 0.3459, | |
| "mean_token_accuracy": 0.8965889751911164, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.2771213748657357, | |
| "grad_norm": 0.18596059716645308, | |
| "learning_rate": 4.369101085566342e-05, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.8954894125461579, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.2878625134264232, | |
| "grad_norm": 0.17598132780016223, | |
| "learning_rate": 4.354369501170246e-05, | |
| "loss": 0.3479, | |
| "mean_token_accuracy": 0.8960169315338135, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2986036519871107, | |
| "grad_norm": 0.1804871594490513, | |
| "learning_rate": 4.3394966532431433e-05, | |
| "loss": 0.352, | |
| "mean_token_accuracy": 0.8948932409286499, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.3093447905477982, | |
| "grad_norm": 0.1865297212423964, | |
| "learning_rate": 4.3244838513281367e-05, | |
| "loss": 0.3515, | |
| "mean_token_accuracy": 0.8949047923088074, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.3200859291084854, | |
| "grad_norm": 0.18053270547327416, | |
| "learning_rate": 4.309332417291172e-05, | |
| "loss": 0.3505, | |
| "mean_token_accuracy": 0.8953122675418854, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.330827067669173, | |
| "grad_norm": 0.1744036148367508, | |
| "learning_rate": 4.294043685204651e-05, | |
| "loss": 0.3474, | |
| "mean_token_accuracy": 0.8960575997829437, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.3415682062298604, | |
| "grad_norm": 0.16842924897825143, | |
| "learning_rate": 4.278619001229962e-05, | |
| "loss": 0.3474, | |
| "mean_token_accuracy": 0.8961166024208069, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.3523093447905479, | |
| "grad_norm": 0.17741079904542595, | |
| "learning_rate": 4.263059723498961e-05, | |
| "loss": 0.3474, | |
| "mean_token_accuracy": 0.8962021231651306, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3630504833512354, | |
| "grad_norm": 0.17634563486082044, | |
| "learning_rate": 4.247367221994377e-05, | |
| "loss": 0.352, | |
| "mean_token_accuracy": 0.8948638260364532, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.3737916219119226, | |
| "grad_norm": 0.16514936818638581, | |
| "learning_rate": 4.2315428784291965e-05, | |
| "loss": 0.348, | |
| "mean_token_accuracy": 0.8962691247463226, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.38453276047261, | |
| "grad_norm": 0.18156198450594868, | |
| "learning_rate": 4.215588086125001e-05, | |
| "loss": 0.3473, | |
| "mean_token_accuracy": 0.8962475776672363, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.3952738990332976, | |
| "grad_norm": 0.17302374962454448, | |
| "learning_rate": 4.199504249889279e-05, | |
| "loss": 0.3499, | |
| "mean_token_accuracy": 0.8956164479255676, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.4060150375939848, | |
| "grad_norm": 0.17009271559786848, | |
| "learning_rate": 4.18329278589175e-05, | |
| "loss": 0.3481, | |
| "mean_token_accuracy": 0.8962275862693787, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.4167561761546725, | |
| "grad_norm": 0.17232579890547844, | |
| "learning_rate": 4.166955121539656e-05, | |
| "loss": 0.3452, | |
| "mean_token_accuracy": 0.8966892838478089, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.4274973147153598, | |
| "grad_norm": 0.18931912307479049, | |
| "learning_rate": 4.150492695352086e-05, | |
| "loss": 0.3476, | |
| "mean_token_accuracy": 0.8961862683296203, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.4382384532760473, | |
| "grad_norm": 0.1812257587896816, | |
| "learning_rate": 4.133906956833316e-05, | |
| "loss": 0.3451, | |
| "mean_token_accuracy": 0.8965191125869751, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.4489795918367347, | |
| "grad_norm": 0.18448866093949617, | |
| "learning_rate": 4.1171993663451816e-05, | |
| "loss": 0.3453, | |
| "mean_token_accuracy": 0.8967220306396484, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.459720730397422, | |
| "grad_norm": 0.16318177527247005, | |
| "learning_rate": 4.1003713949784905e-05, | |
| "loss": 0.3491, | |
| "mean_token_accuracy": 0.8957133948802948, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.4704618689581095, | |
| "grad_norm": 0.19223128076002124, | |
| "learning_rate": 4.083424524423498e-05, | |
| "loss": 0.3475, | |
| "mean_token_accuracy": 0.8962952673435212, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.481203007518797, | |
| "grad_norm": 0.17065645296533696, | |
| "learning_rate": 4.066360246839442e-05, | |
| "loss": 0.3495, | |
| "mean_token_accuracy": 0.8956079244613647, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4919441460794844, | |
| "grad_norm": 0.1613801844631258, | |
| "learning_rate": 4.049180064723164e-05, | |
| "loss": 0.3491, | |
| "mean_token_accuracy": 0.8964253485202789, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.502685284640172, | |
| "grad_norm": 0.17729165960730092, | |
| "learning_rate": 4.031885490776811e-05, | |
| "loss": 0.3461, | |
| "mean_token_accuracy": 0.8965683281421661, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.5134264232008592, | |
| "grad_norm": 0.16772417608227957, | |
| "learning_rate": 4.014478047774644e-05, | |
| "loss": 0.3486, | |
| "mean_token_accuracy": 0.8959019482135773, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.5241675617615469, | |
| "grad_norm": 0.1654092742061062, | |
| "learning_rate": 3.99695926842896e-05, | |
| "loss": 0.3452, | |
| "mean_token_accuracy": 0.8970151007175445, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.5349087003222341, | |
| "grad_norm": 0.1770663143483711, | |
| "learning_rate": 3.979330695255139e-05, | |
| "loss": 0.3504, | |
| "mean_token_accuracy": 0.8954713106155395, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.5456498388829216, | |
| "grad_norm": 0.16250407421180885, | |
| "learning_rate": 3.9615938804358254e-05, | |
| "loss": 0.3403, | |
| "mean_token_accuracy": 0.8980903148651123, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.556390977443609, | |
| "grad_norm": 0.1739734421973896, | |
| "learning_rate": 3.943750385684257e-05, | |
| "loss": 0.3452, | |
| "mean_token_accuracy": 0.8973391890525818, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.5671321160042964, | |
| "grad_norm": 0.17020682906702797, | |
| "learning_rate": 3.9258017821067595e-05, | |
| "loss": 0.341, | |
| "mean_token_accuracy": 0.8981746196746826, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.5778732545649838, | |
| "grad_norm": 0.17090518777542177, | |
| "learning_rate": 3.907749650064416e-05, | |
| "loss": 0.3475, | |
| "mean_token_accuracy": 0.8964370787143707, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.5886143931256713, | |
| "grad_norm": 0.18226436070710383, | |
| "learning_rate": 3.889595579033907e-05, | |
| "loss": 0.3548, | |
| "mean_token_accuracy": 0.8943204343318939, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.5993555316863588, | |
| "grad_norm": 0.16867971152976394, | |
| "learning_rate": 3.8713411674675706e-05, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.8964660108089447, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.6100966702470463, | |
| "grad_norm": 0.1634124661472663, | |
| "learning_rate": 3.8529880226526504e-05, | |
| "loss": 0.3419, | |
| "mean_token_accuracy": 0.897741311788559, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.6208378088077335, | |
| "grad_norm": 0.16728119897984747, | |
| "learning_rate": 3.834537760569779e-05, | |
| "loss": 0.3477, | |
| "mean_token_accuracy": 0.8964338660240173, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 0.16636899767836238, | |
| "learning_rate": 3.815992005750691e-05, | |
| "loss": 0.3454, | |
| "mean_token_accuracy": 0.897176194190979, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.6423200859291085, | |
| "grad_norm": 0.17370655470517776, | |
| "learning_rate": 3.7973523911351873e-05, | |
| "loss": 0.3457, | |
| "mean_token_accuracy": 0.8967864811420441, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.6530612244897958, | |
| "grad_norm": 0.17387140846382934, | |
| "learning_rate": 3.7786205579273494e-05, | |
| "loss": 0.3461, | |
| "mean_token_accuracy": 0.896539443731308, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.6638023630504835, | |
| "grad_norm": 0.17312244395133694, | |
| "learning_rate": 3.75979815545104e-05, | |
| "loss": 0.3469, | |
| "mean_token_accuracy": 0.8965823531150818, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.6745435016111707, | |
| "grad_norm": 0.17134683681288093, | |
| "learning_rate": 3.740886841004678e-05, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.8972635090351104, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6852846401718582, | |
| "grad_norm": 0.1703220892784228, | |
| "learning_rate": 3.72188827971531e-05, | |
| "loss": 0.349, | |
| "mean_token_accuracy": 0.8958061695098877, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.6960257787325457, | |
| "grad_norm": 0.15629690421483755, | |
| "learning_rate": 3.7028041443920106e-05, | |
| "loss": 0.345, | |
| "mean_token_accuracy": 0.8972305715084076, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.706766917293233, | |
| "grad_norm": 0.16968855316404596, | |
| "learning_rate": 3.6836361153785735e-05, | |
| "loss": 0.3391, | |
| "mean_token_accuracy": 0.8984034955501556, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.7175080558539206, | |
| "grad_norm": 0.1613956545932139, | |
| "learning_rate": 3.6643858804055764e-05, | |
| "loss": 0.3418, | |
| "mean_token_accuracy": 0.8975095868110656, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.728249194414608, | |
| "grad_norm": 0.16488649273144998, | |
| "learning_rate": 3.6450551344417656e-05, | |
| "loss": 0.347, | |
| "mean_token_accuracy": 0.8963462889194489, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.7389903329752954, | |
| "grad_norm": 0.18336562912600562, | |
| "learning_rate": 3.625645579544824e-05, | |
| "loss": 0.3417, | |
| "mean_token_accuracy": 0.8978760004043579, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7497314715359829, | |
| "grad_norm": 0.16442030655020706, | |
| "learning_rate": 3.606158924711498e-05, | |
| "loss": 0.3418, | |
| "mean_token_accuracy": 0.8984208166599273, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.76047261009667, | |
| "grad_norm": 0.1648466060868627, | |
| "learning_rate": 3.586596885727126e-05, | |
| "loss": 0.346, | |
| "mean_token_accuracy": 0.8967172205448151, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.7712137486573578, | |
| "grad_norm": 0.16380950472689287, | |
| "learning_rate": 3.5669611850145676e-05, | |
| "loss": 0.3404, | |
| "mean_token_accuracy": 0.8981300175189972, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.781954887218045, | |
| "grad_norm": 0.16476649720519732, | |
| "learning_rate": 3.54725355148254e-05, | |
| "loss": 0.3417, | |
| "mean_token_accuracy": 0.8978650271892548, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.7926960257787325, | |
| "grad_norm": 0.16250342083791575, | |
| "learning_rate": 3.5274757203733906e-05, | |
| "loss": 0.3429, | |
| "mean_token_accuracy": 0.8977679431438446, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.80343716433942, | |
| "grad_norm": 0.1666333005283665, | |
| "learning_rate": 3.507629433110311e-05, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.8972832322120666, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.8141783029001073, | |
| "grad_norm": 0.1615387362712691, | |
| "learning_rate": 3.4877164371440075e-05, | |
| "loss": 0.3453, | |
| "mean_token_accuracy": 0.8970289349555969, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.824919441460795, | |
| "grad_norm": 0.16676447906725542, | |
| "learning_rate": 3.467738485798836e-05, | |
| "loss": 0.3451, | |
| "mean_token_accuracy": 0.8969220995903016, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.8356605800214822, | |
| "grad_norm": 0.16168843045380168, | |
| "learning_rate": 3.447697338118425e-05, | |
| "loss": 0.3395, | |
| "mean_token_accuracy": 0.898131811618805, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.8464017185821697, | |
| "grad_norm": 0.15334942056157058, | |
| "learning_rate": 3.427594758710794e-05, | |
| "loss": 0.3422, | |
| "mean_token_accuracy": 0.8975472927093506, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 0.1672358555124429, | |
| "learning_rate": 3.407432517592979e-05, | |
| "loss": 0.3403, | |
| "mean_token_accuracy": 0.8983366131782532, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.8678839957035445, | |
| "grad_norm": 0.161941088262071, | |
| "learning_rate": 3.3872123900351835e-05, | |
| "loss": 0.3408, | |
| "mean_token_accuracy": 0.8978644967079162, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.8786251342642322, | |
| "grad_norm": 0.1519842470665007, | |
| "learning_rate": 3.3669361564044735e-05, | |
| "loss": 0.3396, | |
| "mean_token_accuracy": 0.898490047454834, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.8893662728249194, | |
| "grad_norm": 0.16037110333088753, | |
| "learning_rate": 3.346605602008007e-05, | |
| "loss": 0.3417, | |
| "mean_token_accuracy": 0.8977841079235077, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.900107411385607, | |
| "grad_norm": 0.16442639618093918, | |
| "learning_rate": 3.326222516935847e-05, | |
| "loss": 0.3437, | |
| "mean_token_accuracy": 0.8971070289611817, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.9108485499462944, | |
| "grad_norm": 0.15289173675825762, | |
| "learning_rate": 3.3057886959033426e-05, | |
| "loss": 0.3416, | |
| "mean_token_accuracy": 0.8984978437423706, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.9215896885069816, | |
| "grad_norm": 0.14450841113047458, | |
| "learning_rate": 3.285305938093108e-05, | |
| "loss": 0.3392, | |
| "mean_token_accuracy": 0.8983058393001556, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.9323308270676691, | |
| "grad_norm": 0.15549384924856993, | |
| "learning_rate": 3.264776046996602e-05, | |
| "loss": 0.3394, | |
| "mean_token_accuracy": 0.8985956251621247, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.9430719656283566, | |
| "grad_norm": 0.162459823198956, | |
| "learning_rate": 3.2442008302553346e-05, | |
| "loss": 0.34, | |
| "mean_token_accuracy": 0.8984286248683929, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.953813104189044, | |
| "grad_norm": 0.15039221824995944, | |
| "learning_rate": 3.223582099501704e-05, | |
| "loss": 0.3374, | |
| "mean_token_accuracy": 0.8987222969532013, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.9645542427497316, | |
| "grad_norm": 0.1564002589458454, | |
| "learning_rate": 3.202921670199485e-05, | |
| "loss": 0.3369, | |
| "mean_token_accuracy": 0.8994980156421661, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.9752953813104188, | |
| "grad_norm": 0.17459425481905663, | |
| "learning_rate": 3.182221361483981e-05, | |
| "loss": 0.3426, | |
| "mean_token_accuracy": 0.8977073311805726, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.9860365198711063, | |
| "grad_norm": 0.15953782868809285, | |
| "learning_rate": 3.161482996001842e-05, | |
| "loss": 0.3406, | |
| "mean_token_accuracy": 0.8983509004116058, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.9967776584317938, | |
| "grad_norm": 0.15713432539772912, | |
| "learning_rate": 3.140708399750594e-05, | |
| "loss": 0.3421, | |
| "mean_token_accuracy": 0.8979579448699951, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 2.0064446831364124, | |
| "grad_norm": 0.16209947632099436, | |
| "learning_rate": 3.11989940191785e-05, | |
| "loss": 0.3137, | |
| "mean_token_accuracy": 0.9049130148357816, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 2.0171858216970997, | |
| "grad_norm": 0.18807228831939848, | |
| "learning_rate": 3.09905783472026e-05, | |
| "loss": 0.305, | |
| "mean_token_accuracy": 0.9070174276828766, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 2.0279269602577874, | |
| "grad_norm": 0.1647631068534088, | |
| "learning_rate": 3.07818553324218e-05, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.9071334481239319, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 2.0386680988184747, | |
| "grad_norm": 0.16628057896853762, | |
| "learning_rate": 3.057284335274097e-05, | |
| "loss": 0.3026, | |
| "mean_token_accuracy": 0.9071128606796265, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.0494092373791624, | |
| "grad_norm": 0.16953299184244167, | |
| "learning_rate": 3.036356081150813e-05, | |
| "loss": 0.3034, | |
| "mean_token_accuracy": 0.9072185814380646, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 2.0601503759398496, | |
| "grad_norm": 0.16119678084859076, | |
| "learning_rate": 3.0154026135894043e-05, | |
| "loss": 0.2994, | |
| "mean_token_accuracy": 0.9083474159240723, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.070891514500537, | |
| "grad_norm": 0.16680753647576305, | |
| "learning_rate": 2.9944257775269686e-05, | |
| "loss": 0.3046, | |
| "mean_token_accuracy": 0.9070303261280059, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 2.0816326530612246, | |
| "grad_norm": 0.1557469947598615, | |
| "learning_rate": 2.9734274199581857e-05, | |
| "loss": 0.3028, | |
| "mean_token_accuracy": 0.9075248777866364, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.092373791621912, | |
| "grad_norm": 0.15821336281763043, | |
| "learning_rate": 2.9524093897726875e-05, | |
| "loss": 0.2992, | |
| "mean_token_accuracy": 0.9085965514183044, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 2.1031149301825995, | |
| "grad_norm": 0.16912179860419502, | |
| "learning_rate": 2.931373537592264e-05, | |
| "loss": 0.3059, | |
| "mean_token_accuracy": 0.9063934266567231, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.113856068743287, | |
| "grad_norm": 0.1568909903521791, | |
| "learning_rate": 2.9103217156079183e-05, | |
| "loss": 0.3017, | |
| "mean_token_accuracy": 0.9079225361347198, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 2.124597207303974, | |
| "grad_norm": 0.17149311680209844, | |
| "learning_rate": 2.8892557774167843e-05, | |
| "loss": 0.3023, | |
| "mean_token_accuracy": 0.9075566232204437, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.1353383458646618, | |
| "grad_norm": 0.1730679539636109, | |
| "learning_rate": 2.8681775778589164e-05, | |
| "loss": 0.3031, | |
| "mean_token_accuracy": 0.9074501514434814, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 2.146079484425349, | |
| "grad_norm": 0.168662599711155, | |
| "learning_rate": 2.8470889728539725e-05, | |
| "loss": 0.302, | |
| "mean_token_accuracy": 0.9077127814292908, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.1568206229860367, | |
| "grad_norm": 0.16226284047590997, | |
| "learning_rate": 2.8259918192378038e-05, | |
| "loss": 0.3041, | |
| "mean_token_accuracy": 0.9070930540561676, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 2.167561761546724, | |
| "grad_norm": 0.1576781128963043, | |
| "learning_rate": 2.804887974598959e-05, | |
| "loss": 0.3022, | |
| "mean_token_accuracy": 0.907502681016922, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.1783029001074112, | |
| "grad_norm": 0.15997962819428427, | |
| "learning_rate": 2.7837792971151268e-05, | |
| "loss": 0.3018, | |
| "mean_token_accuracy": 0.9079727530479431, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 2.189044038668099, | |
| "grad_norm": 0.16962861365112525, | |
| "learning_rate": 2.7626676453895238e-05, | |
| "loss": 0.3031, | |
| "mean_token_accuracy": 0.9071884095668793, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.199785177228786, | |
| "grad_norm": 0.16322576238996814, | |
| "learning_rate": 2.7415548782872468e-05, | |
| "loss": 0.3057, | |
| "mean_token_accuracy": 0.9065694689750672, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.16909277271966566, | |
| "learning_rate": 2.7204428547716027e-05, | |
| "loss": 0.3052, | |
| "mean_token_accuracy": 0.9069810092449189, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.221267454350161, | |
| "grad_norm": 0.16098166127750824, | |
| "learning_rate": 2.699333433740422e-05, | |
| "loss": 0.3034, | |
| "mean_token_accuracy": 0.907333254814148, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 2.2320085929108484, | |
| "grad_norm": 0.17075220096927826, | |
| "learning_rate": 2.678228473862391e-05, | |
| "loss": 0.3059, | |
| "mean_token_accuracy": 0.9066526055335998, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.242749731471536, | |
| "grad_norm": 0.16370207033646628, | |
| "learning_rate": 2.6571298334133947e-05, | |
| "loss": 0.3049, | |
| "mean_token_accuracy": 0.9068757057189941, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 2.2534908700322234, | |
| "grad_norm": 0.1611010495321633, | |
| "learning_rate": 2.6360393701128968e-05, | |
| "loss": 0.3058, | |
| "mean_token_accuracy": 0.9067712783813476, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.264232008592911, | |
| "grad_norm": 0.16970228504955862, | |
| "learning_rate": 2.614958940960369e-05, | |
| "loss": 0.3052, | |
| "mean_token_accuracy": 0.9068210601806641, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 2.2749731471535983, | |
| "grad_norm": 0.1677663409783765, | |
| "learning_rate": 2.593890402071784e-05, | |
| "loss": 0.303, | |
| "mean_token_accuracy": 0.9071888148784637, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 0.1594126722501793, | |
| "learning_rate": 2.5728356085161864e-05, | |
| "loss": 0.2979, | |
| "mean_token_accuracy": 0.9088397026062012, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 2.2964554242749733, | |
| "grad_norm": 0.15755295908932457, | |
| "learning_rate": 2.5517964141523525e-05, | |
| "loss": 0.3009, | |
| "mean_token_accuracy": 0.9078912615776062, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.3071965628356605, | |
| "grad_norm": 0.15824119025266686, | |
| "learning_rate": 2.5307746714655634e-05, | |
| "loss": 0.3065, | |
| "mean_token_accuracy": 0.9067668735980987, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 2.317937701396348, | |
| "grad_norm": 0.1593424773763769, | |
| "learning_rate": 2.509772231404493e-05, | |
| "loss": 0.3072, | |
| "mean_token_accuracy": 0.9063262104988098, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.3286788399570355, | |
| "grad_norm": 0.16745585583895234, | |
| "learning_rate": 2.4887909432182316e-05, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.9050490736961365, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 2.3394199785177228, | |
| "grad_norm": 0.18108073198198416, | |
| "learning_rate": 2.4678326542934667e-05, | |
| "loss": 0.3048, | |
| "mean_token_accuracy": 0.9068881213665009, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.3501611170784105, | |
| "grad_norm": 0.17241262713318053, | |
| "learning_rate": 2.4468992099918138e-05, | |
| "loss": 0.3032, | |
| "mean_token_accuracy": 0.9073716223239898, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 2.3609022556390977, | |
| "grad_norm": 0.16397300617763141, | |
| "learning_rate": 2.4259924534873385e-05, | |
| "loss": 0.3061, | |
| "mean_token_accuracy": 0.9062675356864929, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.3716433941997854, | |
| "grad_norm": 0.1700811614554712, | |
| "learning_rate": 2.4051142256042697e-05, | |
| "loss": 0.3011, | |
| "mean_token_accuracy": 0.90796759724617, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 2.3823845327604727, | |
| "grad_norm": 0.16924471517889025, | |
| "learning_rate": 2.3842663646549085e-05, | |
| "loss": 0.3025, | |
| "mean_token_accuracy": 0.9076179921627044, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.39312567132116, | |
| "grad_norm": 0.582746886765867, | |
| "learning_rate": 2.3634507062777726e-05, | |
| "loss": 0.3036, | |
| "mean_token_accuracy": 0.9076011419296265, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 2.4038668098818476, | |
| "grad_norm": 0.15789580559295846, | |
| "learning_rate": 2.3426690832759652e-05, | |
| "loss": 0.2997, | |
| "mean_token_accuracy": 0.9084276914596557, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.414607948442535, | |
| "grad_norm": 0.15924353242995867, | |
| "learning_rate": 2.3219233254558025e-05, | |
| "loss": 0.3029, | |
| "mean_token_accuracy": 0.9074055433273316, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 2.425349087003222, | |
| "grad_norm": 0.16646800963930639, | |
| "learning_rate": 2.3012152594656982e-05, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.9070705771446228, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.43609022556391, | |
| "grad_norm": 0.16197886055551655, | |
| "learning_rate": 2.2805467086353268e-05, | |
| "loss": 0.2983, | |
| "mean_token_accuracy": 0.9087878286838531, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 2.446831364124597, | |
| "grad_norm": 0.16381004501438137, | |
| "learning_rate": 2.2599194928150842e-05, | |
| "loss": 0.3037, | |
| "mean_token_accuracy": 0.9073452115058899, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.457572502685285, | |
| "grad_norm": 0.16540282102993875, | |
| "learning_rate": 2.239335428215849e-05, | |
| "loss": 0.3042, | |
| "mean_token_accuracy": 0.9071446895599365, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 2.468313641245972, | |
| "grad_norm": 0.16037824203377551, | |
| "learning_rate": 2.2187963272490676e-05, | |
| "loss": 0.3022, | |
| "mean_token_accuracy": 0.9079298913478852, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.4790547798066593, | |
| "grad_norm": 0.15882572997154093, | |
| "learning_rate": 2.198303998367171e-05, | |
| "loss": 0.3067, | |
| "mean_token_accuracy": 0.9064932882785797, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 2.489795918367347, | |
| "grad_norm": 0.15831447424850761, | |
| "learning_rate": 2.1778602459043452e-05, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.9070046961307525, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.5005370569280343, | |
| "grad_norm": 0.16081532493077333, | |
| "learning_rate": 2.157466869917658e-05, | |
| "loss": 0.3041, | |
| "mean_token_accuracy": 0.9073209702968598, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 2.511278195488722, | |
| "grad_norm": 0.15516248272553126, | |
| "learning_rate": 2.1371256660285655e-05, | |
| "loss": 0.3044, | |
| "mean_token_accuracy": 0.9070526838302613, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.5220193340494093, | |
| "grad_norm": 0.1587382733948704, | |
| "learning_rate": 2.1168384252648117e-05, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.9086295425891876, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 2.5327604726100965, | |
| "grad_norm": 0.15919430172381277, | |
| "learning_rate": 2.0966069339027256e-05, | |
| "loss": 0.3017, | |
| "mean_token_accuracy": 0.9076282560825348, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.543501611170784, | |
| "grad_norm": 0.1602383119084914, | |
| "learning_rate": 2.0764329733099446e-05, | |
| "loss": 0.2998, | |
| "mean_token_accuracy": 0.9084926426410675, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 2.5542427497314715, | |
| "grad_norm": 0.16156220155082493, | |
| "learning_rate": 2.0563183197885653e-05, | |
| "loss": 0.3068, | |
| "mean_token_accuracy": 0.9063272118568421, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.5649838882921587, | |
| "grad_norm": 0.15676424327787444, | |
| "learning_rate": 2.03626474441874e-05, | |
| "loss": 0.304, | |
| "mean_token_accuracy": 0.9073390066623688, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 2.5757250268528464, | |
| "grad_norm": 0.16064943066993936, | |
| "learning_rate": 2.016274012902737e-05, | |
| "loss": 0.3031, | |
| "mean_token_accuracy": 0.9080215394496918, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.5864661654135337, | |
| "grad_norm": 0.15163324815906554, | |
| "learning_rate": 1.996347885409468e-05, | |
| "loss": 0.2995, | |
| "mean_token_accuracy": 0.9081439912319184, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 2.5972073039742214, | |
| "grad_norm": 0.16245754277077917, | |
| "learning_rate": 1.9764881164195113e-05, | |
| "loss": 0.3015, | |
| "mean_token_accuracy": 0.907852166891098, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.6079484425349087, | |
| "grad_norm": 0.16043196872565563, | |
| "learning_rate": 1.956696454570629e-05, | |
| "loss": 0.3038, | |
| "mean_token_accuracy": 0.9070708453655243, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 2.6186895810955964, | |
| "grad_norm": 0.1518503511295408, | |
| "learning_rate": 1.9369746425037983e-05, | |
| "loss": 0.3031, | |
| "mean_token_accuracy": 0.9073640763759613, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.6294307196562836, | |
| "grad_norm": 0.16579054364092405, | |
| "learning_rate": 1.9173244167097766e-05, | |
| "loss": 0.3021, | |
| "mean_token_accuracy": 0.9075863361358643, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 2.640171858216971, | |
| "grad_norm": 0.16096483480946194, | |
| "learning_rate": 1.8977475073762042e-05, | |
| "loss": 0.3024, | |
| "mean_token_accuracy": 0.907714718580246, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.6509129967776586, | |
| "grad_norm": 0.16586554619371632, | |
| "learning_rate": 1.878245638235262e-05, | |
| "loss": 0.3032, | |
| "mean_token_accuracy": 0.9077441573143006, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 2.661654135338346, | |
| "grad_norm": 0.17145727431540336, | |
| "learning_rate": 1.8588205264118974e-05, | |
| "loss": 0.3007, | |
| "mean_token_accuracy": 0.9080956459045411, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.672395273899033, | |
| "grad_norm": 0.16247484247551466, | |
| "learning_rate": 1.8394738822726337e-05, | |
| "loss": 0.3078, | |
| "mean_token_accuracy": 0.9063467800617218, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 2.683136412459721, | |
| "grad_norm": 0.16303109945042918, | |
| "learning_rate": 1.8202074092749754e-05, | |
| "loss": 0.305, | |
| "mean_token_accuracy": 0.9077015459537506, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.693877551020408, | |
| "grad_norm": 0.15810829618004768, | |
| "learning_rate": 1.8010228038174154e-05, | |
| "loss": 0.3052, | |
| "mean_token_accuracy": 0.9069934606552124, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 2.7046186895810957, | |
| "grad_norm": 0.1572557171403785, | |
| "learning_rate": 1.781921755090072e-05, | |
| "loss": 0.3029, | |
| "mean_token_accuracy": 0.9075438380241394, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.715359828141783, | |
| "grad_norm": 0.15752257331645983, | |
| "learning_rate": 1.7629059449259565e-05, | |
| "loss": 0.2978, | |
| "mean_token_accuracy": 0.9092587411403656, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 2.7261009667024707, | |
| "grad_norm": 0.155952159894427, | |
| "learning_rate": 1.7439770476528894e-05, | |
| "loss": 0.3025, | |
| "mean_token_accuracy": 0.9076742231845856, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.736842105263158, | |
| "grad_norm": 0.1578844927904049, | |
| "learning_rate": 1.7251367299460735e-05, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.9071321785449982, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 2.7475832438238452, | |
| "grad_norm": 0.15643506287974016, | |
| "learning_rate": 1.7063866506813515e-05, | |
| "loss": 0.3014, | |
| "mean_token_accuracy": 0.9080881893634796, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.758324382384533, | |
| "grad_norm": 0.16188588270959753, | |
| "learning_rate": 1.687728460789136e-05, | |
| "loss": 0.3029, | |
| "mean_token_accuracy": 0.9077995300292969, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 2.76906552094522, | |
| "grad_norm": 0.15914290923730717, | |
| "learning_rate": 1.669163803109049e-05, | |
| "loss": 0.3039, | |
| "mean_token_accuracy": 0.9069546043872834, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.7798066595059074, | |
| "grad_norm": 0.1531939594797534, | |
| "learning_rate": 1.650694312245272e-05, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.9082088112831116, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 2.790547798066595, | |
| "grad_norm": 0.14781879067353518, | |
| "learning_rate": 1.6323216144226218e-05, | |
| "loss": 0.3006, | |
| "mean_token_accuracy": 0.9082107961177825, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.8012889366272824, | |
| "grad_norm": 0.15796491533044651, | |
| "learning_rate": 1.614047327343358e-05, | |
| "loss": 0.3037, | |
| "mean_token_accuracy": 0.9073608994483948, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 2.8120300751879697, | |
| "grad_norm": 0.15342589995319128, | |
| "learning_rate": 1.5958730600447483e-05, | |
| "loss": 0.2982, | |
| "mean_token_accuracy": 0.9089851617813111, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.8227712137486574, | |
| "grad_norm": 0.15213716012041018, | |
| "learning_rate": 1.5778004127573954e-05, | |
| "loss": 0.3018, | |
| "mean_token_accuracy": 0.9082035005092621, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 2.833512352309345, | |
| "grad_norm": 0.15689344716817114, | |
| "learning_rate": 1.5598309767643355e-05, | |
| "loss": 0.3015, | |
| "mean_token_accuracy": 0.9079676389694213, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.8442534908700323, | |
| "grad_norm": 0.15560793520372218, | |
| "learning_rate": 1.5419663342609245e-05, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.9079644203186035, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 2.8549946294307196, | |
| "grad_norm": 0.15762229912652725, | |
| "learning_rate": 1.524208058215536e-05, | |
| "loss": 0.3004, | |
| "mean_token_accuracy": 0.9081010043621063, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.8657357679914073, | |
| "grad_norm": 0.1492296564674764, | |
| "learning_rate": 1.5065577122310532e-05, | |
| "loss": 0.3038, | |
| "mean_token_accuracy": 0.9071996510028839, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 2.8764769065520945, | |
| "grad_norm": 0.15341782949091415, | |
| "learning_rate": 1.4890168504071986e-05, | |
| "loss": 0.3013, | |
| "mean_token_accuracy": 0.9081071972846985, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.887218045112782, | |
| "grad_norm": 0.15319646472290932, | |
| "learning_rate": 1.4715870172036961e-05, | |
| "loss": 0.2985, | |
| "mean_token_accuracy": 0.9089631140232086, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 2.8979591836734695, | |
| "grad_norm": 0.155104806503441, | |
| "learning_rate": 1.4542697473042855e-05, | |
| "loss": 0.3015, | |
| "mean_token_accuracy": 0.9081062614917755, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.9087003222341568, | |
| "grad_norm": 0.14997293337059112, | |
| "learning_rate": 1.4370665654815896e-05, | |
| "loss": 0.3016, | |
| "mean_token_accuracy": 0.9077993631362915, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 2.919441460794844, | |
| "grad_norm": 0.15836235770159765, | |
| "learning_rate": 1.4199789864628612e-05, | |
| "loss": 0.3025, | |
| "mean_token_accuracy": 0.9076350510120392, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.9301825993555317, | |
| "grad_norm": 0.15239559171871817, | |
| "learning_rate": 1.403008514796616e-05, | |
| "loss": 0.3002, | |
| "mean_token_accuracy": 0.9083379149436951, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.940923737916219, | |
| "grad_norm": 0.15596273472793287, | |
| "learning_rate": 1.3861566447201524e-05, | |
| "loss": 0.2989, | |
| "mean_token_accuracy": 0.9084150791168213, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.9516648764769067, | |
| "grad_norm": 0.15225411451673648, | |
| "learning_rate": 1.3694248600279886e-05, | |
| "loss": 0.3002, | |
| "mean_token_accuracy": 0.9083608329296112, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 2.962406015037594, | |
| "grad_norm": 0.15301962057571455, | |
| "learning_rate": 1.3528146339412146e-05, | |
| "loss": 0.3021, | |
| "mean_token_accuracy": 0.9078640341758728, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.9731471535982816, | |
| "grad_norm": 0.15353042988029672, | |
| "learning_rate": 1.3363274289777773e-05, | |
| "loss": 0.2992, | |
| "mean_token_accuracy": 0.9084159135818481, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 2.983888292158969, | |
| "grad_norm": 0.1565397591962354, | |
| "learning_rate": 1.3199646968237039e-05, | |
| "loss": 0.3019, | |
| "mean_token_accuracy": 0.9077640831470489, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.994629430719656, | |
| "grad_norm": 0.15512948456888964, | |
| "learning_rate": 1.3037278782052863e-05, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.908068060874939, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 3.004296455424275, | |
| "grad_norm": 0.17611687143689977, | |
| "learning_rate": 1.2876184027622246e-05, | |
| "loss": 0.2837, | |
| "mean_token_accuracy": 0.9126578701866997, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.0150375939849625, | |
| "grad_norm": 0.23111560237426948, | |
| "learning_rate": 1.2716376889217446e-05, | |
| "loss": 0.2617, | |
| "mean_token_accuracy": 0.9192156255245209, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 3.0257787325456498, | |
| "grad_norm": 0.18975174760198046, | |
| "learning_rate": 1.2557871437737118e-05, | |
| "loss": 0.2613, | |
| "mean_token_accuracy": 0.9190598428249359, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 3.0365198711063375, | |
| "grad_norm": 0.17890147872689252, | |
| "learning_rate": 1.240068162946737e-05, | |
| "loss": 0.2584, | |
| "mean_token_accuracy": 0.91984983086586, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 3.0472610096670247, | |
| "grad_norm": 0.17315801700410546, | |
| "learning_rate": 1.2244821304852888e-05, | |
| "loss": 0.2557, | |
| "mean_token_accuracy": 0.9208986639976502, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 3.058002148227712, | |
| "grad_norm": 0.18517285000872677, | |
| "learning_rate": 1.2090304187278333e-05, | |
| "loss": 0.2604, | |
| "mean_token_accuracy": 0.9195366144180298, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 3.0687432867883997, | |
| "grad_norm": 0.16562595080311196, | |
| "learning_rate": 1.1937143881859981e-05, | |
| "loss": 0.2577, | |
| "mean_token_accuracy": 0.9203976690769196, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 3.079484425349087, | |
| "grad_norm": 0.17393143558685065, | |
| "learning_rate": 1.178535387424785e-05, | |
| "loss": 0.2574, | |
| "mean_token_accuracy": 0.9199799060821533, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 3.090225563909774, | |
| "grad_norm": 0.1645998735975408, | |
| "learning_rate": 1.163494752943822e-05, | |
| "loss": 0.2568, | |
| "mean_token_accuracy": 0.9204827189445496, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 3.100966702470462, | |
| "grad_norm": 0.16887936249293273, | |
| "learning_rate": 1.1485938090596918e-05, | |
| "loss": 0.2586, | |
| "mean_token_accuracy": 0.9197791635990142, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 3.111707841031149, | |
| "grad_norm": 0.17416795475633623, | |
| "learning_rate": 1.1338338677893261e-05, | |
| "loss": 0.2584, | |
| "mean_token_accuracy": 0.9200873076915741, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.122448979591837, | |
| "grad_norm": 0.1751550798568952, | |
| "learning_rate": 1.1192162287344806e-05, | |
| "loss": 0.2584, | |
| "mean_token_accuracy": 0.919762271642685, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 3.133190118152524, | |
| "grad_norm": 0.17592907174451083, | |
| "learning_rate": 1.1047421789673082e-05, | |
| "loss": 0.2597, | |
| "mean_token_accuracy": 0.9195389747619629, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 3.143931256713212, | |
| "grad_norm": 0.17327426676281532, | |
| "learning_rate": 1.0904129929170317e-05, | |
| "loss": 0.2556, | |
| "mean_token_accuracy": 0.9207349836826324, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 3.154672395273899, | |
| "grad_norm": 0.17320030271762202, | |
| "learning_rate": 1.0762299322577352e-05, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9203036367893219, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 3.1654135338345863, | |
| "grad_norm": 0.1722311431748818, | |
| "learning_rate": 1.0621942457972692e-05, | |
| "loss": 0.26, | |
| "mean_token_accuracy": 0.9195259928703308, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 3.176154672395274, | |
| "grad_norm": 0.17238717747260024, | |
| "learning_rate": 1.0483071693672959e-05, | |
| "loss": 0.2556, | |
| "mean_token_accuracy": 0.9209478557109833, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 3.1868958109559613, | |
| "grad_norm": 0.17188960001484813, | |
| "learning_rate": 1.0345699257144787e-05, | |
| "loss": 0.2599, | |
| "mean_token_accuracy": 0.9196560025215149, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 3.1976369495166486, | |
| "grad_norm": 0.16939046145995434, | |
| "learning_rate": 1.0209837243928163e-05, | |
| "loss": 0.2569, | |
| "mean_token_accuracy": 0.9202696919441223, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 3.2083780880773363, | |
| "grad_norm": 0.1643698296522669, | |
| "learning_rate": 1.0075497616571402e-05, | |
| "loss": 0.2613, | |
| "mean_token_accuracy": 0.9193197846412658, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 3.2191192266380235, | |
| "grad_norm": 0.17523553700537306, | |
| "learning_rate": 9.942692203577937e-06, | |
| "loss": 0.2617, | |
| "mean_token_accuracy": 0.9192265450954438, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.2298603651987112, | |
| "grad_norm": 0.17674127090736955, | |
| "learning_rate": 9.811432698364748e-06, | |
| "loss": 0.2611, | |
| "mean_token_accuracy": 0.9191824972629548, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 3.2406015037593985, | |
| "grad_norm": 0.17789280108349984, | |
| "learning_rate": 9.681730658232796e-06, | |
| "loss": 0.2631, | |
| "mean_token_accuracy": 0.9186322450637817, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 3.2513426423200857, | |
| "grad_norm": 0.17266428476273013, | |
| "learning_rate": 9.553597503349415e-06, | |
| "loss": 0.2582, | |
| "mean_token_accuracy": 0.9197676658630372, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 3.2620837808807734, | |
| "grad_norm": 0.1756023449894313, | |
| "learning_rate": 9.427044515742773e-06, | |
| "loss": 0.2583, | |
| "mean_token_accuracy": 0.9203043103218078, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 3.2728249194414607, | |
| "grad_norm": 0.1705185261901335, | |
| "learning_rate": 9.302082838308494e-06, | |
| "loss": 0.2588, | |
| "mean_token_accuracy": 0.9197465479373932, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 3.2835660580021484, | |
| "grad_norm": 0.1863220207081355, | |
| "learning_rate": 9.178723473828517e-06, | |
| "loss": 0.2592, | |
| "mean_token_accuracy": 0.919755893945694, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 3.2943071965628357, | |
| "grad_norm": 0.18144578655920904, | |
| "learning_rate": 9.05697728400236e-06, | |
| "loss": 0.2588, | |
| "mean_token_accuracy": 0.9201307475566864, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 3.305048335123523, | |
| "grad_norm": 0.17313846247861978, | |
| "learning_rate": 8.936854988490695e-06, | |
| "loss": 0.2627, | |
| "mean_token_accuracy": 0.9188291728496552, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 3.3157894736842106, | |
| "grad_norm": 0.1801914802446693, | |
| "learning_rate": 8.818367163971535e-06, | |
| "loss": 0.2557, | |
| "mean_token_accuracy": 0.9207710027694702, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 3.326530612244898, | |
| "grad_norm": 0.16994847146506772, | |
| "learning_rate": 8.701524243208935e-06, | |
| "loss": 0.2598, | |
| "mean_token_accuracy": 0.9194996774196624, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.3372717508055856, | |
| "grad_norm": 0.16955583517854705, | |
| "learning_rate": 8.586336514134416e-06, | |
| "loss": 0.2566, | |
| "mean_token_accuracy": 0.9205721557140351, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 3.348012889366273, | |
| "grad_norm": 0.17107585176009693, | |
| "learning_rate": 8.472814118941111e-06, | |
| "loss": 0.2594, | |
| "mean_token_accuracy": 0.9197823405265808, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 3.35875402792696, | |
| "grad_norm": 0.17753792836827956, | |
| "learning_rate": 8.360967053190748e-06, | |
| "loss": 0.2595, | |
| "mean_token_accuracy": 0.9195821940898895, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 3.369495166487648, | |
| "grad_norm": 0.1663276449550015, | |
| "learning_rate": 8.250805164933576e-06, | |
| "loss": 0.2576, | |
| "mean_token_accuracy": 0.9204757869243622, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 3.380236305048335, | |
| "grad_norm": 0.1727926922684143, | |
| "learning_rate": 8.142338153841204e-06, | |
| "loss": 0.2613, | |
| "mean_token_accuracy": 0.9192953467369079, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 3.3909774436090228, | |
| "grad_norm": 0.16245992891648223, | |
| "learning_rate": 8.035575570352586e-06, | |
| "loss": 0.2603, | |
| "mean_token_accuracy": 0.9196378767490387, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 3.40171858216971, | |
| "grad_norm": 0.1728382431801045, | |
| "learning_rate": 7.930526814833114e-06, | |
| "loss": 0.2642, | |
| "mean_token_accuracy": 0.9182481050491333, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 3.4124597207303973, | |
| "grad_norm": 0.17059237401574356, | |
| "learning_rate": 7.827201136746903e-06, | |
| "loss": 0.2608, | |
| "mean_token_accuracy": 0.9196362137794495, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 3.423200859291085, | |
| "grad_norm": 0.17006814998266018, | |
| "learning_rate": 7.725607633842397e-06, | |
| "loss": 0.262, | |
| "mean_token_accuracy": 0.9188037991523743, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 3.4339419978517722, | |
| "grad_norm": 0.17763939677962118, | |
| "learning_rate": 7.625755251351302e-06, | |
| "loss": 0.2571, | |
| "mean_token_accuracy": 0.92064950466156, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.4446831364124595, | |
| "grad_norm": 0.16880550111530884, | |
| "learning_rate": 7.52765278120101e-06, | |
| "loss": 0.2619, | |
| "mean_token_accuracy": 0.919091010093689, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 3.455424274973147, | |
| "grad_norm": 0.17470127038229266, | |
| "learning_rate": 7.431308861240405e-06, | |
| "loss": 0.2611, | |
| "mean_token_accuracy": 0.9194313704967498, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 3.4661654135338344, | |
| "grad_norm": 0.18361814009538877, | |
| "learning_rate": 7.336731974479366e-06, | |
| "loss": 0.2606, | |
| "mean_token_accuracy": 0.9194453060626984, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 3.476906552094522, | |
| "grad_norm": 0.16896194278522544, | |
| "learning_rate": 7.2439304483418275e-06, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9206092417240143, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 3.4876476906552094, | |
| "grad_norm": 0.16668518571688956, | |
| "learning_rate": 7.152912453932546e-06, | |
| "loss": 0.2595, | |
| "mean_token_accuracy": 0.9194850385189056, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 3.498388829215897, | |
| "grad_norm": 0.17386165770379072, | |
| "learning_rate": 7.063686005317651e-06, | |
| "loss": 0.2579, | |
| "mean_token_accuracy": 0.9201728105545044, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 3.5091299677765844, | |
| "grad_norm": 0.17090370338380814, | |
| "learning_rate": 6.976258958819e-06, | |
| "loss": 0.2583, | |
| "mean_token_accuracy": 0.9202900052070617, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 3.5198711063372716, | |
| "grad_norm": 0.1670190265056932, | |
| "learning_rate": 6.890639012322459e-06, | |
| "loss": 0.2547, | |
| "mean_token_accuracy": 0.9211665093898773, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 3.5306122448979593, | |
| "grad_norm": 0.17315381341418587, | |
| "learning_rate": 6.806833704600082e-06, | |
| "loss": 0.2561, | |
| "mean_token_accuracy": 0.9206245243549347, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 3.5413533834586466, | |
| "grad_norm": 0.17367639326439366, | |
| "learning_rate": 6.724850414646344e-06, | |
| "loss": 0.2554, | |
| "mean_token_accuracy": 0.9209690392017365, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.552094522019334, | |
| "grad_norm": 0.18356634723924625, | |
| "learning_rate": 6.644696361028427e-06, | |
| "loss": 0.2546, | |
| "mean_token_accuracy": 0.9211890578269959, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 3.5628356605800215, | |
| "grad_norm": 0.1686096868472299, | |
| "learning_rate": 6.566378601250625e-06, | |
| "loss": 0.258, | |
| "mean_token_accuracy": 0.9201010644435883, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 3.573576799140709, | |
| "grad_norm": 0.17097492830249045, | |
| "learning_rate": 6.489904031132919e-06, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9203424453735352, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 3.5843179377013965, | |
| "grad_norm": 0.1708922574820426, | |
| "learning_rate": 6.415279384203853e-06, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9202109038829803, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 3.5950590762620838, | |
| "grad_norm": 0.1772280034240442, | |
| "learning_rate": 6.3425112311075965e-06, | |
| "loss": 0.2563, | |
| "mean_token_accuracy": 0.9204185366630554, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 3.6058002148227715, | |
| "grad_norm": 0.17186880847864094, | |
| "learning_rate": 6.271605979025448e-06, | |
| "loss": 0.2555, | |
| "mean_token_accuracy": 0.9206036269664765, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 3.6165413533834587, | |
| "grad_norm": 0.16731807378864566, | |
| "learning_rate": 6.2025698711116535e-06, | |
| "loss": 0.2565, | |
| "mean_token_accuracy": 0.9205489337444306, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 3.627282491944146, | |
| "grad_norm": 0.17180713091530317, | |
| "learning_rate": 6.135408985943734e-06, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9204003512859344, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 3.6380236305048337, | |
| "grad_norm": 0.1761977177776313, | |
| "learning_rate": 6.07012923698724e-06, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9196424603462219, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 3.648764769065521, | |
| "grad_norm": 0.17221380858566646, | |
| "learning_rate": 6.006736372075093e-06, | |
| "loss": 0.2579, | |
| "mean_token_accuracy": 0.9200917899608612, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.659505907626208, | |
| "grad_norm": 0.16805608384415285, | |
| "learning_rate": 5.9452359729015004e-06, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9203401625156402, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 3.670247046186896, | |
| "grad_norm": 0.1736765217184823, | |
| "learning_rate": 5.8856334545304676e-06, | |
| "loss": 0.2574, | |
| "mean_token_accuracy": 0.9203644514083862, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 3.680988184747583, | |
| "grad_norm": 0.1726788133620247, | |
| "learning_rate": 5.8279340649190244e-06, | |
| "loss": 0.2611, | |
| "mean_token_accuracy": 0.9194235980510712, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 3.6917293233082704, | |
| "grad_norm": 0.16707078529197217, | |
| "learning_rate": 5.7721428844551425e-06, | |
| "loss": 0.2611, | |
| "mean_token_accuracy": 0.9193582713603974, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 3.702470461868958, | |
| "grad_norm": 0.17182290992101512, | |
| "learning_rate": 5.7182648255104065e-06, | |
| "loss": 0.2596, | |
| "mean_token_accuracy": 0.9196705460548401, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 3.7132116004296454, | |
| "grad_norm": 0.17419790279430714, | |
| "learning_rate": 5.666304632007487e-06, | |
| "loss": 0.2595, | |
| "mean_token_accuracy": 0.9197326540946961, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 3.723952738990333, | |
| "grad_norm": 0.18041100180688655, | |
| "learning_rate": 5.616266879002444e-06, | |
| "loss": 0.2575, | |
| "mean_token_accuracy": 0.9202880382537841, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 3.7346938775510203, | |
| "grad_norm": 0.16636878690891047, | |
| "learning_rate": 5.568155972281892e-06, | |
| "loss": 0.2582, | |
| "mean_token_accuracy": 0.9199542105197906, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 3.745435016111708, | |
| "grad_norm": 0.17005943549418737, | |
| "learning_rate": 5.521976147975078e-06, | |
| "loss": 0.2575, | |
| "mean_token_accuracy": 0.9207047700881958, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 3.7561761546723953, | |
| "grad_norm": 0.17142683208534373, | |
| "learning_rate": 5.477731472180884e-06, | |
| "loss": 0.2578, | |
| "mean_token_accuracy": 0.9200609147548675, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.7669172932330826, | |
| "grad_norm": 0.19597039412044637, | |
| "learning_rate": 5.4354258406098275e-06, | |
| "loss": 0.2605, | |
| "mean_token_accuracy": 0.9196163058280945, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 3.7776584317937703, | |
| "grad_norm": 0.1891144335762954, | |
| "learning_rate": 5.395062978241028e-06, | |
| "loss": 0.256, | |
| "mean_token_accuracy": 0.9203970789909363, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 3.7883995703544575, | |
| "grad_norm": 0.1734382570098929, | |
| "learning_rate": 5.356646438994236e-06, | |
| "loss": 0.2562, | |
| "mean_token_accuracy": 0.9206745564937592, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 3.7991407089151448, | |
| "grad_norm": 0.167509733493585, | |
| "learning_rate": 5.3201796054169155e-06, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.919745409488678, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 3.8098818474758325, | |
| "grad_norm": 0.1758205628466223, | |
| "learning_rate": 5.285665688386408e-06, | |
| "loss": 0.2554, | |
| "mean_token_accuracy": 0.9208223819732666, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 3.8206229860365197, | |
| "grad_norm": 0.16934855068248722, | |
| "learning_rate": 5.253107726827213e-06, | |
| "loss": 0.2553, | |
| "mean_token_accuracy": 0.9208275616168976, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 3.8313641245972074, | |
| "grad_norm": 0.17212203700590173, | |
| "learning_rate": 5.222508587443419e-06, | |
| "loss": 0.2558, | |
| "mean_token_accuracy": 0.9208298087120056, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 3.8421052631578947, | |
| "grad_norm": 0.17351309384632746, | |
| "learning_rate": 5.193870964466299e-06, | |
| "loss": 0.2572, | |
| "mean_token_accuracy": 0.9206307530403137, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 3.8528464017185824, | |
| "grad_norm": 0.17423994454268188, | |
| "learning_rate": 5.167197379417072e-06, | |
| "loss": 0.2563, | |
| "mean_token_accuracy": 0.9204454243183136, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 3.8635875402792696, | |
| "grad_norm": 0.17091404042612268, | |
| "learning_rate": 5.142490180884889e-06, | |
| "loss": 0.2566, | |
| "mean_token_accuracy": 0.920625650882721, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.874328678839957, | |
| "grad_norm": 0.17402338382213903, | |
| "learning_rate": 5.119751544320045e-06, | |
| "loss": 0.2548, | |
| "mean_token_accuracy": 0.9212319254875183, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 3.8850698174006446, | |
| "grad_norm": 0.17785847377734187, | |
| "learning_rate": 5.098983471842435e-06, | |
| "loss": 0.2582, | |
| "mean_token_accuracy": 0.9204130828380584, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 3.895810955961332, | |
| "grad_norm": 0.17476387276762337, | |
| "learning_rate": 5.080187792065258e-06, | |
| "loss": 0.2576, | |
| "mean_token_accuracy": 0.9203925788402557, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.906552094522019, | |
| "grad_norm": 0.17401606856867693, | |
| "learning_rate": 5.063366159934019e-06, | |
| "loss": 0.257, | |
| "mean_token_accuracy": 0.9207073092460633, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 3.917293233082707, | |
| "grad_norm": 0.1709751716211779, | |
| "learning_rate": 5.04852005658081e-06, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9206726491451264, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 3.928034371643394, | |
| "grad_norm": 0.17944667291264363, | |
| "learning_rate": 5.035650789193893e-06, | |
| "loss": 0.2583, | |
| "mean_token_accuracy": 0.919947350025177, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 3.938775510204082, | |
| "grad_norm": 0.17075839857976619, | |
| "learning_rate": 5.024759490902604e-06, | |
| "loss": 0.2606, | |
| "mean_token_accuracy": 0.9192731857299805, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 3.949516648764769, | |
| "grad_norm": 0.1725574446830871, | |
| "learning_rate": 5.015847120677588e-06, | |
| "loss": 0.2585, | |
| "mean_token_accuracy": 0.9199050843715668, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 3.9602577873254567, | |
| "grad_norm": 0.17546758649223276, | |
| "learning_rate": 5.008914463246362e-06, | |
| "loss": 0.2586, | |
| "mean_token_accuracy": 0.920122253894806, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 3.970998925886144, | |
| "grad_norm": 0.16820021081330186, | |
| "learning_rate": 5.0039621290242065e-06, | |
| "loss": 0.2583, | |
| "mean_token_accuracy": 0.9200729191303253, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.9817400644468313, | |
| "grad_norm": 0.17517771341096255, | |
| "learning_rate": 5.000990554060436e-06, | |
| "loss": 0.2604, | |
| "mean_token_accuracy": 0.9193271338939667, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 3.992481203007519, | |
| "grad_norm": 0.17294557291581655, | |
| "learning_rate": 5e-06, | |
| "loss": 0.2556, | |
| "mean_token_accuracy": 0.920825207233429, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 3.992481203007519, | |
| "step": 1860, | |
| "total_flos": 966947082862592.0, | |
| "train_loss": 0.34282420668550717, | |
| "train_runtime": 10626.5662, | |
| "train_samples_per_second": 2.802, | |
| "train_steps_per_second": 0.175 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1860, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 966947082862592.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |