| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.545595054095827, | |
| "eval_steps": 500, | |
| "global_step": 5000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0030911901081916537, | |
| "grad_norm": 3.058769941329956, | |
| "learning_rate": 1.9972179289026277e-05, | |
| "loss": 5.0209, | |
| "mean_token_accuracy": 0.21833103336393833, | |
| "num_tokens": 28941.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0061823802163833074, | |
| "grad_norm": 2.485980749130249, | |
| "learning_rate": 1.994126738794436e-05, | |
| "loss": 4.7631, | |
| "mean_token_accuracy": 0.23807235918939113, | |
| "num_tokens": 60990.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.00927357032457496, | |
| "grad_norm": 5.288544178009033, | |
| "learning_rate": 1.9910355486862444e-05, | |
| "loss": 4.6799, | |
| "mean_token_accuracy": 0.24858475103974342, | |
| "num_tokens": 90602.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.012364760432766615, | |
| "grad_norm": 3.226719379425049, | |
| "learning_rate": 1.9879443585780528e-05, | |
| "loss": 4.5883, | |
| "mean_token_accuracy": 0.2555678006261587, | |
| "num_tokens": 121132.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.015455950540958269, | |
| "grad_norm": 2.027597188949585, | |
| "learning_rate": 1.984853168469861e-05, | |
| "loss": 4.5125, | |
| "mean_token_accuracy": 0.26637452803552153, | |
| "num_tokens": 154244.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.01854714064914992, | |
| "grad_norm": 1.6715962886810303, | |
| "learning_rate": 1.9817619783616695e-05, | |
| "loss": 4.4224, | |
| "mean_token_accuracy": 0.27198897041380404, | |
| "num_tokens": 185618.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.021638330757341576, | |
| "grad_norm": 3.0472540855407715, | |
| "learning_rate": 1.9786707882534775e-05, | |
| "loss": 4.3464, | |
| "mean_token_accuracy": 0.2801041007041931, | |
| "num_tokens": 220138.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.02472952086553323, | |
| "grad_norm": 2.74045729637146, | |
| "learning_rate": 1.9755795981452862e-05, | |
| "loss": 4.4569, | |
| "mean_token_accuracy": 0.2733523309230804, | |
| "num_tokens": 252919.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.027820710973724884, | |
| "grad_norm": 1.9274535179138184, | |
| "learning_rate": 1.9724884080370946e-05, | |
| "loss": 4.2569, | |
| "mean_token_accuracy": 0.2858310595154762, | |
| "num_tokens": 284739.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.030911901081916538, | |
| "grad_norm": 1.906964898109436, | |
| "learning_rate": 1.9693972179289026e-05, | |
| "loss": 4.2144, | |
| "mean_token_accuracy": 0.2953581381589174, | |
| "num_tokens": 315186.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03400309119010819, | |
| "grad_norm": 2.042823314666748, | |
| "learning_rate": 1.966306027820711e-05, | |
| "loss": 4.2595, | |
| "mean_token_accuracy": 0.287172843888402, | |
| "num_tokens": 347507.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.03709428129829984, | |
| "grad_norm": 1.9889057874679565, | |
| "learning_rate": 1.9632148377125197e-05, | |
| "loss": 4.1643, | |
| "mean_token_accuracy": 0.30306958928704264, | |
| "num_tokens": 380331.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.0401854714064915, | |
| "grad_norm": 1.7952933311462402, | |
| "learning_rate": 1.9601236476043277e-05, | |
| "loss": 4.1334, | |
| "mean_token_accuracy": 0.3080880597233772, | |
| "num_tokens": 410947.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.04327666151468315, | |
| "grad_norm": 3.8508658409118652, | |
| "learning_rate": 1.957032457496136e-05, | |
| "loss": 4.0262, | |
| "mean_token_accuracy": 0.3118089348077774, | |
| "num_tokens": 444130.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.04636785162287481, | |
| "grad_norm": 31.280546188354492, | |
| "learning_rate": 1.9539412673879444e-05, | |
| "loss": 3.9889, | |
| "mean_token_accuracy": 0.32304659858345985, | |
| "num_tokens": 474570.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.04945904173106646, | |
| "grad_norm": 2.1569416522979736, | |
| "learning_rate": 1.950850077279753e-05, | |
| "loss": 4.1053, | |
| "mean_token_accuracy": 0.31258094161748884, | |
| "num_tokens": 505047.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.05255023183925812, | |
| "grad_norm": 1.808377981185913, | |
| "learning_rate": 1.947758887171561e-05, | |
| "loss": 3.9711, | |
| "mean_token_accuracy": 0.33312112018465995, | |
| "num_tokens": 533187.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.05564142194744977, | |
| "grad_norm": 2.8038811683654785, | |
| "learning_rate": 1.9446676970633695e-05, | |
| "loss": 3.961, | |
| "mean_token_accuracy": 0.33416116759181025, | |
| "num_tokens": 565598.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.05873261205564142, | |
| "grad_norm": 8.930831909179688, | |
| "learning_rate": 1.941576506955178e-05, | |
| "loss": 4.0107, | |
| "mean_token_accuracy": 0.3261258576065302, | |
| "num_tokens": 598505.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.061823802163833076, | |
| "grad_norm": 4.706038475036621, | |
| "learning_rate": 1.9384853168469862e-05, | |
| "loss": 3.9231, | |
| "mean_token_accuracy": 0.33582728281617164, | |
| "num_tokens": 630284.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06491499227202473, | |
| "grad_norm": 2.9916040897369385, | |
| "learning_rate": 1.9353941267387946e-05, | |
| "loss": 3.9407, | |
| "mean_token_accuracy": 0.32844844460487366, | |
| "num_tokens": 660513.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.06800618238021638, | |
| "grad_norm": 2.763737678527832, | |
| "learning_rate": 1.932302936630603e-05, | |
| "loss": 4.0479, | |
| "mean_token_accuracy": 0.3248734712600708, | |
| "num_tokens": 693154.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.07109737248840804, | |
| "grad_norm": 3.656487464904785, | |
| "learning_rate": 1.9292117465224113e-05, | |
| "loss": 3.7843, | |
| "mean_token_accuracy": 0.34686593189835546, | |
| "num_tokens": 724686.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.07418856259659969, | |
| "grad_norm": 2.6140244007110596, | |
| "learning_rate": 1.9261205564142196e-05, | |
| "loss": 3.9261, | |
| "mean_token_accuracy": 0.3310456670820713, | |
| "num_tokens": 755625.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.07727975270479134, | |
| "grad_norm": 1.595627784729004, | |
| "learning_rate": 1.923029366306028e-05, | |
| "loss": 3.841, | |
| "mean_token_accuracy": 0.34514380544424056, | |
| "num_tokens": 785478.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.080370942812983, | |
| "grad_norm": 2.0485758781433105, | |
| "learning_rate": 1.9199381761978363e-05, | |
| "loss": 3.7596, | |
| "mean_token_accuracy": 0.35417362824082377, | |
| "num_tokens": 816351.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.08346213292117466, | |
| "grad_norm": 1.7564281225204468, | |
| "learning_rate": 1.9168469860896447e-05, | |
| "loss": 3.7927, | |
| "mean_token_accuracy": 0.3487237967550755, | |
| "num_tokens": 849774.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.0865533230293663, | |
| "grad_norm": 1.5662060976028442, | |
| "learning_rate": 1.913755795981453e-05, | |
| "loss": 3.7821, | |
| "mean_token_accuracy": 0.3511029303073883, | |
| "num_tokens": 881474.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.08964451313755796, | |
| "grad_norm": 2.539433002471924, | |
| "learning_rate": 1.9106646058732614e-05, | |
| "loss": 3.8613, | |
| "mean_token_accuracy": 0.3471171148121357, | |
| "num_tokens": 913459.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.09273570324574962, | |
| "grad_norm": 4.281046390533447, | |
| "learning_rate": 1.9075734157650694e-05, | |
| "loss": 3.8436, | |
| "mean_token_accuracy": 0.3475985363125801, | |
| "num_tokens": 947091.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.09582689335394126, | |
| "grad_norm": 2.379791736602783, | |
| "learning_rate": 1.904482225656878e-05, | |
| "loss": 3.8309, | |
| "mean_token_accuracy": 0.34195478409528735, | |
| "num_tokens": 983471.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.09891808346213292, | |
| "grad_norm": 2.4176697731018066, | |
| "learning_rate": 1.9013910355486865e-05, | |
| "loss": 3.75, | |
| "mean_token_accuracy": 0.3633588753640652, | |
| "num_tokens": 1011575.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.10200927357032458, | |
| "grad_norm": 3.375523328781128, | |
| "learning_rate": 1.898299845440495e-05, | |
| "loss": 3.7556, | |
| "mean_token_accuracy": 0.36073010191321375, | |
| "num_tokens": 1042031.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.10510046367851623, | |
| "grad_norm": 5.099122524261475, | |
| "learning_rate": 1.895208655332303e-05, | |
| "loss": 3.6861, | |
| "mean_token_accuracy": 0.36301063373684883, | |
| "num_tokens": 1074689.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.10819165378670788, | |
| "grad_norm": 1.4775930643081665, | |
| "learning_rate": 1.8921174652241116e-05, | |
| "loss": 3.6744, | |
| "mean_token_accuracy": 0.3681299857795238, | |
| "num_tokens": 1105719.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.11128284389489954, | |
| "grad_norm": 3.931447744369507, | |
| "learning_rate": 1.88902627511592e-05, | |
| "loss": 3.6248, | |
| "mean_token_accuracy": 0.37062914595007895, | |
| "num_tokens": 1136185.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.1143740340030912, | |
| "grad_norm": 2.6153130531311035, | |
| "learning_rate": 1.885935085007728e-05, | |
| "loss": 3.6971, | |
| "mean_token_accuracy": 0.35686987787485125, | |
| "num_tokens": 1166461.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.11746522411128284, | |
| "grad_norm": 3.0943849086761475, | |
| "learning_rate": 1.8828438948995363e-05, | |
| "loss": 3.7057, | |
| "mean_token_accuracy": 0.36670113652944564, | |
| "num_tokens": 1194661.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.1205564142194745, | |
| "grad_norm": 1.760920524597168, | |
| "learning_rate": 1.879752704791345e-05, | |
| "loss": 3.6806, | |
| "mean_token_accuracy": 0.366318603605032, | |
| "num_tokens": 1224670.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.12364760432766615, | |
| "grad_norm": 1.8976062536239624, | |
| "learning_rate": 1.8766615146831534e-05, | |
| "loss": 3.7318, | |
| "mean_token_accuracy": 0.36500929966568946, | |
| "num_tokens": 1252830.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.1267387944358578, | |
| "grad_norm": 1.950358510017395, | |
| "learning_rate": 1.8735703245749614e-05, | |
| "loss": 3.7106, | |
| "mean_token_accuracy": 0.3675060346722603, | |
| "num_tokens": 1285546.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.12982998454404945, | |
| "grad_norm": 2.707167148590088, | |
| "learning_rate": 1.8704791344667697e-05, | |
| "loss": 3.6688, | |
| "mean_token_accuracy": 0.37156677842140196, | |
| "num_tokens": 1316466.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.13292117465224113, | |
| "grad_norm": 2.084510564804077, | |
| "learning_rate": 1.8673879443585784e-05, | |
| "loss": 3.6758, | |
| "mean_token_accuracy": 0.3684880450367928, | |
| "num_tokens": 1349121.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.13601236476043277, | |
| "grad_norm": 2.2626636028289795, | |
| "learning_rate": 1.8642967542503865e-05, | |
| "loss": 3.7214, | |
| "mean_token_accuracy": 0.35720510333776473, | |
| "num_tokens": 1384366.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.1391035548686244, | |
| "grad_norm": 2.4145290851593018, | |
| "learning_rate": 1.8612055641421948e-05, | |
| "loss": 3.7261, | |
| "mean_token_accuracy": 0.36429562568664553, | |
| "num_tokens": 1415357.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.14219474497681608, | |
| "grad_norm": 2.7409212589263916, | |
| "learning_rate": 1.8581143740340032e-05, | |
| "loss": 3.7368, | |
| "mean_token_accuracy": 0.3619597226381302, | |
| "num_tokens": 1445641.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.14528593508500773, | |
| "grad_norm": 4.5937275886535645, | |
| "learning_rate": 1.8550231839258115e-05, | |
| "loss": 3.6911, | |
| "mean_token_accuracy": 0.3683665543794632, | |
| "num_tokens": 1475360.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.14837712519319937, | |
| "grad_norm": 12.00837516784668, | |
| "learning_rate": 1.85193199381762e-05, | |
| "loss": 3.6899, | |
| "mean_token_accuracy": 0.3648961283266544, | |
| "num_tokens": 1506048.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.15146831530139104, | |
| "grad_norm": 2.5790181159973145, | |
| "learning_rate": 1.8488408037094283e-05, | |
| "loss": 3.6948, | |
| "mean_token_accuracy": 0.36360194012522695, | |
| "num_tokens": 1539998.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.1545595054095827, | |
| "grad_norm": 1.8515182733535767, | |
| "learning_rate": 1.8457496136012366e-05, | |
| "loss": 3.65, | |
| "mean_token_accuracy": 0.36655392646789553, | |
| "num_tokens": 1573641.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.15765069551777433, | |
| "grad_norm": 2.3960251808166504, | |
| "learning_rate": 1.842658423493045e-05, | |
| "loss": 3.58, | |
| "mean_token_accuracy": 0.373273029923439, | |
| "num_tokens": 1605367.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.160741885625966, | |
| "grad_norm": 2.578730821609497, | |
| "learning_rate": 1.8395672333848533e-05, | |
| "loss": 3.6695, | |
| "mean_token_accuracy": 0.368588350713253, | |
| "num_tokens": 1638052.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.16383307573415765, | |
| "grad_norm": 1.9368691444396973, | |
| "learning_rate": 1.8364760432766617e-05, | |
| "loss": 3.6363, | |
| "mean_token_accuracy": 0.3681382529437542, | |
| "num_tokens": 1672043.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.16692426584234932, | |
| "grad_norm": 3.562593698501587, | |
| "learning_rate": 1.83338485316847e-05, | |
| "loss": 3.5386, | |
| "mean_token_accuracy": 0.3801154658198357, | |
| "num_tokens": 1704817.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.17001545595054096, | |
| "grad_norm": 1.564929723739624, | |
| "learning_rate": 1.8302936630602784e-05, | |
| "loss": 3.6191, | |
| "mean_token_accuracy": 0.36746986880898475, | |
| "num_tokens": 1737297.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.1731066460587326, | |
| "grad_norm": 1.4315626621246338, | |
| "learning_rate": 1.8272024729520868e-05, | |
| "loss": 3.592, | |
| "mean_token_accuracy": 0.3701712526381016, | |
| "num_tokens": 1770204.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.17619783616692428, | |
| "grad_norm": 6.3183746337890625, | |
| "learning_rate": 1.824111282843895e-05, | |
| "loss": 3.6281, | |
| "mean_token_accuracy": 0.37336429879069327, | |
| "num_tokens": 1800581.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.17928902627511592, | |
| "grad_norm": 1.3608078956604004, | |
| "learning_rate": 1.8210200927357035e-05, | |
| "loss": 3.5644, | |
| "mean_token_accuracy": 0.37816725075244906, | |
| "num_tokens": 1834564.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.18238021638330756, | |
| "grad_norm": 11.443370819091797, | |
| "learning_rate": 1.817928902627512e-05, | |
| "loss": 3.6134, | |
| "mean_token_accuracy": 0.37460487633943557, | |
| "num_tokens": 1866113.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.18547140649149924, | |
| "grad_norm": 2.1869001388549805, | |
| "learning_rate": 1.8148377125193202e-05, | |
| "loss": 3.6331, | |
| "mean_token_accuracy": 0.3722210742533207, | |
| "num_tokens": 1900627.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.18856259659969088, | |
| "grad_norm": 1.8551387786865234, | |
| "learning_rate": 1.8117465224111282e-05, | |
| "loss": 3.5707, | |
| "mean_token_accuracy": 0.3772866874933243, | |
| "num_tokens": 1932988.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.19165378670788252, | |
| "grad_norm": 5.528620719909668, | |
| "learning_rate": 1.808655332302937e-05, | |
| "loss": 3.6475, | |
| "mean_token_accuracy": 0.3652547873556614, | |
| "num_tokens": 1962031.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.1947449768160742, | |
| "grad_norm": 3.566514253616333, | |
| "learning_rate": 1.8055641421947453e-05, | |
| "loss": 3.5441, | |
| "mean_token_accuracy": 0.3836688004434109, | |
| "num_tokens": 1995070.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.19783616692426584, | |
| "grad_norm": 1.9582892656326294, | |
| "learning_rate": 1.8024729520865533e-05, | |
| "loss": 3.5473, | |
| "mean_token_accuracy": 0.3757788948714733, | |
| "num_tokens": 2024801.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.2009273570324575, | |
| "grad_norm": 1.7483699321746826, | |
| "learning_rate": 1.7993817619783616e-05, | |
| "loss": 3.6133, | |
| "mean_token_accuracy": 0.37615733668208123, | |
| "num_tokens": 2054131.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.20401854714064915, | |
| "grad_norm": 1.9855698347091675, | |
| "learning_rate": 1.7962905718701703e-05, | |
| "loss": 3.5724, | |
| "mean_token_accuracy": 0.37552602738142016, | |
| "num_tokens": 2086218.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.2071097372488408, | |
| "grad_norm": 2.380608558654785, | |
| "learning_rate": 1.7931993817619787e-05, | |
| "loss": 3.5476, | |
| "mean_token_accuracy": 0.37423405200243, | |
| "num_tokens": 2121389.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.21020092735703247, | |
| "grad_norm": 1.5300630331039429, | |
| "learning_rate": 1.7901081916537867e-05, | |
| "loss": 3.668, | |
| "mean_token_accuracy": 0.3656138554215431, | |
| "num_tokens": 2152513.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.2132921174652241, | |
| "grad_norm": 1.6176111698150635, | |
| "learning_rate": 1.787017001545595e-05, | |
| "loss": 3.6087, | |
| "mean_token_accuracy": 0.37140627652406694, | |
| "num_tokens": 2184920.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.21638330757341576, | |
| "grad_norm": 1.401524305343628, | |
| "learning_rate": 1.7839258114374038e-05, | |
| "loss": 3.4881, | |
| "mean_token_accuracy": 0.3919842541217804, | |
| "num_tokens": 2214174.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.21947449768160743, | |
| "grad_norm": 1.4391794204711914, | |
| "learning_rate": 1.7808346213292118e-05, | |
| "loss": 3.5622, | |
| "mean_token_accuracy": 0.37740132212638855, | |
| "num_tokens": 2247593.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.22256568778979907, | |
| "grad_norm": 5.364869117736816, | |
| "learning_rate": 1.77774343122102e-05, | |
| "loss": 3.5855, | |
| "mean_token_accuracy": 0.37872459217905996, | |
| "num_tokens": 2277810.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.22565687789799072, | |
| "grad_norm": 1.3683693408966064, | |
| "learning_rate": 1.7746522411128285e-05, | |
| "loss": 3.4653, | |
| "mean_token_accuracy": 0.38120819330215455, | |
| "num_tokens": 2309457.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.2287480680061824, | |
| "grad_norm": 1.3637374639511108, | |
| "learning_rate": 1.771561051004637e-05, | |
| "loss": 3.5131, | |
| "mean_token_accuracy": 0.3864184685051441, | |
| "num_tokens": 2339852.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.23183925811437403, | |
| "grad_norm": 2.7388429641723633, | |
| "learning_rate": 1.7684698608964452e-05, | |
| "loss": 3.573, | |
| "mean_token_accuracy": 0.3795412413775921, | |
| "num_tokens": 2371444.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.23493044822256567, | |
| "grad_norm": 9.798398971557617, | |
| "learning_rate": 1.7653786707882536e-05, | |
| "loss": 3.5441, | |
| "mean_token_accuracy": 0.38080229982733727, | |
| "num_tokens": 2401769.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.23802163833075735, | |
| "grad_norm": 2.1416878700256348, | |
| "learning_rate": 1.762287480680062e-05, | |
| "loss": 3.5043, | |
| "mean_token_accuracy": 0.3838089659810066, | |
| "num_tokens": 2432410.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.241112828438949, | |
| "grad_norm": 2.211545467376709, | |
| "learning_rate": 1.7591962905718703e-05, | |
| "loss": 3.5341, | |
| "mean_token_accuracy": 0.388753118366003, | |
| "num_tokens": 2462641.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.24420401854714066, | |
| "grad_norm": 5.351387023925781, | |
| "learning_rate": 1.7561051004636787e-05, | |
| "loss": 3.5031, | |
| "mean_token_accuracy": 0.3821559719741344, | |
| "num_tokens": 2496459.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.2472952086553323, | |
| "grad_norm": 2.3877508640289307, | |
| "learning_rate": 1.753013910355487e-05, | |
| "loss": 3.5589, | |
| "mean_token_accuracy": 0.3789012677967548, | |
| "num_tokens": 2526708.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.250386398763524, | |
| "grad_norm": 1.2962738275527954, | |
| "learning_rate": 1.7499227202472954e-05, | |
| "loss": 3.505, | |
| "mean_token_accuracy": 0.3823570780456066, | |
| "num_tokens": 2561229.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.2534775888717156, | |
| "grad_norm": 1.8147461414337158, | |
| "learning_rate": 1.7468315301391037e-05, | |
| "loss": 3.4861, | |
| "mean_token_accuracy": 0.3802278622984886, | |
| "num_tokens": 2592914.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.25656877897990726, | |
| "grad_norm": 3.4690439701080322, | |
| "learning_rate": 1.743740340030912e-05, | |
| "loss": 3.5202, | |
| "mean_token_accuracy": 0.38112854287028314, | |
| "num_tokens": 2627185.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.2596599690880989, | |
| "grad_norm": 4.353795051574707, | |
| "learning_rate": 1.7406491499227205e-05, | |
| "loss": 3.5667, | |
| "mean_token_accuracy": 0.37200469225645066, | |
| "num_tokens": 2660839.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.26275115919629055, | |
| "grad_norm": 2.6319100856781006, | |
| "learning_rate": 1.7375579598145288e-05, | |
| "loss": 3.464, | |
| "mean_token_accuracy": 0.38627258986234664, | |
| "num_tokens": 2690042.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.26584234930448225, | |
| "grad_norm": 1.6525287628173828, | |
| "learning_rate": 1.7344667697063372e-05, | |
| "loss": 3.5616, | |
| "mean_token_accuracy": 0.3763453342020512, | |
| "num_tokens": 2724354.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.2689335394126739, | |
| "grad_norm": 1.9737221002578735, | |
| "learning_rate": 1.7313755795981455e-05, | |
| "loss": 3.5147, | |
| "mean_token_accuracy": 0.38340551406145096, | |
| "num_tokens": 2755591.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.27202472952086554, | |
| "grad_norm": 2.5324320793151855, | |
| "learning_rate": 1.7282843894899536e-05, | |
| "loss": 3.4227, | |
| "mean_token_accuracy": 0.3909419260919094, | |
| "num_tokens": 2786566.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.2751159196290572, | |
| "grad_norm": 3.3844659328460693, | |
| "learning_rate": 1.7251931993817623e-05, | |
| "loss": 3.5561, | |
| "mean_token_accuracy": 0.37579271346330645, | |
| "num_tokens": 2814171.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.2782071097372488, | |
| "grad_norm": 2.7620153427124023, | |
| "learning_rate": 1.7221020092735706e-05, | |
| "loss": 3.5042, | |
| "mean_token_accuracy": 0.3822973191738129, | |
| "num_tokens": 2844402.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.28129829984544047, | |
| "grad_norm": 2.089118719100952, | |
| "learning_rate": 1.7190108191653786e-05, | |
| "loss": 3.4583, | |
| "mean_token_accuracy": 0.39270223304629326, | |
| "num_tokens": 2873728.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.28438948995363217, | |
| "grad_norm": 2.496480941772461, | |
| "learning_rate": 1.715919629057187e-05, | |
| "loss": 3.4466, | |
| "mean_token_accuracy": 0.3897694177925587, | |
| "num_tokens": 2905000.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.2874806800618238, | |
| "grad_norm": 2.162785053253174, | |
| "learning_rate": 1.7128284389489957e-05, | |
| "loss": 3.4075, | |
| "mean_token_accuracy": 0.39374888986349105, | |
| "num_tokens": 2935706.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.29057187017001546, | |
| "grad_norm": 1.5756185054779053, | |
| "learning_rate": 1.709737248840804e-05, | |
| "loss": 3.4396, | |
| "mean_token_accuracy": 0.38780966177582743, | |
| "num_tokens": 2966477.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.2936630602782071, | |
| "grad_norm": 5.708625316619873, | |
| "learning_rate": 1.706646058732612e-05, | |
| "loss": 3.4185, | |
| "mean_token_accuracy": 0.39326486811041833, | |
| "num_tokens": 2996439.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.29675425038639874, | |
| "grad_norm": 2.8689119815826416, | |
| "learning_rate": 1.7035548686244204e-05, | |
| "loss": 3.468, | |
| "mean_token_accuracy": 0.3818407289683819, | |
| "num_tokens": 3026332.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.29984544049459044, | |
| "grad_norm": 4.020405292510986, | |
| "learning_rate": 1.700463678516229e-05, | |
| "loss": 3.4943, | |
| "mean_token_accuracy": 0.3888055384159088, | |
| "num_tokens": 3057606.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.3029366306027821, | |
| "grad_norm": 1.8991873264312744, | |
| "learning_rate": 1.697372488408037e-05, | |
| "loss": 3.4939, | |
| "mean_token_accuracy": 0.38414665684103966, | |
| "num_tokens": 3089449.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.30602782071097373, | |
| "grad_norm": 13.78397274017334, | |
| "learning_rate": 1.6942812982998455e-05, | |
| "loss": 3.5487, | |
| "mean_token_accuracy": 0.3769320294260979, | |
| "num_tokens": 3119537.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.3091190108191654, | |
| "grad_norm": 1.599820613861084, | |
| "learning_rate": 1.691190108191654e-05, | |
| "loss": 3.465, | |
| "mean_token_accuracy": 0.3884730890393257, | |
| "num_tokens": 3151905.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.312210200927357, | |
| "grad_norm": 6.015178680419922, | |
| "learning_rate": 1.6880989180834622e-05, | |
| "loss": 3.5043, | |
| "mean_token_accuracy": 0.38035417571663854, | |
| "num_tokens": 3184277.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.31530139103554866, | |
| "grad_norm": 1.854093313217163, | |
| "learning_rate": 1.6850077279752706e-05, | |
| "loss": 3.4233, | |
| "mean_token_accuracy": 0.389019088447094, | |
| "num_tokens": 3217455.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.31839258114374036, | |
| "grad_norm": 3.024531126022339, | |
| "learning_rate": 1.681916537867079e-05, | |
| "loss": 3.4768, | |
| "mean_token_accuracy": 0.39151332527399063, | |
| "num_tokens": 3251326.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.321483771251932, | |
| "grad_norm": 1.8012628555297852, | |
| "learning_rate": 1.6788253477588873e-05, | |
| "loss": 3.4955, | |
| "mean_token_accuracy": 0.3855120025575161, | |
| "num_tokens": 3283488.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.32457496136012365, | |
| "grad_norm": 1.4446407556533813, | |
| "learning_rate": 1.6757341576506957e-05, | |
| "loss": 3.4023, | |
| "mean_token_accuracy": 0.3883494645357132, | |
| "num_tokens": 3315970.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.3276661514683153, | |
| "grad_norm": 2.132194995880127, | |
| "learning_rate": 1.672642967542504e-05, | |
| "loss": 3.5195, | |
| "mean_token_accuracy": 0.38440208286046984, | |
| "num_tokens": 3348645.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.33075734157650694, | |
| "grad_norm": 2.436528444290161, | |
| "learning_rate": 1.6695517774343124e-05, | |
| "loss": 3.4595, | |
| "mean_token_accuracy": 0.3885770753026009, | |
| "num_tokens": 3378707.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.33384853168469864, | |
| "grad_norm": 2.0806350708007812, | |
| "learning_rate": 1.6664605873261207e-05, | |
| "loss": 3.4576, | |
| "mean_token_accuracy": 0.38848345205187795, | |
| "num_tokens": 3410231.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.3369397217928903, | |
| "grad_norm": 2.142319679260254, | |
| "learning_rate": 1.663369397217929e-05, | |
| "loss": 3.4644, | |
| "mean_token_accuracy": 0.3861194223165512, | |
| "num_tokens": 3444065.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.3400309119010819, | |
| "grad_norm": 2.380552053451538, | |
| "learning_rate": 1.6602782071097374e-05, | |
| "loss": 3.3916, | |
| "mean_token_accuracy": 0.39232398346066477, | |
| "num_tokens": 3474974.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.34312210200927357, | |
| "grad_norm": 1.6462984085083008, | |
| "learning_rate": 1.6571870170015458e-05, | |
| "loss": 3.3687, | |
| "mean_token_accuracy": 0.3944365203380585, | |
| "num_tokens": 3507101.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.3462132921174652, | |
| "grad_norm": 4.342376232147217, | |
| "learning_rate": 1.654095826893354e-05, | |
| "loss": 3.4589, | |
| "mean_token_accuracy": 0.3856883034110069, | |
| "num_tokens": 3538893.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.34930448222565685, | |
| "grad_norm": 1.8314056396484375, | |
| "learning_rate": 1.6510046367851625e-05, | |
| "loss": 3.6277, | |
| "mean_token_accuracy": 0.3779118649661541, | |
| "num_tokens": 3567776.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.35239567233384855, | |
| "grad_norm": 1.7731289863586426, | |
| "learning_rate": 1.647913446676971e-05, | |
| "loss": 3.4375, | |
| "mean_token_accuracy": 0.38362068235874175, | |
| "num_tokens": 3600096.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.3554868624420402, | |
| "grad_norm": 1.4724918603897095, | |
| "learning_rate": 1.644822256568779e-05, | |
| "loss": 3.4787, | |
| "mean_token_accuracy": 0.39140490964055064, | |
| "num_tokens": 3629645.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.35857805255023184, | |
| "grad_norm": 2.0459768772125244, | |
| "learning_rate": 1.6417310664605876e-05, | |
| "loss": 3.4346, | |
| "mean_token_accuracy": 0.38677491843700407, | |
| "num_tokens": 3663559.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.3616692426584235, | |
| "grad_norm": 3.235039710998535, | |
| "learning_rate": 1.638639876352396e-05, | |
| "loss": 3.4915, | |
| "mean_token_accuracy": 0.38628031834959986, | |
| "num_tokens": 3695457.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.36476043276661513, | |
| "grad_norm": 1.2474194765090942, | |
| "learning_rate": 1.635548686244204e-05, | |
| "loss": 3.3999, | |
| "mean_token_accuracy": 0.39414023533463477, | |
| "num_tokens": 3725294.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.3678516228748068, | |
| "grad_norm": 1.3599259853363037, | |
| "learning_rate": 1.6324574961360123e-05, | |
| "loss": 3.3712, | |
| "mean_token_accuracy": 0.39129213988780975, | |
| "num_tokens": 3756722.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.37094281298299847, | |
| "grad_norm": 1.4785717725753784, | |
| "learning_rate": 1.629366306027821e-05, | |
| "loss": 3.3837, | |
| "mean_token_accuracy": 0.39773035794496536, | |
| "num_tokens": 3786396.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.3740340030911901, | |
| "grad_norm": 1.1180949211120605, | |
| "learning_rate": 1.6262751159196294e-05, | |
| "loss": 3.374, | |
| "mean_token_accuracy": 0.3942295677959919, | |
| "num_tokens": 3816940.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.37712519319938176, | |
| "grad_norm": 3.58443546295166, | |
| "learning_rate": 1.6231839258114374e-05, | |
| "loss": 3.3892, | |
| "mean_token_accuracy": 0.3918235659599304, | |
| "num_tokens": 3845728.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.3802163833075734, | |
| "grad_norm": 7.910126686096191, | |
| "learning_rate": 1.6200927357032458e-05, | |
| "loss": 3.4141, | |
| "mean_token_accuracy": 0.39633639603853227, | |
| "num_tokens": 3876251.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.38330757341576505, | |
| "grad_norm": 1.4737247228622437, | |
| "learning_rate": 1.617001545595054e-05, | |
| "loss": 3.3891, | |
| "mean_token_accuracy": 0.38747691363096237, | |
| "num_tokens": 3907979.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.38639876352395675, | |
| "grad_norm": 2.543823003768921, | |
| "learning_rate": 1.6139103554868625e-05, | |
| "loss": 3.3857, | |
| "mean_token_accuracy": 0.3897668160498142, | |
| "num_tokens": 3939556.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3894899536321484, | |
| "grad_norm": 1.8837332725524902, | |
| "learning_rate": 1.610819165378671e-05, | |
| "loss": 3.3795, | |
| "mean_token_accuracy": 0.39769657924771307, | |
| "num_tokens": 3970481.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.39258114374034003, | |
| "grad_norm": 3.573788642883301, | |
| "learning_rate": 1.6077279752704792e-05, | |
| "loss": 3.3105, | |
| "mean_token_accuracy": 0.4041719429194927, | |
| "num_tokens": 3999666.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.3956723338485317, | |
| "grad_norm": 1.7824413776397705, | |
| "learning_rate": 1.6046367851622876e-05, | |
| "loss": 3.4468, | |
| "mean_token_accuracy": 0.3911862142384052, | |
| "num_tokens": 4031876.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.3987635239567233, | |
| "grad_norm": 1.6730329990386963, | |
| "learning_rate": 1.601545595054096e-05, | |
| "loss": 3.4193, | |
| "mean_token_accuracy": 0.39065720662474634, | |
| "num_tokens": 4064132.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.401854714064915, | |
| "grad_norm": 1.421411395072937, | |
| "learning_rate": 1.5984544049459043e-05, | |
| "loss": 3.4292, | |
| "mean_token_accuracy": 0.3871223643422127, | |
| "num_tokens": 4095074.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.40494590417310666, | |
| "grad_norm": 1.7248343229293823, | |
| "learning_rate": 1.5953632148377126e-05, | |
| "loss": 3.379, | |
| "mean_token_accuracy": 0.39316892698407174, | |
| "num_tokens": 4130858.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.4080370942812983, | |
| "grad_norm": 2.6556711196899414, | |
| "learning_rate": 1.592272024729521e-05, | |
| "loss": 3.4088, | |
| "mean_token_accuracy": 0.38396543338894845, | |
| "num_tokens": 4163277.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.41112828438948995, | |
| "grad_norm": 2.2201597690582275, | |
| "learning_rate": 1.5891808346213294e-05, | |
| "loss": 3.446, | |
| "mean_token_accuracy": 0.38794904500246047, | |
| "num_tokens": 4195806.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.4142194744976816, | |
| "grad_norm": 1.4001938104629517, | |
| "learning_rate": 1.5860896445131377e-05, | |
| "loss": 3.4074, | |
| "mean_token_accuracy": 0.3959184519946575, | |
| "num_tokens": 4225811.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.41731066460587324, | |
| "grad_norm": 1.9553899765014648, | |
| "learning_rate": 1.582998454404946e-05, | |
| "loss": 3.353, | |
| "mean_token_accuracy": 0.392233844101429, | |
| "num_tokens": 4259418.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.42040185471406494, | |
| "grad_norm": 2.1650876998901367, | |
| "learning_rate": 1.5799072642967544e-05, | |
| "loss": 3.3014, | |
| "mean_token_accuracy": 0.40291827023029325, | |
| "num_tokens": 4292593.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.4234930448222566, | |
| "grad_norm": 1.6802010536193848, | |
| "learning_rate": 1.5768160741885628e-05, | |
| "loss": 3.3615, | |
| "mean_token_accuracy": 0.3953541323542595, | |
| "num_tokens": 4325628.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.4265842349304482, | |
| "grad_norm": 2.8798279762268066, | |
| "learning_rate": 1.573724884080371e-05, | |
| "loss": 3.4365, | |
| "mean_token_accuracy": 0.39370308369398116, | |
| "num_tokens": 4358459.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.42967542503863987, | |
| "grad_norm": 1.523694634437561, | |
| "learning_rate": 1.5706336939721795e-05, | |
| "loss": 3.3153, | |
| "mean_token_accuracy": 0.3944700941443443, | |
| "num_tokens": 4390529.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.4327666151468315, | |
| "grad_norm": 2.0799732208251953, | |
| "learning_rate": 1.567542503863988e-05, | |
| "loss": 3.4242, | |
| "mean_token_accuracy": 0.389276672154665, | |
| "num_tokens": 4424045.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.43585780525502316, | |
| "grad_norm": 1.3916538953781128, | |
| "learning_rate": 1.5644513137557962e-05, | |
| "loss": 3.444, | |
| "mean_token_accuracy": 0.388773063570261, | |
| "num_tokens": 4456115.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.43894899536321486, | |
| "grad_norm": 11.072097778320312, | |
| "learning_rate": 1.5613601236476042e-05, | |
| "loss": 3.2926, | |
| "mean_token_accuracy": 0.40686351582407954, | |
| "num_tokens": 4490990.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.4420401854714065, | |
| "grad_norm": 1.8008073568344116, | |
| "learning_rate": 1.558268933539413e-05, | |
| "loss": 3.4062, | |
| "mean_token_accuracy": 0.3938455879688263, | |
| "num_tokens": 4519847.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.44513137557959814, | |
| "grad_norm": 4.065845012664795, | |
| "learning_rate": 1.5551777434312213e-05, | |
| "loss": 3.3776, | |
| "mean_token_accuracy": 0.4004744917154312, | |
| "num_tokens": 4551679.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.4482225656877898, | |
| "grad_norm": 2.4614624977111816, | |
| "learning_rate": 1.5520865533230297e-05, | |
| "loss": 3.324, | |
| "mean_token_accuracy": 0.4031891174614429, | |
| "num_tokens": 4582622.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.45131375579598143, | |
| "grad_norm": 2.6623904705047607, | |
| "learning_rate": 1.5489953632148377e-05, | |
| "loss": 3.414, | |
| "mean_token_accuracy": 0.39484291821718215, | |
| "num_tokens": 4613478.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.45440494590417313, | |
| "grad_norm": 2.342698574066162, | |
| "learning_rate": 1.545904173106646e-05, | |
| "loss": 3.4348, | |
| "mean_token_accuracy": 0.38681296780705454, | |
| "num_tokens": 4647495.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.4574961360123648, | |
| "grad_norm": 2.6113431453704834, | |
| "learning_rate": 1.5428129829984547e-05, | |
| "loss": 3.39, | |
| "mean_token_accuracy": 0.391095020622015, | |
| "num_tokens": 4681809.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.4605873261205564, | |
| "grad_norm": 1.3145941495895386, | |
| "learning_rate": 1.5397217928902627e-05, | |
| "loss": 3.3677, | |
| "mean_token_accuracy": 0.38954789489507674, | |
| "num_tokens": 4716308.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.46367851622874806, | |
| "grad_norm": 1.8231385946273804, | |
| "learning_rate": 1.536630602782071e-05, | |
| "loss": 3.4199, | |
| "mean_token_accuracy": 0.38780914843082426, | |
| "num_tokens": 4747835.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.4667697063369397, | |
| "grad_norm": 2.2594892978668213, | |
| "learning_rate": 1.5335394126738795e-05, | |
| "loss": 3.3937, | |
| "mean_token_accuracy": 0.3959015667438507, | |
| "num_tokens": 4776486.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.46986089644513135, | |
| "grad_norm": 3.840742349624634, | |
| "learning_rate": 1.5304482225656878e-05, | |
| "loss": 3.3087, | |
| "mean_token_accuracy": 0.407059845328331, | |
| "num_tokens": 4805943.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.47295208655332305, | |
| "grad_norm": 4.683995246887207, | |
| "learning_rate": 1.5273570324574962e-05, | |
| "loss": 3.3373, | |
| "mean_token_accuracy": 0.39871800169348715, | |
| "num_tokens": 4838440.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.4760432766615147, | |
| "grad_norm": 1.2700860500335693, | |
| "learning_rate": 1.5242658423493047e-05, | |
| "loss": 3.3339, | |
| "mean_token_accuracy": 0.39955019652843476, | |
| "num_tokens": 4870809.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.47913446676970634, | |
| "grad_norm": 2.15336537361145, | |
| "learning_rate": 1.521174652241113e-05, | |
| "loss": 3.3699, | |
| "mean_token_accuracy": 0.39904908165335656, | |
| "num_tokens": 4903332.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.482225656877898, | |
| "grad_norm": 1.299379587173462, | |
| "learning_rate": 1.5180834621329213e-05, | |
| "loss": 3.4024, | |
| "mean_token_accuracy": 0.3913417667150497, | |
| "num_tokens": 4933852.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.4853168469860896, | |
| "grad_norm": 1.1829091310501099, | |
| "learning_rate": 1.5149922720247296e-05, | |
| "loss": 3.3253, | |
| "mean_token_accuracy": 0.39646707102656364, | |
| "num_tokens": 4966530.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.4884080370942813, | |
| "grad_norm": 1.2553237676620483, | |
| "learning_rate": 1.511901081916538e-05, | |
| "loss": 3.2771, | |
| "mean_token_accuracy": 0.405765625834465, | |
| "num_tokens": 4998456.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.49149922720247297, | |
| "grad_norm": 2.037930488586426, | |
| "learning_rate": 1.5088098918083462e-05, | |
| "loss": 3.3754, | |
| "mean_token_accuracy": 0.3952351205050945, | |
| "num_tokens": 5028995.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.4945904173106646, | |
| "grad_norm": 1.090571403503418, | |
| "learning_rate": 1.5057187017001547e-05, | |
| "loss": 3.2832, | |
| "mean_token_accuracy": 0.4088763400912285, | |
| "num_tokens": 5057407.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.49768160741885625, | |
| "grad_norm": 2.4721672534942627, | |
| "learning_rate": 1.502627511591963e-05, | |
| "loss": 3.3535, | |
| "mean_token_accuracy": 0.3982778422534466, | |
| "num_tokens": 5088330.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.500772797527048, | |
| "grad_norm": 1.3971518278121948, | |
| "learning_rate": 1.4995363214837714e-05, | |
| "loss": 3.359, | |
| "mean_token_accuracy": 0.3978785939514637, | |
| "num_tokens": 5115511.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.5038639876352395, | |
| "grad_norm": 1.766021490097046, | |
| "learning_rate": 1.4964451313755796e-05, | |
| "loss": 3.3388, | |
| "mean_token_accuracy": 0.4023651979863644, | |
| "num_tokens": 5145560.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.5069551777434312, | |
| "grad_norm": 0.9848290681838989, | |
| "learning_rate": 1.4933539412673881e-05, | |
| "loss": 3.3312, | |
| "mean_token_accuracy": 0.4001339070498943, | |
| "num_tokens": 5178491.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.5100463678516228, | |
| "grad_norm": 5.008669376373291, | |
| "learning_rate": 1.4902627511591965e-05, | |
| "loss": 3.3395, | |
| "mean_token_accuracy": 0.39800570756196973, | |
| "num_tokens": 5209376.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.5131375579598145, | |
| "grad_norm": 3.3218369483947754, | |
| "learning_rate": 1.4871715610510047e-05, | |
| "loss": 3.398, | |
| "mean_token_accuracy": 0.3898357510566711, | |
| "num_tokens": 5240240.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.5162287480680062, | |
| "grad_norm": 1.3590713739395142, | |
| "learning_rate": 1.484080370942813e-05, | |
| "loss": 3.3246, | |
| "mean_token_accuracy": 0.39972665831446647, | |
| "num_tokens": 5269658.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.5193199381761978, | |
| "grad_norm": 1.647360920906067, | |
| "learning_rate": 1.4809891808346216e-05, | |
| "loss": 3.3101, | |
| "mean_token_accuracy": 0.4039941616356373, | |
| "num_tokens": 5301959.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.5224111282843895, | |
| "grad_norm": 1.3231589794158936, | |
| "learning_rate": 1.4778979907264298e-05, | |
| "loss": 3.2715, | |
| "mean_token_accuracy": 0.409288527816534, | |
| "num_tokens": 5332939.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.5255023183925811, | |
| "grad_norm": 1.8494716882705688, | |
| "learning_rate": 1.4748068006182381e-05, | |
| "loss": 3.3472, | |
| "mean_token_accuracy": 0.4054514840245247, | |
| "num_tokens": 5361660.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.5285935085007728, | |
| "grad_norm": 1.1796019077301025, | |
| "learning_rate": 1.4717156105100465e-05, | |
| "loss": 3.3374, | |
| "mean_token_accuracy": 0.40053225085139277, | |
| "num_tokens": 5390818.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.5316846986089645, | |
| "grad_norm": 1.4589906930923462, | |
| "learning_rate": 1.468624420401855e-05, | |
| "loss": 3.3532, | |
| "mean_token_accuracy": 0.3908236466348171, | |
| "num_tokens": 5423948.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.5347758887171561, | |
| "grad_norm": 2.48760986328125, | |
| "learning_rate": 1.4655332302936632e-05, | |
| "loss": 3.3916, | |
| "mean_token_accuracy": 0.39238951057195665, | |
| "num_tokens": 5456721.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.5378670788253478, | |
| "grad_norm": 1.3151116371154785, | |
| "learning_rate": 1.4624420401854715e-05, | |
| "loss": 3.3755, | |
| "mean_token_accuracy": 0.39836090207099917, | |
| "num_tokens": 5485487.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.5409582689335394, | |
| "grad_norm": 1.3862218856811523, | |
| "learning_rate": 1.4593508500772799e-05, | |
| "loss": 3.3972, | |
| "mean_token_accuracy": 0.39552291929721833, | |
| "num_tokens": 5517812.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.5440494590417311, | |
| "grad_norm": 1.4198209047317505, | |
| "learning_rate": 1.4562596599690881e-05, | |
| "loss": 3.323, | |
| "mean_token_accuracy": 0.4038604758679867, | |
| "num_tokens": 5549085.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.5471406491499228, | |
| "grad_norm": 2.063263416290283, | |
| "learning_rate": 1.4531684698608966e-05, | |
| "loss": 3.2286, | |
| "mean_token_accuracy": 0.40857273861765864, | |
| "num_tokens": 5580826.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.5502318392581144, | |
| "grad_norm": 1.5246375799179077, | |
| "learning_rate": 1.450077279752705e-05, | |
| "loss": 3.3077, | |
| "mean_token_accuracy": 0.3984913781285286, | |
| "num_tokens": 5611654.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.5533230293663061, | |
| "grad_norm": 3.1407299041748047, | |
| "learning_rate": 1.4469860896445132e-05, | |
| "loss": 3.3631, | |
| "mean_token_accuracy": 0.4005904957652092, | |
| "num_tokens": 5643972.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.5564142194744977, | |
| "grad_norm": 1.1959761381149292, | |
| "learning_rate": 1.4438948995363215e-05, | |
| "loss": 3.3863, | |
| "mean_token_accuracy": 0.39492699652910235, | |
| "num_tokens": 5674824.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.5595054095826894, | |
| "grad_norm": 1.6108070611953735, | |
| "learning_rate": 1.44080370942813e-05, | |
| "loss": 3.3524, | |
| "mean_token_accuracy": 0.3943831264972687, | |
| "num_tokens": 5710374.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.5625965996908809, | |
| "grad_norm": 2.3074188232421875, | |
| "learning_rate": 1.4377125193199384e-05, | |
| "loss": 3.4149, | |
| "mean_token_accuracy": 0.3902184680104256, | |
| "num_tokens": 5740450.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.5656877897990726, | |
| "grad_norm": 1.2210988998413086, | |
| "learning_rate": 1.4346213292117466e-05, | |
| "loss": 3.368, | |
| "mean_token_accuracy": 0.3951147675514221, | |
| "num_tokens": 5769865.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.5687789799072643, | |
| "grad_norm": 1.301492691040039, | |
| "learning_rate": 1.431530139103555e-05, | |
| "loss": 3.4011, | |
| "mean_token_accuracy": 0.39827719777822496, | |
| "num_tokens": 5798647.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.5718701700154559, | |
| "grad_norm": 1.8520913124084473, | |
| "learning_rate": 1.4284389489953633e-05, | |
| "loss": 3.3297, | |
| "mean_token_accuracy": 0.401262603700161, | |
| "num_tokens": 5830176.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.5749613601236476, | |
| "grad_norm": 1.7829524278640747, | |
| "learning_rate": 1.4253477588871715e-05, | |
| "loss": 3.2177, | |
| "mean_token_accuracy": 0.4113538973033428, | |
| "num_tokens": 5860769.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.5780525502318392, | |
| "grad_norm": 1.5205345153808594, | |
| "learning_rate": 1.42225656877898e-05, | |
| "loss": 3.4594, | |
| "mean_token_accuracy": 0.3904502220451832, | |
| "num_tokens": 5896213.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.5811437403400309, | |
| "grad_norm": 1.492475986480713, | |
| "learning_rate": 1.4191653786707884e-05, | |
| "loss": 3.2639, | |
| "mean_token_accuracy": 0.40337538048624993, | |
| "num_tokens": 5928539.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.5842349304482226, | |
| "grad_norm": 2.07590651512146, | |
| "learning_rate": 1.4160741885625968e-05, | |
| "loss": 3.3918, | |
| "mean_token_accuracy": 0.39563274309039115, | |
| "num_tokens": 5960442.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.5873261205564142, | |
| "grad_norm": 2.6959567070007324, | |
| "learning_rate": 1.412982998454405e-05, | |
| "loss": 3.3494, | |
| "mean_token_accuracy": 0.40076613500714303, | |
| "num_tokens": 5990404.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5904173106646059, | |
| "grad_norm": 1.291227102279663, | |
| "learning_rate": 1.4098918083462135e-05, | |
| "loss": 3.3794, | |
| "mean_token_accuracy": 0.3960807867348194, | |
| "num_tokens": 6023418.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5935085007727975, | |
| "grad_norm": 3.709761381149292, | |
| "learning_rate": 1.4068006182380218e-05, | |
| "loss": 3.3441, | |
| "mean_token_accuracy": 0.3981980659067631, | |
| "num_tokens": 6051604.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5965996908809892, | |
| "grad_norm": 1.5294419527053833, | |
| "learning_rate": 1.40370942812983e-05, | |
| "loss": 3.3821, | |
| "mean_token_accuracy": 0.40094061717391016, | |
| "num_tokens": 6081609.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5996908809891809, | |
| "grad_norm": 1.792324185371399, | |
| "learning_rate": 1.4006182380216384e-05, | |
| "loss": 3.3228, | |
| "mean_token_accuracy": 0.40257288739085195, | |
| "num_tokens": 6113487.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.6027820710973725, | |
| "grad_norm": 1.3564989566802979, | |
| "learning_rate": 1.3975270479134469e-05, | |
| "loss": 3.3603, | |
| "mean_token_accuracy": 0.3971732698380947, | |
| "num_tokens": 6143546.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.6058732612055642, | |
| "grad_norm": 1.8166826963424683, | |
| "learning_rate": 1.3944358578052551e-05, | |
| "loss": 3.3511, | |
| "mean_token_accuracy": 0.3947428591549397, | |
| "num_tokens": 6177302.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.6089644513137558, | |
| "grad_norm": 2.3519530296325684, | |
| "learning_rate": 1.3913446676970635e-05, | |
| "loss": 3.3258, | |
| "mean_token_accuracy": 0.40014824345707894, | |
| "num_tokens": 6209913.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.6120556414219475, | |
| "grad_norm": 1.9432330131530762, | |
| "learning_rate": 1.3882534775888718e-05, | |
| "loss": 3.2683, | |
| "mean_token_accuracy": 0.40405927672982217, | |
| "num_tokens": 6241773.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.615146831530139, | |
| "grad_norm": 1.2190839052200317, | |
| "learning_rate": 1.3851622874806803e-05, | |
| "loss": 3.3476, | |
| "mean_token_accuracy": 0.4006108805537224, | |
| "num_tokens": 6274930.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.6182380216383307, | |
| "grad_norm": 1.395822525024414, | |
| "learning_rate": 1.3820710973724885e-05, | |
| "loss": 3.2706, | |
| "mean_token_accuracy": 0.40902155488729475, | |
| "num_tokens": 6306366.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6213292117465224, | |
| "grad_norm": 3.3072211742401123, | |
| "learning_rate": 1.3789799072642969e-05, | |
| "loss": 3.2842, | |
| "mean_token_accuracy": 0.4068531468510628, | |
| "num_tokens": 6336620.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.624420401854714, | |
| "grad_norm": 6.775637626647949, | |
| "learning_rate": 1.3758887171561052e-05, | |
| "loss": 3.3359, | |
| "mean_token_accuracy": 0.399787887185812, | |
| "num_tokens": 6368783.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.6275115919629057, | |
| "grad_norm": 2.236809253692627, | |
| "learning_rate": 1.3727975270479134e-05, | |
| "loss": 3.1501, | |
| "mean_token_accuracy": 0.4164234817028046, | |
| "num_tokens": 6396050.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.6306027820710973, | |
| "grad_norm": 1.864715814590454, | |
| "learning_rate": 1.369706336939722e-05, | |
| "loss": 3.2679, | |
| "mean_token_accuracy": 0.40604618191719055, | |
| "num_tokens": 6428401.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.633693972179289, | |
| "grad_norm": 13.025823593139648, | |
| "learning_rate": 1.3666151468315303e-05, | |
| "loss": 3.2663, | |
| "mean_token_accuracy": 0.39983370155096054, | |
| "num_tokens": 6463617.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.6367851622874807, | |
| "grad_norm": 1.4827322959899902, | |
| "learning_rate": 1.3635239567233387e-05, | |
| "loss": 3.3267, | |
| "mean_token_accuracy": 0.4056967757642269, | |
| "num_tokens": 6494545.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.6398763523956723, | |
| "grad_norm": 2.8298745155334473, | |
| "learning_rate": 1.3604327666151469e-05, | |
| "loss": 3.2855, | |
| "mean_token_accuracy": 0.4048807807266712, | |
| "num_tokens": 6526119.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.642967542503864, | |
| "grad_norm": 2.294051170349121, | |
| "learning_rate": 1.3573415765069552e-05, | |
| "loss": 3.3445, | |
| "mean_token_accuracy": 0.3997214540839195, | |
| "num_tokens": 6559408.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.6460587326120556, | |
| "grad_norm": 6.12084436416626, | |
| "learning_rate": 1.3542503863987638e-05, | |
| "loss": 3.2882, | |
| "mean_token_accuracy": 0.40451241433620455, | |
| "num_tokens": 6588923.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.6491499227202473, | |
| "grad_norm": 1.5177021026611328, | |
| "learning_rate": 1.351159196290572e-05, | |
| "loss": 3.2833, | |
| "mean_token_accuracy": 0.39674848690629005, | |
| "num_tokens": 6623193.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.652241112828439, | |
| "grad_norm": 1.6658954620361328, | |
| "learning_rate": 1.3480680061823803e-05, | |
| "loss": 3.4093, | |
| "mean_token_accuracy": 0.39299999698996546, | |
| "num_tokens": 6657638.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.6553323029366306, | |
| "grad_norm": 5.038951396942139, | |
| "learning_rate": 1.3449768160741887e-05, | |
| "loss": 3.3359, | |
| "mean_token_accuracy": 0.4020788729190826, | |
| "num_tokens": 6686552.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.6584234930448223, | |
| "grad_norm": 1.931733250617981, | |
| "learning_rate": 1.3418856259659968e-05, | |
| "loss": 3.2761, | |
| "mean_token_accuracy": 0.4039643190801144, | |
| "num_tokens": 6719036.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.6615146831530139, | |
| "grad_norm": 1.7501503229141235, | |
| "learning_rate": 1.3387944358578054e-05, | |
| "loss": 3.2999, | |
| "mean_token_accuracy": 0.40199958309531214, | |
| "num_tokens": 6752457.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.6646058732612056, | |
| "grad_norm": 1.330138921737671, | |
| "learning_rate": 1.3357032457496137e-05, | |
| "loss": 3.2815, | |
| "mean_token_accuracy": 0.4096176542341709, | |
| "num_tokens": 6780937.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.6676970633693973, | |
| "grad_norm": 3.2849926948547363, | |
| "learning_rate": 1.3326120556414221e-05, | |
| "loss": 3.3212, | |
| "mean_token_accuracy": 0.39906698688864706, | |
| "num_tokens": 6813861.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.6707882534775889, | |
| "grad_norm": 1.3946915864944458, | |
| "learning_rate": 1.3295208655332303e-05, | |
| "loss": 3.3151, | |
| "mean_token_accuracy": 0.40061264783143996, | |
| "num_tokens": 6843256.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.6738794435857806, | |
| "grad_norm": 1.5147260427474976, | |
| "learning_rate": 1.3264296754250388e-05, | |
| "loss": 3.2324, | |
| "mean_token_accuracy": 0.4085581362247467, | |
| "num_tokens": 6881169.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.6769706336939721, | |
| "grad_norm": 1.5191727876663208, | |
| "learning_rate": 1.3233384853168472e-05, | |
| "loss": 3.2258, | |
| "mean_token_accuracy": 0.4069525547325611, | |
| "num_tokens": 6914374.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.6800618238021638, | |
| "grad_norm": 11.71318531036377, | |
| "learning_rate": 1.3202472952086554e-05, | |
| "loss": 3.2411, | |
| "mean_token_accuracy": 0.40940716192126275, | |
| "num_tokens": 6944810.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.6831530139103554, | |
| "grad_norm": 0.9575105309486389, | |
| "learning_rate": 1.3171561051004637e-05, | |
| "loss": 3.2721, | |
| "mean_token_accuracy": 0.40148399621248243, | |
| "num_tokens": 6975417.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.6862442040185471, | |
| "grad_norm": 1.6727248430252075, | |
| "learning_rate": 1.3140649149922722e-05, | |
| "loss": 3.3072, | |
| "mean_token_accuracy": 0.40622576996684073, | |
| "num_tokens": 7005387.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.6893353941267388, | |
| "grad_norm": 1.9732425212860107, | |
| "learning_rate": 1.3109737248840804e-05, | |
| "loss": 3.3486, | |
| "mean_token_accuracy": 0.4046866536140442, | |
| "num_tokens": 7036641.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.6924265842349304, | |
| "grad_norm": 3.0926458835601807, | |
| "learning_rate": 1.3078825347758888e-05, | |
| "loss": 3.2027, | |
| "mean_token_accuracy": 0.41145659387111666, | |
| "num_tokens": 7065613.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.6955177743431221, | |
| "grad_norm": 1.2291103601455688, | |
| "learning_rate": 1.3047913446676972e-05, | |
| "loss": 3.2273, | |
| "mean_token_accuracy": 0.41015600264072416, | |
| "num_tokens": 7098541.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.6986089644513137, | |
| "grad_norm": 1.393871784210205, | |
| "learning_rate": 1.3017001545595057e-05, | |
| "loss": 3.3534, | |
| "mean_token_accuracy": 0.398224713653326, | |
| "num_tokens": 7131991.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.7017001545595054, | |
| "grad_norm": 2.056251287460327, | |
| "learning_rate": 1.2986089644513139e-05, | |
| "loss": 3.2873, | |
| "mean_token_accuracy": 0.4067487485706806, | |
| "num_tokens": 7163563.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.7047913446676971, | |
| "grad_norm": 3.4474806785583496, | |
| "learning_rate": 1.2955177743431222e-05, | |
| "loss": 3.3688, | |
| "mean_token_accuracy": 0.39887151271104815, | |
| "num_tokens": 7196235.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.7078825347758887, | |
| "grad_norm": 4.399552345275879, | |
| "learning_rate": 1.2924265842349306e-05, | |
| "loss": 3.2107, | |
| "mean_token_accuracy": 0.4083859778940678, | |
| "num_tokens": 7229484.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.7109737248840804, | |
| "grad_norm": 1.0608441829681396, | |
| "learning_rate": 1.2893353941267388e-05, | |
| "loss": 3.3511, | |
| "mean_token_accuracy": 0.395867995172739, | |
| "num_tokens": 7263929.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.714064914992272, | |
| "grad_norm": 1.6351842880249023, | |
| "learning_rate": 1.2862442040185471e-05, | |
| "loss": 3.2312, | |
| "mean_token_accuracy": 0.40859498232603075, | |
| "num_tokens": 7297276.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.7171561051004637, | |
| "grad_norm": 1.5289595127105713, | |
| "learning_rate": 1.2831530139103557e-05, | |
| "loss": 3.325, | |
| "mean_token_accuracy": 0.40636713430285454, | |
| "num_tokens": 7329069.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.7202472952086554, | |
| "grad_norm": 1.5375980138778687, | |
| "learning_rate": 1.280061823802164e-05, | |
| "loss": 3.3243, | |
| "mean_token_accuracy": 0.40893488600850103, | |
| "num_tokens": 7362905.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.723338485316847, | |
| "grad_norm": 1.2137787342071533, | |
| "learning_rate": 1.2769706336939722e-05, | |
| "loss": 3.2944, | |
| "mean_token_accuracy": 0.4049150198698044, | |
| "num_tokens": 7392845.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.7264296754250387, | |
| "grad_norm": 3.618687152862549, | |
| "learning_rate": 1.2738794435857806e-05, | |
| "loss": 3.3114, | |
| "mean_token_accuracy": 0.398918454349041, | |
| "num_tokens": 7426273.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.7295208655332303, | |
| "grad_norm": 1.3081494569778442, | |
| "learning_rate": 1.2707882534775891e-05, | |
| "loss": 3.2284, | |
| "mean_token_accuracy": 0.4098616696894169, | |
| "num_tokens": 7457128.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.732612055641422, | |
| "grad_norm": 1.2867871522903442, | |
| "learning_rate": 1.2676970633693973e-05, | |
| "loss": 3.2734, | |
| "mean_token_accuracy": 0.40420192629098894, | |
| "num_tokens": 7490613.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.7357032457496137, | |
| "grad_norm": 6.16511869430542, | |
| "learning_rate": 1.2646058732612056e-05, | |
| "loss": 3.2275, | |
| "mean_token_accuracy": 0.4089609131217003, | |
| "num_tokens": 7524066.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.7387944358578052, | |
| "grad_norm": 3.3427209854125977, | |
| "learning_rate": 1.261514683153014e-05, | |
| "loss": 3.3428, | |
| "mean_token_accuracy": 0.3993862606585026, | |
| "num_tokens": 7554388.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.7418856259659969, | |
| "grad_norm": 1.759032964706421, | |
| "learning_rate": 1.2584234930448222e-05, | |
| "loss": 3.2427, | |
| "mean_token_accuracy": 0.411670895665884, | |
| "num_tokens": 7586805.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.7449768160741885, | |
| "grad_norm": 6.3605523109436035, | |
| "learning_rate": 1.2553323029366307e-05, | |
| "loss": 3.2989, | |
| "mean_token_accuracy": 0.4035753831267357, | |
| "num_tokens": 7621026.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.7480680061823802, | |
| "grad_norm": 1.3226512670516968, | |
| "learning_rate": 1.252241112828439e-05, | |
| "loss": 3.2676, | |
| "mean_token_accuracy": 0.4091234177350998, | |
| "num_tokens": 7653588.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.7511591962905718, | |
| "grad_norm": 1.2271887063980103, | |
| "learning_rate": 1.2491499227202474e-05, | |
| "loss": 3.1859, | |
| "mean_token_accuracy": 0.4189001992344856, | |
| "num_tokens": 7682672.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.7542503863987635, | |
| "grad_norm": 1.1789538860321045, | |
| "learning_rate": 1.2460587326120556e-05, | |
| "loss": 3.2707, | |
| "mean_token_accuracy": 0.41208377107977867, | |
| "num_tokens": 7711661.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.7573415765069552, | |
| "grad_norm": 1.4042397737503052, | |
| "learning_rate": 1.2429675425038642e-05, | |
| "loss": 3.3216, | |
| "mean_token_accuracy": 0.40250647664070127, | |
| "num_tokens": 7743853.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.7604327666151468, | |
| "grad_norm": 1.9280140399932861, | |
| "learning_rate": 1.2398763523956725e-05, | |
| "loss": 3.2948, | |
| "mean_token_accuracy": 0.4117195881903172, | |
| "num_tokens": 7776380.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.7635239567233385, | |
| "grad_norm": 1.244311809539795, | |
| "learning_rate": 1.2367851622874807e-05, | |
| "loss": 3.2397, | |
| "mean_token_accuracy": 0.41137626469135286, | |
| "num_tokens": 7808412.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.7666151468315301, | |
| "grad_norm": 1.2338584661483765, | |
| "learning_rate": 1.233693972179289e-05, | |
| "loss": 3.2887, | |
| "mean_token_accuracy": 0.41127968281507493, | |
| "num_tokens": 7837345.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.7697063369397218, | |
| "grad_norm": 1.0948349237442017, | |
| "learning_rate": 1.2306027820710976e-05, | |
| "loss": 3.2336, | |
| "mean_token_accuracy": 0.4080419853329659, | |
| "num_tokens": 7869621.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.7727975270479135, | |
| "grad_norm": 3.051591157913208, | |
| "learning_rate": 1.227511591962906e-05, | |
| "loss": 3.3946, | |
| "mean_token_accuracy": 0.39375910386443136, | |
| "num_tokens": 7900668.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.7758887171561051, | |
| "grad_norm": 1.2603470087051392, | |
| "learning_rate": 1.2244204018547141e-05, | |
| "loss": 3.2936, | |
| "mean_token_accuracy": 0.40462088733911517, | |
| "num_tokens": 7931473.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.7789799072642968, | |
| "grad_norm": 1.3495979309082031, | |
| "learning_rate": 1.2213292117465225e-05, | |
| "loss": 3.2914, | |
| "mean_token_accuracy": 0.40598206520080565, | |
| "num_tokens": 7962649.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.7820710973724884, | |
| "grad_norm": 1.4672921895980835, | |
| "learning_rate": 1.218238021638331e-05, | |
| "loss": 3.2363, | |
| "mean_token_accuracy": 0.41317210271954535, | |
| "num_tokens": 7994607.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.7851622874806801, | |
| "grad_norm": 1.618283748626709, | |
| "learning_rate": 1.2151468315301392e-05, | |
| "loss": 3.2778, | |
| "mean_token_accuracy": 0.40982001796364786, | |
| "num_tokens": 8023977.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.7882534775888718, | |
| "grad_norm": 1.4595403671264648, | |
| "learning_rate": 1.2120556414219476e-05, | |
| "loss": 3.306, | |
| "mean_token_accuracy": 0.4041280455887318, | |
| "num_tokens": 8053965.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.7913446676970634, | |
| "grad_norm": 2.1894686222076416, | |
| "learning_rate": 1.208964451313756e-05, | |
| "loss": 3.2235, | |
| "mean_token_accuracy": 0.4136913321912289, | |
| "num_tokens": 8085818.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.794435857805255, | |
| "grad_norm": 1.4496268033981323, | |
| "learning_rate": 1.2058732612055641e-05, | |
| "loss": 3.3197, | |
| "mean_token_accuracy": 0.39820486679673195, | |
| "num_tokens": 8118503.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.7975270479134466, | |
| "grad_norm": 1.4080617427825928, | |
| "learning_rate": 1.2027820710973725e-05, | |
| "loss": 3.2565, | |
| "mean_token_accuracy": 0.4059484012424946, | |
| "num_tokens": 8151137.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.8006182380216383, | |
| "grad_norm": 1.0931172370910645, | |
| "learning_rate": 1.199690880989181e-05, | |
| "loss": 3.2123, | |
| "mean_token_accuracy": 0.40805021226406096, | |
| "num_tokens": 8183728.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.80370942812983, | |
| "grad_norm": 1.512048602104187, | |
| "learning_rate": 1.1965996908809894e-05, | |
| "loss": 3.2074, | |
| "mean_token_accuracy": 0.40843924283981325, | |
| "num_tokens": 8214541.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.8068006182380216, | |
| "grad_norm": 1.5905691385269165, | |
| "learning_rate": 1.1935085007727975e-05, | |
| "loss": 3.3711, | |
| "mean_token_accuracy": 0.40454950705170634, | |
| "num_tokens": 8245713.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.8098918083462133, | |
| "grad_norm": 1.6936638355255127, | |
| "learning_rate": 1.1904173106646059e-05, | |
| "loss": 3.2563, | |
| "mean_token_accuracy": 0.40816242843866346, | |
| "num_tokens": 8279816.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.8129829984544049, | |
| "grad_norm": 1.3420774936676025, | |
| "learning_rate": 1.1873261205564144e-05, | |
| "loss": 3.226, | |
| "mean_token_accuracy": 0.41122067645192145, | |
| "num_tokens": 8313266.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.8160741885625966, | |
| "grad_norm": 1.5639405250549316, | |
| "learning_rate": 1.1842349304482226e-05, | |
| "loss": 3.2754, | |
| "mean_token_accuracy": 0.40760004371404646, | |
| "num_tokens": 8344438.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.8191653786707882, | |
| "grad_norm": 1.428361415863037, | |
| "learning_rate": 1.181143740340031e-05, | |
| "loss": 3.2856, | |
| "mean_token_accuracy": 0.40872066244482996, | |
| "num_tokens": 8375560.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.8222565687789799, | |
| "grad_norm": 1.565278172492981, | |
| "learning_rate": 1.1780525502318393e-05, | |
| "loss": 3.2155, | |
| "mean_token_accuracy": 0.40958060398697854, | |
| "num_tokens": 8409286.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.8253477588871716, | |
| "grad_norm": 1.5016591548919678, | |
| "learning_rate": 1.1749613601236479e-05, | |
| "loss": 3.2018, | |
| "mean_token_accuracy": 0.4215152218937874, | |
| "num_tokens": 8441409.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.8284389489953632, | |
| "grad_norm": 1.3982653617858887, | |
| "learning_rate": 1.171870170015456e-05, | |
| "loss": 3.2591, | |
| "mean_token_accuracy": 0.40580501705408095, | |
| "num_tokens": 8473244.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.8315301391035549, | |
| "grad_norm": 2.566338539123535, | |
| "learning_rate": 1.1687789799072644e-05, | |
| "loss": 3.2374, | |
| "mean_token_accuracy": 0.41476899906992915, | |
| "num_tokens": 8503342.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.8346213292117465, | |
| "grad_norm": 1.7439295053482056, | |
| "learning_rate": 1.1656877897990728e-05, | |
| "loss": 3.2027, | |
| "mean_token_accuracy": 0.4182712368667126, | |
| "num_tokens": 8535344.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.8377125193199382, | |
| "grad_norm": 3.3643314838409424, | |
| "learning_rate": 1.162596599690881e-05, | |
| "loss": 3.2049, | |
| "mean_token_accuracy": 0.4144292987883091, | |
| "num_tokens": 8566017.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.8408037094281299, | |
| "grad_norm": 1.662192463874817, | |
| "learning_rate": 1.1595054095826895e-05, | |
| "loss": 3.2681, | |
| "mean_token_accuracy": 0.3985265463590622, | |
| "num_tokens": 8599866.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.8438948995363215, | |
| "grad_norm": 1.715958833694458, | |
| "learning_rate": 1.1564142194744979e-05, | |
| "loss": 3.3068, | |
| "mean_token_accuracy": 0.39729173853993416, | |
| "num_tokens": 8630951.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.8469860896445132, | |
| "grad_norm": 1.5541205406188965, | |
| "learning_rate": 1.153323029366306e-05, | |
| "loss": 3.2624, | |
| "mean_token_accuracy": 0.4051229901611805, | |
| "num_tokens": 8662309.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.8500772797527048, | |
| "grad_norm": 1.596884846687317, | |
| "learning_rate": 1.1502318392581144e-05, | |
| "loss": 3.2699, | |
| "mean_token_accuracy": 0.40654050633311273, | |
| "num_tokens": 8697126.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.8531684698608965, | |
| "grad_norm": 3.2481422424316406, | |
| "learning_rate": 1.147140649149923e-05, | |
| "loss": 3.148, | |
| "mean_token_accuracy": 0.42360129579901695, | |
| "num_tokens": 8725676.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.8562596599690881, | |
| "grad_norm": 1.5681166648864746, | |
| "learning_rate": 1.1440494590417313e-05, | |
| "loss": 3.2573, | |
| "mean_token_accuracy": 0.406087576597929, | |
| "num_tokens": 8758924.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.8593508500772797, | |
| "grad_norm": 1.4387476444244385, | |
| "learning_rate": 1.1409582689335395e-05, | |
| "loss": 3.2587, | |
| "mean_token_accuracy": 0.4091624394059181, | |
| "num_tokens": 8790688.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.8624420401854714, | |
| "grad_norm": 2.345499038696289, | |
| "learning_rate": 1.1378670788253478e-05, | |
| "loss": 3.2515, | |
| "mean_token_accuracy": 0.406622239202261, | |
| "num_tokens": 8821597.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.865533230293663, | |
| "grad_norm": 1.172705888748169, | |
| "learning_rate": 1.1347758887171562e-05, | |
| "loss": 3.1511, | |
| "mean_token_accuracy": 0.42104474529623986, | |
| "num_tokens": 8852087.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.8686244204018547, | |
| "grad_norm": 2.773860216140747, | |
| "learning_rate": 1.1316846986089644e-05, | |
| "loss": 3.2913, | |
| "mean_token_accuracy": 0.40134228840470315, | |
| "num_tokens": 8885781.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.8717156105100463, | |
| "grad_norm": 2.000077962875366, | |
| "learning_rate": 1.1285935085007729e-05, | |
| "loss": 3.1904, | |
| "mean_token_accuracy": 0.4104874156415462, | |
| "num_tokens": 8917898.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.874806800618238, | |
| "grad_norm": 1.8527454137802124, | |
| "learning_rate": 1.1255023183925813e-05, | |
| "loss": 3.2801, | |
| "mean_token_accuracy": 0.40994274243712425, | |
| "num_tokens": 8949837.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.8778979907264297, | |
| "grad_norm": 1.6382179260253906, | |
| "learning_rate": 1.1224111282843895e-05, | |
| "loss": 3.2767, | |
| "mean_token_accuracy": 0.4028928212821484, | |
| "num_tokens": 8984243.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.8809891808346213, | |
| "grad_norm": 0.9455551505088806, | |
| "learning_rate": 1.1193199381761978e-05, | |
| "loss": 3.3033, | |
| "mean_token_accuracy": 0.40776450037956236, | |
| "num_tokens": 9017527.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.884080370942813, | |
| "grad_norm": 1.8901394605636597, | |
| "learning_rate": 1.1162287480680063e-05, | |
| "loss": 3.1427, | |
| "mean_token_accuracy": 0.4225546672940254, | |
| "num_tokens": 9048607.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.8871715610510046, | |
| "grad_norm": 3.34131121635437, | |
| "learning_rate": 1.1131375579598147e-05, | |
| "loss": 3.2084, | |
| "mean_token_accuracy": 0.4111571215093136, | |
| "num_tokens": 9083312.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.8902627511591963, | |
| "grad_norm": 1.8717923164367676, | |
| "learning_rate": 1.1100463678516229e-05, | |
| "loss": 3.2198, | |
| "mean_token_accuracy": 0.4138743795454502, | |
| "num_tokens": 9114070.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.893353941267388, | |
| "grad_norm": 3.7949368953704834, | |
| "learning_rate": 1.1069551777434312e-05, | |
| "loss": 3.2532, | |
| "mean_token_accuracy": 0.4119734108448029, | |
| "num_tokens": 9143686.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.8964451313755796, | |
| "grad_norm": 1.2137411832809448, | |
| "learning_rate": 1.1038639876352398e-05, | |
| "loss": 3.2034, | |
| "mean_token_accuracy": 0.41725371927022936, | |
| "num_tokens": 9174271.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.8995363214837713, | |
| "grad_norm": 1.1149791479110718, | |
| "learning_rate": 1.100772797527048e-05, | |
| "loss": 3.2835, | |
| "mean_token_accuracy": 0.4074005588889122, | |
| "num_tokens": 9208354.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.9026275115919629, | |
| "grad_norm": 1.3814709186553955, | |
| "learning_rate": 1.0976816074188563e-05, | |
| "loss": 3.2484, | |
| "mean_token_accuracy": 0.4074396938085556, | |
| "num_tokens": 9238547.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.9057187017001546, | |
| "grad_norm": 1.3796359300613403, | |
| "learning_rate": 1.0945904173106647e-05, | |
| "loss": 3.1762, | |
| "mean_token_accuracy": 0.4161891110241413, | |
| "num_tokens": 9272451.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.9088098918083463, | |
| "grad_norm": 3.8864829540252686, | |
| "learning_rate": 1.0914992272024732e-05, | |
| "loss": 3.1503, | |
| "mean_token_accuracy": 0.42186372056603433, | |
| "num_tokens": 9303885.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.9119010819165378, | |
| "grad_norm": 1.6369949579238892, | |
| "learning_rate": 1.0884080370942814e-05, | |
| "loss": 3.2353, | |
| "mean_token_accuracy": 0.4133276604115963, | |
| "num_tokens": 9333763.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.9149922720247295, | |
| "grad_norm": 1.507304310798645, | |
| "learning_rate": 1.0853168469860898e-05, | |
| "loss": 3.2037, | |
| "mean_token_accuracy": 0.41107733473181723, | |
| "num_tokens": 9367354.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.9180834621329211, | |
| "grad_norm": 4.891152858734131, | |
| "learning_rate": 1.0822256568778981e-05, | |
| "loss": 3.2332, | |
| "mean_token_accuracy": 0.4125970214605331, | |
| "num_tokens": 9397706.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.9211746522411128, | |
| "grad_norm": 1.820520043373108, | |
| "learning_rate": 1.0791344667697063e-05, | |
| "loss": 3.2168, | |
| "mean_token_accuracy": 0.40903283953666686, | |
| "num_tokens": 9428549.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.9242658423493045, | |
| "grad_norm": 1.2856605052947998, | |
| "learning_rate": 1.0760432766615148e-05, | |
| "loss": 3.2704, | |
| "mean_token_accuracy": 0.4016518287360668, | |
| "num_tokens": 9462364.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.9273570324574961, | |
| "grad_norm": 1.1914364099502563, | |
| "learning_rate": 1.0729520865533232e-05, | |
| "loss": 3.2313, | |
| "mean_token_accuracy": 0.4183121621608734, | |
| "num_tokens": 9490314.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.9304482225656878, | |
| "grad_norm": 2.964503526687622, | |
| "learning_rate": 1.0698608964451314e-05, | |
| "loss": 3.2828, | |
| "mean_token_accuracy": 0.4028220146894455, | |
| "num_tokens": 9520119.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.9335394126738794, | |
| "grad_norm": 1.8764747381210327, | |
| "learning_rate": 1.0667697063369397e-05, | |
| "loss": 3.3497, | |
| "mean_token_accuracy": 0.40479681119322775, | |
| "num_tokens": 9549206.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.9366306027820711, | |
| "grad_norm": 2.1000001430511475, | |
| "learning_rate": 1.0636785162287481e-05, | |
| "loss": 3.2438, | |
| "mean_token_accuracy": 0.41448465660214423, | |
| "num_tokens": 9578662.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.9397217928902627, | |
| "grad_norm": 1.7962746620178223, | |
| "learning_rate": 1.0605873261205566e-05, | |
| "loss": 3.2039, | |
| "mean_token_accuracy": 0.41432305723428725, | |
| "num_tokens": 9611638.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.9428129829984544, | |
| "grad_norm": 1.2367525100708008, | |
| "learning_rate": 1.0574961360123648e-05, | |
| "loss": 3.2756, | |
| "mean_token_accuracy": 0.4112587310373783, | |
| "num_tokens": 9645086.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.9459041731066461, | |
| "grad_norm": 1.2295643091201782, | |
| "learning_rate": 1.0544049459041732e-05, | |
| "loss": 3.2597, | |
| "mean_token_accuracy": 0.41084068119525907, | |
| "num_tokens": 9676540.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.9489953632148377, | |
| "grad_norm": 1.836288571357727, | |
| "learning_rate": 1.0513137557959815e-05, | |
| "loss": 3.1906, | |
| "mean_token_accuracy": 0.4185685083270073, | |
| "num_tokens": 9708023.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.9520865533230294, | |
| "grad_norm": 1.4680696725845337, | |
| "learning_rate": 1.0482225656877897e-05, | |
| "loss": 3.2044, | |
| "mean_token_accuracy": 0.4141043916344643, | |
| "num_tokens": 9741153.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.955177743431221, | |
| "grad_norm": 1.3278076648712158, | |
| "learning_rate": 1.0451313755795983e-05, | |
| "loss": 3.1744, | |
| "mean_token_accuracy": 0.41462502256035805, | |
| "num_tokens": 9772481.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.9582689335394127, | |
| "grad_norm": 2.042060136795044, | |
| "learning_rate": 1.0420401854714066e-05, | |
| "loss": 3.2346, | |
| "mean_token_accuracy": 0.4115324914455414, | |
| "num_tokens": 9802122.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.9613601236476044, | |
| "grad_norm": 1.6663880348205566, | |
| "learning_rate": 1.038948995363215e-05, | |
| "loss": 3.2437, | |
| "mean_token_accuracy": 0.411440496891737, | |
| "num_tokens": 9832001.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.964451313755796, | |
| "grad_norm": 1.2543443441390991, | |
| "learning_rate": 1.0358578052550232e-05, | |
| "loss": 3.2323, | |
| "mean_token_accuracy": 0.40699815154075625, | |
| "num_tokens": 9862830.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.9675425038639877, | |
| "grad_norm": 10.45780086517334, | |
| "learning_rate": 1.0327666151468317e-05, | |
| "loss": 3.2624, | |
| "mean_token_accuracy": 0.4110853001475334, | |
| "num_tokens": 9897540.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.9706336939721792, | |
| "grad_norm": 1.6870477199554443, | |
| "learning_rate": 1.02967542503864e-05, | |
| "loss": 3.2788, | |
| "mean_token_accuracy": 0.4078416295349598, | |
| "num_tokens": 9929002.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.973724884080371, | |
| "grad_norm": 1.3050179481506348, | |
| "learning_rate": 1.0265842349304482e-05, | |
| "loss": 3.1642, | |
| "mean_token_accuracy": 0.41117783561348914, | |
| "num_tokens": 9961618.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.9768160741885626, | |
| "grad_norm": 1.570807695388794, | |
| "learning_rate": 1.0234930448222566e-05, | |
| "loss": 3.2458, | |
| "mean_token_accuracy": 0.4177241921424866, | |
| "num_tokens": 9991255.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.9799072642967542, | |
| "grad_norm": 3.6920664310455322, | |
| "learning_rate": 1.0204018547140651e-05, | |
| "loss": 3.2963, | |
| "mean_token_accuracy": 0.4077574260532856, | |
| "num_tokens": 10024237.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.9829984544049459, | |
| "grad_norm": 4.246991157531738, | |
| "learning_rate": 1.0173106646058733e-05, | |
| "loss": 3.2326, | |
| "mean_token_accuracy": 0.41490627601742747, | |
| "num_tokens": 10057244.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.9860896445131375, | |
| "grad_norm": 1.615694522857666, | |
| "learning_rate": 1.0142194744976817e-05, | |
| "loss": 3.2114, | |
| "mean_token_accuracy": 0.4113995648920536, | |
| "num_tokens": 10091258.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.9891808346213292, | |
| "grad_norm": 1.5819542407989502, | |
| "learning_rate": 1.01112828438949e-05, | |
| "loss": 3.0528, | |
| "mean_token_accuracy": 0.4291278474032879, | |
| "num_tokens": 10118722.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.9922720247295209, | |
| "grad_norm": 4.877267360687256, | |
| "learning_rate": 1.0080370942812986e-05, | |
| "loss": 3.3058, | |
| "mean_token_accuracy": 0.40104425325989723, | |
| "num_tokens": 10149898.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.9953632148377125, | |
| "grad_norm": 1.5499932765960693, | |
| "learning_rate": 1.0049459041731067e-05, | |
| "loss": 3.2483, | |
| "mean_token_accuracy": 0.41600828766822817, | |
| "num_tokens": 10180542.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.9984544049459042, | |
| "grad_norm": 3.9266202449798584, | |
| "learning_rate": 1.0018547140649151e-05, | |
| "loss": 3.2672, | |
| "mean_token_accuracy": 0.41103068739175797, | |
| "num_tokens": 10211504.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 1.001545595054096, | |
| "grad_norm": 2.134188413619995, | |
| "learning_rate": 9.987635239567235e-06, | |
| "loss": 3.1624, | |
| "mean_token_accuracy": 0.4230015531182289, | |
| "num_tokens": 10239942.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.0046367851622875, | |
| "grad_norm": 1.5933483839035034, | |
| "learning_rate": 9.956723338485318e-06, | |
| "loss": 3.1625, | |
| "mean_token_accuracy": 0.41645141169428823, | |
| "num_tokens": 10274555.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 1.007727975270479, | |
| "grad_norm": 1.4850564002990723, | |
| "learning_rate": 9.925811437403402e-06, | |
| "loss": 3.2296, | |
| "mean_token_accuracy": 0.41107223033905027, | |
| "num_tokens": 10308953.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.010819165378671, | |
| "grad_norm": 1.2016632556915283, | |
| "learning_rate": 9.894899536321485e-06, | |
| "loss": 3.1902, | |
| "mean_token_accuracy": 0.41538654640316963, | |
| "num_tokens": 10343548.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 1.0139103554868625, | |
| "grad_norm": 1.4952160120010376, | |
| "learning_rate": 9.863987635239567e-06, | |
| "loss": 3.2712, | |
| "mean_token_accuracy": 0.4115736290812492, | |
| "num_tokens": 10376226.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.017001545595054, | |
| "grad_norm": 1.2473570108413696, | |
| "learning_rate": 9.83307573415765e-06, | |
| "loss": 3.1719, | |
| "mean_token_accuracy": 0.4191995531320572, | |
| "num_tokens": 10407408.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 1.0200927357032457, | |
| "grad_norm": 1.5981847047805786, | |
| "learning_rate": 9.802163833075734e-06, | |
| "loss": 3.1756, | |
| "mean_token_accuracy": 0.4126800112426281, | |
| "num_tokens": 10439332.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.0231839258114375, | |
| "grad_norm": 1.6504404544830322, | |
| "learning_rate": 9.771251931993818e-06, | |
| "loss": 3.2747, | |
| "mean_token_accuracy": 0.4050762981176376, | |
| "num_tokens": 10468931.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 1.026275115919629, | |
| "grad_norm": 1.2191509008407593, | |
| "learning_rate": 9.740340030911902e-06, | |
| "loss": 3.1862, | |
| "mean_token_accuracy": 0.4132268287241459, | |
| "num_tokens": 10501965.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 1.0293663060278206, | |
| "grad_norm": 1.6509348154067993, | |
| "learning_rate": 9.709428129829985e-06, | |
| "loss": 3.2574, | |
| "mean_token_accuracy": 0.40723603740334513, | |
| "num_tokens": 10532613.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 1.0324574961360125, | |
| "grad_norm": 1.8046737909317017, | |
| "learning_rate": 9.678516228748069e-06, | |
| "loss": 3.2028, | |
| "mean_token_accuracy": 0.4177181996405125, | |
| "num_tokens": 10562323.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 1.035548686244204, | |
| "grad_norm": 1.156565546989441, | |
| "learning_rate": 9.647604327666152e-06, | |
| "loss": 3.2189, | |
| "mean_token_accuracy": 0.4168844804167747, | |
| "num_tokens": 10592216.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 1.0386398763523956, | |
| "grad_norm": 1.5646038055419922, | |
| "learning_rate": 9.616692426584236e-06, | |
| "loss": 3.239, | |
| "mean_token_accuracy": 0.40751678571105004, | |
| "num_tokens": 10624766.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 1.0417310664605872, | |
| "grad_norm": 1.1149756908416748, | |
| "learning_rate": 9.58578052550232e-06, | |
| "loss": 3.0939, | |
| "mean_token_accuracy": 0.42675758227705957, | |
| "num_tokens": 10653981.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 1.044822256568779, | |
| "grad_norm": 1.9262531995773315, | |
| "learning_rate": 9.554868624420403e-06, | |
| "loss": 3.1622, | |
| "mean_token_accuracy": 0.41861816495656967, | |
| "num_tokens": 10685402.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 1.0479134466769706, | |
| "grad_norm": 1.5397542715072632, | |
| "learning_rate": 9.523956723338487e-06, | |
| "loss": 3.2075, | |
| "mean_token_accuracy": 0.4133962944149971, | |
| "num_tokens": 10719468.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 1.0510046367851622, | |
| "grad_norm": 2.140308141708374, | |
| "learning_rate": 9.49304482225657e-06, | |
| "loss": 3.2143, | |
| "mean_token_accuracy": 0.41401648372411726, | |
| "num_tokens": 10750774.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.054095826893354, | |
| "grad_norm": 1.6291162967681885, | |
| "learning_rate": 9.462132921174652e-06, | |
| "loss": 3.2932, | |
| "mean_token_accuracy": 0.4043385870754719, | |
| "num_tokens": 10782891.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 1.0571870170015456, | |
| "grad_norm": 1.2068161964416504, | |
| "learning_rate": 9.431221020092737e-06, | |
| "loss": 3.2038, | |
| "mean_token_accuracy": 0.4151684492826462, | |
| "num_tokens": 10814759.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 1.0602782071097372, | |
| "grad_norm": 1.3435413837432861, | |
| "learning_rate": 9.40030911901082e-06, | |
| "loss": 3.2655, | |
| "mean_token_accuracy": 0.41177373975515363, | |
| "num_tokens": 10846021.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 1.063369397217929, | |
| "grad_norm": 1.1493477821350098, | |
| "learning_rate": 9.369397217928905e-06, | |
| "loss": 3.186, | |
| "mean_token_accuracy": 0.4216080687940121, | |
| "num_tokens": 10877787.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 1.0664605873261206, | |
| "grad_norm": 1.7987961769104004, | |
| "learning_rate": 9.338485316846986e-06, | |
| "loss": 3.2626, | |
| "mean_token_accuracy": 0.4110354706645012, | |
| "num_tokens": 10906159.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 1.0695517774343122, | |
| "grad_norm": 1.6236494779586792, | |
| "learning_rate": 9.30757341576507e-06, | |
| "loss": 3.2564, | |
| "mean_token_accuracy": 0.4106232084333897, | |
| "num_tokens": 10934948.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 1.0726429675425038, | |
| "grad_norm": 2.3455023765563965, | |
| "learning_rate": 9.276661514683154e-06, | |
| "loss": 3.133, | |
| "mean_token_accuracy": 0.4222550518810749, | |
| "num_tokens": 10967516.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 1.0757341576506956, | |
| "grad_norm": 1.278497576713562, | |
| "learning_rate": 9.245749613601237e-06, | |
| "loss": 3.2399, | |
| "mean_token_accuracy": 0.4149567700922489, | |
| "num_tokens": 10999132.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 1.0788253477588872, | |
| "grad_norm": 2.0221869945526123, | |
| "learning_rate": 9.21483771251932e-06, | |
| "loss": 3.2568, | |
| "mean_token_accuracy": 0.4081104606389999, | |
| "num_tokens": 11031589.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 1.0819165378670788, | |
| "grad_norm": 37.65148162841797, | |
| "learning_rate": 9.183925811437404e-06, | |
| "loss": 3.2473, | |
| "mean_token_accuracy": 0.4095830604434013, | |
| "num_tokens": 11059575.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.0850077279752706, | |
| "grad_norm": 4.20313835144043, | |
| "learning_rate": 9.153013910355486e-06, | |
| "loss": 3.143, | |
| "mean_token_accuracy": 0.426883215457201, | |
| "num_tokens": 11091120.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 1.0880989180834622, | |
| "grad_norm": 3.1230990886688232, | |
| "learning_rate": 9.122102009273572e-06, | |
| "loss": 3.3042, | |
| "mean_token_accuracy": 0.4000666797161102, | |
| "num_tokens": 11123837.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 1.0911901081916537, | |
| "grad_norm": 0.9987094402313232, | |
| "learning_rate": 9.091190108191653e-06, | |
| "loss": 3.1809, | |
| "mean_token_accuracy": 0.4136147178709507, | |
| "num_tokens": 11154747.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 1.0942812982998453, | |
| "grad_norm": 2.0009162425994873, | |
| "learning_rate": 9.060278207109739e-06, | |
| "loss": 3.2353, | |
| "mean_token_accuracy": 0.40693147107958794, | |
| "num_tokens": 11190794.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 1.0973724884080371, | |
| "grad_norm": 2.4473884105682373, | |
| "learning_rate": 9.02936630602782e-06, | |
| "loss": 3.1614, | |
| "mean_token_accuracy": 0.42429070770740507, | |
| "num_tokens": 11219554.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 1.1004636785162287, | |
| "grad_norm": 1.1736706495285034, | |
| "learning_rate": 8.998454404945904e-06, | |
| "loss": 3.2173, | |
| "mean_token_accuracy": 0.41124544814229014, | |
| "num_tokens": 11252636.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 1.1035548686244203, | |
| "grad_norm": 1.163642168045044, | |
| "learning_rate": 8.967542503863988e-06, | |
| "loss": 3.1291, | |
| "mean_token_accuracy": 0.4211157590150833, | |
| "num_tokens": 11282896.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 1.1066460587326121, | |
| "grad_norm": 1.362874984741211, | |
| "learning_rate": 8.936630602782071e-06, | |
| "loss": 3.2384, | |
| "mean_token_accuracy": 0.41210982352495196, | |
| "num_tokens": 11315528.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 1.1097372488408037, | |
| "grad_norm": 1.640885829925537, | |
| "learning_rate": 8.905718701700155e-06, | |
| "loss": 3.2508, | |
| "mean_token_accuracy": 0.40889245420694353, | |
| "num_tokens": 11345228.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 1.1128284389489953, | |
| "grad_norm": 1.803788661956787, | |
| "learning_rate": 8.874806800618239e-06, | |
| "loss": 3.1479, | |
| "mean_token_accuracy": 0.42345268800854685, | |
| "num_tokens": 11376476.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.1159196290571871, | |
| "grad_norm": 3.247323751449585, | |
| "learning_rate": 8.843894899536322e-06, | |
| "loss": 3.1825, | |
| "mean_token_accuracy": 0.4189397856593132, | |
| "num_tokens": 11406589.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 1.1190108191653787, | |
| "grad_norm": 1.748858094215393, | |
| "learning_rate": 8.812982998454406e-06, | |
| "loss": 3.2348, | |
| "mean_token_accuracy": 0.41790731623768806, | |
| "num_tokens": 11435694.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 1.1221020092735703, | |
| "grad_norm": 1.1326861381530762, | |
| "learning_rate": 8.78207109737249e-06, | |
| "loss": 3.2358, | |
| "mean_token_accuracy": 0.4107509456574917, | |
| "num_tokens": 11465245.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 1.125193199381762, | |
| "grad_norm": 1.4564932584762573, | |
| "learning_rate": 8.751159196290573e-06, | |
| "loss": 3.2268, | |
| "mean_token_accuracy": 0.41108732894062994, | |
| "num_tokens": 11498154.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 1.1282843894899537, | |
| "grad_norm": 0.9113560914993286, | |
| "learning_rate": 8.720247295208657e-06, | |
| "loss": 3.1874, | |
| "mean_token_accuracy": 0.41293532848358155, | |
| "num_tokens": 11531872.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 1.1313755795981453, | |
| "grad_norm": 1.6753915548324585, | |
| "learning_rate": 8.68933539412674e-06, | |
| "loss": 3.1935, | |
| "mean_token_accuracy": 0.4139715678989887, | |
| "num_tokens": 11565247.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.1344667697063369, | |
| "grad_norm": 1.6150254011154175, | |
| "learning_rate": 8.658423493044824e-06, | |
| "loss": 3.2335, | |
| "mean_token_accuracy": 0.41677759736776354, | |
| "num_tokens": 11594001.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 1.1375579598145287, | |
| "grad_norm": 1.0996955633163452, | |
| "learning_rate": 8.627511591962906e-06, | |
| "loss": 3.2328, | |
| "mean_token_accuracy": 0.40990992560982703, | |
| "num_tokens": 11626151.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 1.1406491499227203, | |
| "grad_norm": 2.7414052486419678, | |
| "learning_rate": 8.59659969088099e-06, | |
| "loss": 3.2059, | |
| "mean_token_accuracy": 0.41417448669672013, | |
| "num_tokens": 11655585.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 1.1437403400309119, | |
| "grad_norm": 1.7157678604125977, | |
| "learning_rate": 8.565687789799073e-06, | |
| "loss": 3.2184, | |
| "mean_token_accuracy": 0.4183143936097622, | |
| "num_tokens": 11688319.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.1468315301391034, | |
| "grad_norm": 1.458961009979248, | |
| "learning_rate": 8.534775888717158e-06, | |
| "loss": 3.2694, | |
| "mean_token_accuracy": 0.4085992857813835, | |
| "num_tokens": 11718421.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 1.1499227202472952, | |
| "grad_norm": 1.2802034616470337, | |
| "learning_rate": 8.50386398763524e-06, | |
| "loss": 3.2114, | |
| "mean_token_accuracy": 0.41477348655462265, | |
| "num_tokens": 11750217.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 1.1530139103554868, | |
| "grad_norm": 1.8998056650161743, | |
| "learning_rate": 8.472952086553323e-06, | |
| "loss": 3.1341, | |
| "mean_token_accuracy": 0.42046748399734496, | |
| "num_tokens": 11783326.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 1.1561051004636784, | |
| "grad_norm": 1.2350112199783325, | |
| "learning_rate": 8.442040185471407e-06, | |
| "loss": 3.2182, | |
| "mean_token_accuracy": 0.41114275753498075, | |
| "num_tokens": 11814954.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 1.1591962905718702, | |
| "grad_norm": 1.5678590536117554, | |
| "learning_rate": 8.41112828438949e-06, | |
| "loss": 3.3188, | |
| "mean_token_accuracy": 0.40804709047079085, | |
| "num_tokens": 11848958.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 1.1622874806800618, | |
| "grad_norm": 1.5195876359939575, | |
| "learning_rate": 8.380216383307574e-06, | |
| "loss": 3.2479, | |
| "mean_token_accuracy": 0.40802566707134247, | |
| "num_tokens": 11880748.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 1.1653786707882534, | |
| "grad_norm": 2.050419807434082, | |
| "learning_rate": 8.349304482225658e-06, | |
| "loss": 3.1546, | |
| "mean_token_accuracy": 0.42246685177087784, | |
| "num_tokens": 11910387.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 1.1684698608964452, | |
| "grad_norm": 1.7234852313995361, | |
| "learning_rate": 8.31839258114374e-06, | |
| "loss": 3.221, | |
| "mean_token_accuracy": 0.4118661187589169, | |
| "num_tokens": 11939685.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 1.1715610510046368, | |
| "grad_norm": 1.4194177389144897, | |
| "learning_rate": 8.287480680061825e-06, | |
| "loss": 3.2305, | |
| "mean_token_accuracy": 0.4091548278927803, | |
| "num_tokens": 11971214.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.1746522411128284, | |
| "grad_norm": 1.0113506317138672, | |
| "learning_rate": 8.256568778979907e-06, | |
| "loss": 3.2137, | |
| "mean_token_accuracy": 0.41168191134929655, | |
| "num_tokens": 12003031.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.1777434312210202, | |
| "grad_norm": 2.5737476348876953, | |
| "learning_rate": 8.225656877897992e-06, | |
| "loss": 3.1441, | |
| "mean_token_accuracy": 0.42300955280661584, | |
| "num_tokens": 12033250.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.1808346213292118, | |
| "grad_norm": 2.5177934169769287, | |
| "learning_rate": 8.194744976816074e-06, | |
| "loss": 3.2089, | |
| "mean_token_accuracy": 0.41499723494052887, | |
| "num_tokens": 12064537.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.1839258114374034, | |
| "grad_norm": 1.1910068988800049, | |
| "learning_rate": 8.16383307573416e-06, | |
| "loss": 3.2179, | |
| "mean_token_accuracy": 0.41548130139708517, | |
| "num_tokens": 12096056.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.187017001545595, | |
| "grad_norm": 1.5878336429595947, | |
| "learning_rate": 8.132921174652241e-06, | |
| "loss": 3.144, | |
| "mean_token_accuracy": 0.40882509499788283, | |
| "num_tokens": 12131664.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.1901081916537868, | |
| "grad_norm": 1.3621925115585327, | |
| "learning_rate": 8.102009273570325e-06, | |
| "loss": 3.1344, | |
| "mean_token_accuracy": 0.4220642536878586, | |
| "num_tokens": 12162170.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.1931993817619784, | |
| "grad_norm": 3.2442736625671387, | |
| "learning_rate": 8.071097372488408e-06, | |
| "loss": 3.1784, | |
| "mean_token_accuracy": 0.41677999347448347, | |
| "num_tokens": 12192536.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.19629057187017, | |
| "grad_norm": 1.6372841596603394, | |
| "learning_rate": 8.040185471406492e-06, | |
| "loss": 3.1393, | |
| "mean_token_accuracy": 0.42606005966663363, | |
| "num_tokens": 12222253.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.1993817619783615, | |
| "grad_norm": 6.679258823394775, | |
| "learning_rate": 8.009273570324576e-06, | |
| "loss": 3.1493, | |
| "mean_token_accuracy": 0.4194886885583401, | |
| "num_tokens": 12253757.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.2024729520865534, | |
| "grad_norm": 2.9235403537750244, | |
| "learning_rate": 7.97836166924266e-06, | |
| "loss": 3.1981, | |
| "mean_token_accuracy": 0.41685143783688544, | |
| "num_tokens": 12285504.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.205564142194745, | |
| "grad_norm": 3.680112838745117, | |
| "learning_rate": 7.947449768160743e-06, | |
| "loss": 3.2044, | |
| "mean_token_accuracy": 0.41487403139472007, | |
| "num_tokens": 12317649.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.2086553323029365, | |
| "grad_norm": 4.2484283447265625, | |
| "learning_rate": 7.916537867078826e-06, | |
| "loss": 3.2147, | |
| "mean_token_accuracy": 0.4154091864824295, | |
| "num_tokens": 12349158.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.2117465224111283, | |
| "grad_norm": 1.0454447269439697, | |
| "learning_rate": 7.88562596599691e-06, | |
| "loss": 3.2005, | |
| "mean_token_accuracy": 0.4116973325610161, | |
| "num_tokens": 12381883.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.21483771251932, | |
| "grad_norm": 1.5970138311386108, | |
| "learning_rate": 7.854714064914994e-06, | |
| "loss": 3.2037, | |
| "mean_token_accuracy": 0.41303489953279493, | |
| "num_tokens": 12417101.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.2179289026275115, | |
| "grad_norm": 1.4790903329849243, | |
| "learning_rate": 7.823802163833077e-06, | |
| "loss": 3.1911, | |
| "mean_token_accuracy": 0.41875759288668635, | |
| "num_tokens": 12446858.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.2210200927357033, | |
| "grad_norm": 2.267620801925659, | |
| "learning_rate": 7.792890262751159e-06, | |
| "loss": 3.128, | |
| "mean_token_accuracy": 0.4224107012152672, | |
| "num_tokens": 12477693.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.224111282843895, | |
| "grad_norm": 13.008003234863281, | |
| "learning_rate": 7.761978361669244e-06, | |
| "loss": 3.1414, | |
| "mean_token_accuracy": 0.4242011792957783, | |
| "num_tokens": 12508183.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.2272024729520865, | |
| "grad_norm": 2.0274596214294434, | |
| "learning_rate": 7.731066460587326e-06, | |
| "loss": 3.1405, | |
| "mean_token_accuracy": 0.41859717667102814, | |
| "num_tokens": 12540160.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.2302936630602783, | |
| "grad_norm": 1.9864860773086548, | |
| "learning_rate": 7.70015455950541e-06, | |
| "loss": 3.1902, | |
| "mean_token_accuracy": 0.42011781483888627, | |
| "num_tokens": 12573537.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.23338485316847, | |
| "grad_norm": 2.4452314376831055, | |
| "learning_rate": 7.669242658423493e-06, | |
| "loss": 3.2631, | |
| "mean_token_accuracy": 0.40838914439082147, | |
| "num_tokens": 12605953.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.2364760432766615, | |
| "grad_norm": 0.847280740737915, | |
| "learning_rate": 7.638330757341577e-06, | |
| "loss": 3.1697, | |
| "mean_token_accuracy": 0.4186425693333149, | |
| "num_tokens": 12639106.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.239567233384853, | |
| "grad_norm": 1.1845167875289917, | |
| "learning_rate": 7.6074188562596605e-06, | |
| "loss": 3.2143, | |
| "mean_token_accuracy": 0.41409589275717734, | |
| "num_tokens": 12669010.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.242658423493045, | |
| "grad_norm": 17.06900405883789, | |
| "learning_rate": 7.576506955177744e-06, | |
| "loss": 3.122, | |
| "mean_token_accuracy": 0.4226062521338463, | |
| "num_tokens": 12699833.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.2457496136012365, | |
| "grad_norm": 1.4105889797210693, | |
| "learning_rate": 7.545595054095828e-06, | |
| "loss": 3.2082, | |
| "mean_token_accuracy": 0.408999927341938, | |
| "num_tokens": 12732842.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.248840803709428, | |
| "grad_norm": 2.478212356567383, | |
| "learning_rate": 7.51468315301391e-06, | |
| "loss": 3.3335, | |
| "mean_token_accuracy": 0.39411759525537493, | |
| "num_tokens": 12764506.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.2519319938176197, | |
| "grad_norm": 1.3297427892684937, | |
| "learning_rate": 7.483771251931995e-06, | |
| "loss": 3.1968, | |
| "mean_token_accuracy": 0.409882578253746, | |
| "num_tokens": 12797376.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.2550231839258115, | |
| "grad_norm": 2.156684637069702, | |
| "learning_rate": 7.4528593508500776e-06, | |
| "loss": 3.2096, | |
| "mean_token_accuracy": 0.4163349486887455, | |
| "num_tokens": 12831070.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.258114374034003, | |
| "grad_norm": 0.8902810215950012, | |
| "learning_rate": 7.421947449768161e-06, | |
| "loss": 3.0776, | |
| "mean_token_accuracy": 0.42983465269207954, | |
| "num_tokens": 12860548.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.2612055641421946, | |
| "grad_norm": 1.5070351362228394, | |
| "learning_rate": 7.391035548686245e-06, | |
| "loss": 3.2621, | |
| "mean_token_accuracy": 0.41044663786888125, | |
| "num_tokens": 12895303.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.2642967542503865, | |
| "grad_norm": 1.4987707138061523, | |
| "learning_rate": 7.360123647604328e-06, | |
| "loss": 3.27, | |
| "mean_token_accuracy": 0.40910629704594614, | |
| "num_tokens": 12926442.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.267387944358578, | |
| "grad_norm": 1.4421848058700562, | |
| "learning_rate": 7.329211746522412e-06, | |
| "loss": 3.2123, | |
| "mean_token_accuracy": 0.4119173936545849, | |
| "num_tokens": 12958744.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.2704791344667696, | |
| "grad_norm": 1.8334916830062866, | |
| "learning_rate": 7.2982998454404955e-06, | |
| "loss": 3.2069, | |
| "mean_token_accuracy": 0.4157085955142975, | |
| "num_tokens": 12990543.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.2735703245749614, | |
| "grad_norm": 1.4132410287857056, | |
| "learning_rate": 7.267387944358578e-06, | |
| "loss": 3.2062, | |
| "mean_token_accuracy": 0.4211029835045338, | |
| "num_tokens": 13020509.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.276661514683153, | |
| "grad_norm": 1.6009533405303955, | |
| "learning_rate": 7.236476043276663e-06, | |
| "loss": 3.1636, | |
| "mean_token_accuracy": 0.41871346086263656, | |
| "num_tokens": 13052226.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.2797527047913446, | |
| "grad_norm": 2.6064114570617676, | |
| "learning_rate": 7.205564142194745e-06, | |
| "loss": 3.1769, | |
| "mean_token_accuracy": 0.4199406482279301, | |
| "num_tokens": 13084155.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.2828438948995364, | |
| "grad_norm": 1.20345938205719, | |
| "learning_rate": 7.17465224111283e-06, | |
| "loss": 3.2276, | |
| "mean_token_accuracy": 0.41344398483633993, | |
| "num_tokens": 13117148.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.285935085007728, | |
| "grad_norm": 3.0620017051696777, | |
| "learning_rate": 7.143740340030913e-06, | |
| "loss": 3.219, | |
| "mean_token_accuracy": 0.4165691465139389, | |
| "num_tokens": 13148216.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.2890262751159196, | |
| "grad_norm": 1.6213176250457764, | |
| "learning_rate": 7.112828438948995e-06, | |
| "loss": 3.183, | |
| "mean_token_accuracy": 0.41394164860248567, | |
| "num_tokens": 13178115.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.2921174652241114, | |
| "grad_norm": 2.890545129776001, | |
| "learning_rate": 7.08191653786708e-06, | |
| "loss": 3.2399, | |
| "mean_token_accuracy": 0.41289833262562753, | |
| "num_tokens": 13209452.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.295208655332303, | |
| "grad_norm": 1.3374779224395752, | |
| "learning_rate": 7.0510046367851625e-06, | |
| "loss": 3.1967, | |
| "mean_token_accuracy": 0.4149262882769108, | |
| "num_tokens": 13240399.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.2982998454404946, | |
| "grad_norm": 5.228854656219482, | |
| "learning_rate": 7.020092735703247e-06, | |
| "loss": 3.2154, | |
| "mean_token_accuracy": 0.4165704995393753, | |
| "num_tokens": 13273212.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.3013910355486862, | |
| "grad_norm": 1.3633702993392944, | |
| "learning_rate": 6.98918083462133e-06, | |
| "loss": 3.1915, | |
| "mean_token_accuracy": 0.41528667509555817, | |
| "num_tokens": 13306852.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.3044822256568778, | |
| "grad_norm": 1.2878068685531616, | |
| "learning_rate": 6.958268933539414e-06, | |
| "loss": 3.1694, | |
| "mean_token_accuracy": 0.4169937312602997, | |
| "num_tokens": 13340676.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.3075734157650696, | |
| "grad_norm": 1.3162099123001099, | |
| "learning_rate": 6.927357032457497e-06, | |
| "loss": 3.142, | |
| "mean_token_accuracy": 0.4265909008681774, | |
| "num_tokens": 13371414.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.3106646058732612, | |
| "grad_norm": 3.4883854389190674, | |
| "learning_rate": 6.8964451313755796e-06, | |
| "loss": 3.1395, | |
| "mean_token_accuracy": 0.42550636306405065, | |
| "num_tokens": 13403328.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.3137557959814528, | |
| "grad_norm": 2.3180267810821533, | |
| "learning_rate": 6.865533230293664e-06, | |
| "loss": 3.2024, | |
| "mean_token_accuracy": 0.4202399365603924, | |
| "num_tokens": 13434597.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.3168469860896446, | |
| "grad_norm": 1.3776673078536987, | |
| "learning_rate": 6.834621329211747e-06, | |
| "loss": 3.1493, | |
| "mean_token_accuracy": 0.4209834337234497, | |
| "num_tokens": 13463779.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.3199381761978362, | |
| "grad_norm": 1.028826355934143, | |
| "learning_rate": 6.803709428129831e-06, | |
| "loss": 3.2035, | |
| "mean_token_accuracy": 0.41408500224351885, | |
| "num_tokens": 13494776.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.3230293663060277, | |
| "grad_norm": 1.686012625694275, | |
| "learning_rate": 6.772797527047914e-06, | |
| "loss": 3.2745, | |
| "mean_token_accuracy": 0.41270035356283186, | |
| "num_tokens": 13529851.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.3261205564142196, | |
| "grad_norm": 1.3231185674667358, | |
| "learning_rate": 6.741885625965997e-06, | |
| "loss": 3.2056, | |
| "mean_token_accuracy": 0.4130039505660534, | |
| "num_tokens": 13564424.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.3292117465224111, | |
| "grad_norm": 1.856342077255249, | |
| "learning_rate": 6.710973724884081e-06, | |
| "loss": 3.1407, | |
| "mean_token_accuracy": 0.4156997807323933, | |
| "num_tokens": 13595883.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.3323029366306027, | |
| "grad_norm": 1.5125768184661865, | |
| "learning_rate": 6.680061823802164e-06, | |
| "loss": 3.1297, | |
| "mean_token_accuracy": 0.4260141022503376, | |
| "num_tokens": 13628882.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.3353941267387945, | |
| "grad_norm": 3.0248773097991943, | |
| "learning_rate": 6.649149922720248e-06, | |
| "loss": 3.2258, | |
| "mean_token_accuracy": 0.4115126602351665, | |
| "num_tokens": 13660324.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.3384853168469861, | |
| "grad_norm": 2.414133310317993, | |
| "learning_rate": 6.618238021638331e-06, | |
| "loss": 3.1616, | |
| "mean_token_accuracy": 0.4194211043417454, | |
| "num_tokens": 13692530.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.3415765069551777, | |
| "grad_norm": 1.5425348281860352, | |
| "learning_rate": 6.5873261205564146e-06, | |
| "loss": 3.2394, | |
| "mean_token_accuracy": 0.4154033727943897, | |
| "num_tokens": 13725306.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.3446676970633695, | |
| "grad_norm": 1.8113696575164795, | |
| "learning_rate": 6.556414219474498e-06, | |
| "loss": 3.2223, | |
| "mean_token_accuracy": 0.4164779372513294, | |
| "num_tokens": 13758945.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.3477588871715611, | |
| "grad_norm": 1.9818004369735718, | |
| "learning_rate": 6.525502318392582e-06, | |
| "loss": 3.1813, | |
| "mean_token_accuracy": 0.41521124318242075, | |
| "num_tokens": 13791607.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.3508500772797527, | |
| "grad_norm": 1.7219854593276978, | |
| "learning_rate": 6.494590417310665e-06, | |
| "loss": 3.1954, | |
| "mean_token_accuracy": 0.40949844419956205, | |
| "num_tokens": 13822651.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.3539412673879443, | |
| "grad_norm": 2.069145441055298, | |
| "learning_rate": 6.463678516228749e-06, | |
| "loss": 3.1196, | |
| "mean_token_accuracy": 0.42854725793004034, | |
| "num_tokens": 13852169.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.3570324574961359, | |
| "grad_norm": 1.8879189491271973, | |
| "learning_rate": 6.432766615146832e-06, | |
| "loss": 3.1188, | |
| "mean_token_accuracy": 0.4243581973016262, | |
| "num_tokens": 13883678.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.3601236476043277, | |
| "grad_norm": 1.1765724420547485, | |
| "learning_rate": 6.401854714064915e-06, | |
| "loss": 3.1485, | |
| "mean_token_accuracy": 0.41584895700216296, | |
| "num_tokens": 13918991.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.3632148377125193, | |
| "grad_norm": 1.3428053855895996, | |
| "learning_rate": 6.370942812982999e-06, | |
| "loss": 3.1697, | |
| "mean_token_accuracy": 0.41857780367136, | |
| "num_tokens": 13953094.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.3663060278207109, | |
| "grad_norm": 1.49298894405365, | |
| "learning_rate": 6.340030911901082e-06, | |
| "loss": 3.1357, | |
| "mean_token_accuracy": 0.4209234081208706, | |
| "num_tokens": 13982945.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.3693972179289027, | |
| "grad_norm": 1.0709565877914429, | |
| "learning_rate": 6.309119010819166e-06, | |
| "loss": 3.2582, | |
| "mean_token_accuracy": 0.4108918808400631, | |
| "num_tokens": 14014071.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.3724884080370943, | |
| "grad_norm": 2.4061436653137207, | |
| "learning_rate": 6.27820710973725e-06, | |
| "loss": 3.1506, | |
| "mean_token_accuracy": 0.4154247589409351, | |
| "num_tokens": 14042486.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.3755795981452859, | |
| "grad_norm": 1.2331550121307373, | |
| "learning_rate": 6.247295208655333e-06, | |
| "loss": 3.1823, | |
| "mean_token_accuracy": 0.4121369063854218, | |
| "num_tokens": 14073272.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.3786707882534777, | |
| "grad_norm": 1.5143426656723022, | |
| "learning_rate": 6.216383307573416e-06, | |
| "loss": 3.1427, | |
| "mean_token_accuracy": 0.41784567162394526, | |
| "num_tokens": 14103341.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.3817619783616693, | |
| "grad_norm": 1.448833703994751, | |
| "learning_rate": 6.1854714064915e-06, | |
| "loss": 3.2622, | |
| "mean_token_accuracy": 0.40576266273856165, | |
| "num_tokens": 14134972.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.3848531684698608, | |
| "grad_norm": 0.988325834274292, | |
| "learning_rate": 6.154559505409583e-06, | |
| "loss": 3.1114, | |
| "mean_token_accuracy": 0.428489201515913, | |
| "num_tokens": 14166647.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.3879443585780527, | |
| "grad_norm": 1.2479734420776367, | |
| "learning_rate": 6.1236476043276675e-06, | |
| "loss": 3.1902, | |
| "mean_token_accuracy": 0.4144682168960571, | |
| "num_tokens": 14199779.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.3910355486862442, | |
| "grad_norm": 4.43798303604126, | |
| "learning_rate": 6.09273570324575e-06, | |
| "loss": 3.1645, | |
| "mean_token_accuracy": 0.4205217458307743, | |
| "num_tokens": 14229188.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.3941267387944358, | |
| "grad_norm": 2.4214413166046143, | |
| "learning_rate": 6.061823802163833e-06, | |
| "loss": 3.2059, | |
| "mean_token_accuracy": 0.4073712095618248, | |
| "num_tokens": 14263707.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.3972179289026276, | |
| "grad_norm": 1.08336341381073, | |
| "learning_rate": 6.030911901081917e-06, | |
| "loss": 3.1831, | |
| "mean_token_accuracy": 0.42334684580564497, | |
| "num_tokens": 14294877.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.4003091190108192, | |
| "grad_norm": 1.0653190612792969, | |
| "learning_rate": 6e-06, | |
| "loss": 3.1047, | |
| "mean_token_accuracy": 0.42504297345876696, | |
| "num_tokens": 14325959.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.4034003091190108, | |
| "grad_norm": 1.7035560607910156, | |
| "learning_rate": 5.969088098918085e-06, | |
| "loss": 3.3124, | |
| "mean_token_accuracy": 0.4030896335840225, | |
| "num_tokens": 14358216.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.4064914992272024, | |
| "grad_norm": 1.1060764789581299, | |
| "learning_rate": 5.938176197836167e-06, | |
| "loss": 3.2152, | |
| "mean_token_accuracy": 0.4193869881331921, | |
| "num_tokens": 14391366.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.409582689335394, | |
| "grad_norm": 1.7701034545898438, | |
| "learning_rate": 5.90726429675425e-06, | |
| "loss": 3.2466, | |
| "mean_token_accuracy": 0.4122362986207008, | |
| "num_tokens": 14421925.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.4126738794435858, | |
| "grad_norm": 1.8544549942016602, | |
| "learning_rate": 5.8763523956723345e-06, | |
| "loss": 3.1179, | |
| "mean_token_accuracy": 0.42258076667785643, | |
| "num_tokens": 14451856.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.4157650695517774, | |
| "grad_norm": 2.212045431137085, | |
| "learning_rate": 5.845440494590417e-06, | |
| "loss": 3.364, | |
| "mean_token_accuracy": 0.4010948471724987, | |
| "num_tokens": 14483118.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.418856259659969, | |
| "grad_norm": 1.5026146173477173, | |
| "learning_rate": 5.814528593508502e-06, | |
| "loss": 3.19, | |
| "mean_token_accuracy": 0.4188863389194012, | |
| "num_tokens": 14513476.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.4219474497681608, | |
| "grad_norm": 1.5973678827285767, | |
| "learning_rate": 5.783616692426584e-06, | |
| "loss": 3.0433, | |
| "mean_token_accuracy": 0.44073015078902245, | |
| "num_tokens": 14539496.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.4250386398763524, | |
| "grad_norm": 1.7628804445266724, | |
| "learning_rate": 5.752704791344668e-06, | |
| "loss": 3.2062, | |
| "mean_token_accuracy": 0.4122942849993706, | |
| "num_tokens": 14569765.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.428129829984544, | |
| "grad_norm": 1.5680344104766846, | |
| "learning_rate": 5.7217928902627516e-06, | |
| "loss": 3.1586, | |
| "mean_token_accuracy": 0.42099211886525156, | |
| "num_tokens": 14600216.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.4312210200927358, | |
| "grad_norm": 1.244256615638733, | |
| "learning_rate": 5.690880989180835e-06, | |
| "loss": 3.1328, | |
| "mean_token_accuracy": 0.42461210340261457, | |
| "num_tokens": 14634376.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.4343122102009274, | |
| "grad_norm": 1.2207525968551636, | |
| "learning_rate": 5.659969088098919e-06, | |
| "loss": 3.1971, | |
| "mean_token_accuracy": 0.4105593167245388, | |
| "num_tokens": 14669227.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.437403400309119, | |
| "grad_norm": 1.2037540674209595, | |
| "learning_rate": 5.6290571870170015e-06, | |
| "loss": 3.1483, | |
| "mean_token_accuracy": 0.4223948784172535, | |
| "num_tokens": 14699394.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.4404945904173108, | |
| "grad_norm": 1.5292669534683228, | |
| "learning_rate": 5.598145285935086e-06, | |
| "loss": 3.2663, | |
| "mean_token_accuracy": 0.4052347682416439, | |
| "num_tokens": 14732089.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.4435857805255023, | |
| "grad_norm": 1.7726776599884033, | |
| "learning_rate": 5.567233384853169e-06, | |
| "loss": 3.2756, | |
| "mean_token_accuracy": 0.4116224706172943, | |
| "num_tokens": 14761533.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.446676970633694, | |
| "grad_norm": 1.3190699815750122, | |
| "learning_rate": 5.536321483771252e-06, | |
| "loss": 3.0779, | |
| "mean_token_accuracy": 0.43598859906196596, | |
| "num_tokens": 14789931.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.4497681607418857, | |
| "grad_norm": 1.1200242042541504, | |
| "learning_rate": 5.505409582689336e-06, | |
| "loss": 3.1462, | |
| "mean_token_accuracy": 0.42209191918373107, | |
| "num_tokens": 14819843.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.4528593508500773, | |
| "grad_norm": 1.8637281656265259, | |
| "learning_rate": 5.474497681607419e-06, | |
| "loss": 3.2485, | |
| "mean_token_accuracy": 0.40834289118647576, | |
| "num_tokens": 14852186.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.455950540958269, | |
| "grad_norm": 1.0149264335632324, | |
| "learning_rate": 5.443585780525503e-06, | |
| "loss": 3.1762, | |
| "mean_token_accuracy": 0.4144597060978413, | |
| "num_tokens": 14885812.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.4590417310664605, | |
| "grad_norm": 1.3329026699066162, | |
| "learning_rate": 5.412673879443587e-06, | |
| "loss": 3.1562, | |
| "mean_token_accuracy": 0.42381680980324743, | |
| "num_tokens": 14915638.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.4621329211746523, | |
| "grad_norm": 1.2801613807678223, | |
| "learning_rate": 5.381761978361669e-06, | |
| "loss": 3.2291, | |
| "mean_token_accuracy": 0.4082433968782425, | |
| "num_tokens": 14948608.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.465224111282844, | |
| "grad_norm": 1.522076964378357, | |
| "learning_rate": 5.350850077279754e-06, | |
| "loss": 3.1562, | |
| "mean_token_accuracy": 0.42298023998737333, | |
| "num_tokens": 14979493.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.4683153013910355, | |
| "grad_norm": 1.266901969909668, | |
| "learning_rate": 5.3199381761978365e-06, | |
| "loss": 3.0935, | |
| "mean_token_accuracy": 0.42193435728549955, | |
| "num_tokens": 15009695.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.471406491499227, | |
| "grad_norm": 2.0726027488708496, | |
| "learning_rate": 5.28902627511592e-06, | |
| "loss": 3.1655, | |
| "mean_token_accuracy": 0.41975434496998787, | |
| "num_tokens": 15042123.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.474497681607419, | |
| "grad_norm": 1.504747986793518, | |
| "learning_rate": 5.258114374034004e-06, | |
| "loss": 3.1425, | |
| "mean_token_accuracy": 0.42110041007399557, | |
| "num_tokens": 15074722.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.4775888717156105, | |
| "grad_norm": 1.510971188545227, | |
| "learning_rate": 5.227202472952086e-06, | |
| "loss": 3.1586, | |
| "mean_token_accuracy": 0.4183110870420933, | |
| "num_tokens": 15103293.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.480680061823802, | |
| "grad_norm": 3.7153401374816895, | |
| "learning_rate": 5.196290571870171e-06, | |
| "loss": 3.1719, | |
| "mean_token_accuracy": 0.4164193421602249, | |
| "num_tokens": 15136915.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.4837712519319939, | |
| "grad_norm": 1.9305976629257202, | |
| "learning_rate": 5.1653786707882536e-06, | |
| "loss": 3.1606, | |
| "mean_token_accuracy": 0.4086958207190037, | |
| "num_tokens": 15170496.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.4868624420401855, | |
| "grad_norm": 1.2468324899673462, | |
| "learning_rate": 5.134466769706338e-06, | |
| "loss": 3.157, | |
| "mean_token_accuracy": 0.4179227910935879, | |
| "num_tokens": 15201560.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.489953632148377, | |
| "grad_norm": 1.8711183071136475, | |
| "learning_rate": 5.103554868624421e-06, | |
| "loss": 3.1446, | |
| "mean_token_accuracy": 0.42116508409380915, | |
| "num_tokens": 15233195.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.4930448222565689, | |
| "grad_norm": 1.30099618434906, | |
| "learning_rate": 5.0726429675425035e-06, | |
| "loss": 3.2033, | |
| "mean_token_accuracy": 0.4153247632086277, | |
| "num_tokens": 15266312.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.4961360123647605, | |
| "grad_norm": 2.5389606952667236, | |
| "learning_rate": 5.041731066460588e-06, | |
| "loss": 3.1667, | |
| "mean_token_accuracy": 0.4175982415676117, | |
| "num_tokens": 15296476.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.499227202472952, | |
| "grad_norm": 1.4658409357070923, | |
| "learning_rate": 5.010819165378671e-06, | |
| "loss": 3.1887, | |
| "mean_token_accuracy": 0.41349576637148855, | |
| "num_tokens": 15329787.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.5023183925811439, | |
| "grad_norm": 1.282163143157959, | |
| "learning_rate": 4.979907264296754e-06, | |
| "loss": 3.1846, | |
| "mean_token_accuracy": 0.4133185692131519, | |
| "num_tokens": 15364093.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.5054095826893354, | |
| "grad_norm": 1.9515365362167358, | |
| "learning_rate": 4.948995363214838e-06, | |
| "loss": 3.1212, | |
| "mean_token_accuracy": 0.424532825499773, | |
| "num_tokens": 15394102.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.508500772797527, | |
| "grad_norm": 1.0463203191757202, | |
| "learning_rate": 4.918083462132921e-06, | |
| "loss": 3.1278, | |
| "mean_token_accuracy": 0.42214716374874117, | |
| "num_tokens": 15425672.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.5115919629057188, | |
| "grad_norm": 2.7366933822631836, | |
| "learning_rate": 4.887171561051005e-06, | |
| "loss": 3.1837, | |
| "mean_token_accuracy": 0.41872839331626893, | |
| "num_tokens": 15457839.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.5146831530139102, | |
| "grad_norm": 1.0603899955749512, | |
| "learning_rate": 4.8562596599690886e-06, | |
| "loss": 3.1954, | |
| "mean_token_accuracy": 0.41243630051612856, | |
| "num_tokens": 15492367.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.517774343122102, | |
| "grad_norm": 1.2490918636322021, | |
| "learning_rate": 4.825347758887172e-06, | |
| "loss": 3.1728, | |
| "mean_token_accuracy": 0.4147431656718254, | |
| "num_tokens": 15524871.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.5208655332302936, | |
| "grad_norm": 1.474180817604065, | |
| "learning_rate": 4.794435857805255e-06, | |
| "loss": 3.0814, | |
| "mean_token_accuracy": 0.4267166741192341, | |
| "num_tokens": 15556521.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.5239567233384852, | |
| "grad_norm": 0.972766637802124, | |
| "learning_rate": 4.7635239567233385e-06, | |
| "loss": 3.0783, | |
| "mean_token_accuracy": 0.42358799651265144, | |
| "num_tokens": 15592437.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.527047913446677, | |
| "grad_norm": 1.0824300050735474, | |
| "learning_rate": 4.732612055641422e-06, | |
| "loss": 3.2082, | |
| "mean_token_accuracy": 0.4090407736599445, | |
| "num_tokens": 15625545.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.5301391035548686, | |
| "grad_norm": 1.0225831270217896, | |
| "learning_rate": 4.701700154559506e-06, | |
| "loss": 3.1231, | |
| "mean_token_accuracy": 0.423577306419611, | |
| "num_tokens": 15657233.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.5332302936630602, | |
| "grad_norm": 1.2960033416748047, | |
| "learning_rate": 4.670788253477589e-06, | |
| "loss": 3.2722, | |
| "mean_token_accuracy": 0.4066340148448944, | |
| "num_tokens": 15689926.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.536321483771252, | |
| "grad_norm": 13.400914192199707, | |
| "learning_rate": 4.639876352395673e-06, | |
| "loss": 3.1884, | |
| "mean_token_accuracy": 0.41838386580348014, | |
| "num_tokens": 15721424.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.5394126738794436, | |
| "grad_norm": 6.94038200378418, | |
| "learning_rate": 4.608964451313756e-06, | |
| "loss": 3.1619, | |
| "mean_token_accuracy": 0.41957569122314453, | |
| "num_tokens": 15749894.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.5425038639876352, | |
| "grad_norm": 2.4333202838897705, | |
| "learning_rate": 4.57805255023184e-06, | |
| "loss": 3.1165, | |
| "mean_token_accuracy": 0.42476601898670197, | |
| "num_tokens": 15782492.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.545595054095827, | |
| "grad_norm": 1.1471242904663086, | |
| "learning_rate": 4.547140649149923e-06, | |
| "loss": 3.2202, | |
| "mean_token_accuracy": 0.411971789598465, | |
| "num_tokens": 15816554.0, | |
| "step": 5000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6470, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9099448107669504.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |