| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0045125505311647, | |
| "eval_steps": 500, | |
| "global_step": 3996, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005264642286358936, | |
| "grad_norm": 14.54857063293457, | |
| "learning_rate": 0.00019977426636568851, | |
| "loss": 6.8005, | |
| "mean_token_accuracy": 0.8582593138728823, | |
| "num_tokens": 64246.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.010529284572717871, | |
| "grad_norm": 10.067983627319336, | |
| "learning_rate": 0.00019951091045899172, | |
| "loss": 1.7259, | |
| "mean_token_accuracy": 0.9386104983942849, | |
| "num_tokens": 124242.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.015793926859076806, | |
| "grad_norm": 34.51327896118164, | |
| "learning_rate": 0.00019924755455229495, | |
| "loss": 1.3296, | |
| "mean_token_accuracy": 0.9472483290093285, | |
| "num_tokens": 187129.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.021058569145435743, | |
| "grad_norm": 12.282989501953125, | |
| "learning_rate": 0.0001989841986455982, | |
| "loss": 1.0084, | |
| "mean_token_accuracy": 0.9634656320725169, | |
| "num_tokens": 248341.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.02632321143179468, | |
| "grad_norm": 9.929710388183594, | |
| "learning_rate": 0.00019872084273890144, | |
| "loss": 0.7327, | |
| "mean_token_accuracy": 0.9718545792358262, | |
| "num_tokens": 309429.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.03158785371815361, | |
| "grad_norm": 20.227418899536133, | |
| "learning_rate": 0.00019845748683220468, | |
| "loss": 0.8666, | |
| "mean_token_accuracy": 0.966648525425366, | |
| "num_tokens": 371076.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.03685249600451255, | |
| "grad_norm": 6.689088344573975, | |
| "learning_rate": 0.0001981941309255079, | |
| "loss": 0.7135, | |
| "mean_token_accuracy": 0.9689371681639126, | |
| "num_tokens": 432760.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.042117138290871485, | |
| "grad_norm": 11.68930721282959, | |
| "learning_rate": 0.00019793077501881114, | |
| "loss": 0.8719, | |
| "mean_token_accuracy": 0.9674729821937424, | |
| "num_tokens": 496149.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.04738178057723042, | |
| "grad_norm": 17.330333709716797, | |
| "learning_rate": 0.00019766741911211437, | |
| "loss": 0.7345, | |
| "mean_token_accuracy": 0.9703272080847195, | |
| "num_tokens": 562063.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.05264642286358936, | |
| "grad_norm": 22.020919799804688, | |
| "learning_rate": 0.00019740406320541763, | |
| "loss": 0.8587, | |
| "mean_token_accuracy": 0.9689001132334981, | |
| "num_tokens": 623449.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.057911065149948295, | |
| "grad_norm": 4.088195323944092, | |
| "learning_rate": 0.00019714070729872084, | |
| "loss": 0.4383, | |
| "mean_token_accuracy": 0.982292217867715, | |
| "num_tokens": 689814.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.06317570743630722, | |
| "grad_norm": 7.574158191680908, | |
| "learning_rate": 0.00019687735139202407, | |
| "loss": 0.4877, | |
| "mean_token_accuracy": 0.9801784849592617, | |
| "num_tokens": 752477.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.06844034972266616, | |
| "grad_norm": 8.776512145996094, | |
| "learning_rate": 0.00019661399548532733, | |
| "loss": 0.9338, | |
| "mean_token_accuracy": 0.9614966022116798, | |
| "num_tokens": 813601.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.0737049920090251, | |
| "grad_norm": 7.411587715148926, | |
| "learning_rate": 0.00019635063957863056, | |
| "loss": 0.6915, | |
| "mean_token_accuracy": 0.9757387105907712, | |
| "num_tokens": 876034.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.07896963429538403, | |
| "grad_norm": 9.888399124145508, | |
| "learning_rate": 0.0001960872836719338, | |
| "loss": 0.4982, | |
| "mean_token_accuracy": 0.979215474000999, | |
| "num_tokens": 939783.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.08423427658174297, | |
| "grad_norm": 35.44892501831055, | |
| "learning_rate": 0.00019582392776523703, | |
| "loss": 0.8836, | |
| "mean_token_accuracy": 0.973310401397092, | |
| "num_tokens": 1004068.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.0894989188681019, | |
| "grad_norm": 6.216617584228516, | |
| "learning_rate": 0.00019556057185854026, | |
| "loss": 0.8344, | |
| "mean_token_accuracy": 0.9695504722850663, | |
| "num_tokens": 1064611.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.09476356115446084, | |
| "grad_norm": 8.965764045715332, | |
| "learning_rate": 0.0001952972159518435, | |
| "loss": 0.5338, | |
| "mean_token_accuracy": 0.9761039987206459, | |
| "num_tokens": 1127218.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.10002820344081978, | |
| "grad_norm": 16.67660903930664, | |
| "learning_rate": 0.00019503386004514675, | |
| "loss": 0.6175, | |
| "mean_token_accuracy": 0.9744474536606244, | |
| "num_tokens": 1189388.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.10529284572717872, | |
| "grad_norm": 4.124516010284424, | |
| "learning_rate": 0.00019477050413844996, | |
| "loss": 0.539, | |
| "mean_token_accuracy": 0.9823912392769542, | |
| "num_tokens": 1247163.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.11055748801353765, | |
| "grad_norm": 5.809010982513428, | |
| "learning_rate": 0.0001945071482317532, | |
| "loss": 0.7526, | |
| "mean_token_accuracy": 0.9701232707926205, | |
| "num_tokens": 1307472.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.11582213029989659, | |
| "grad_norm": 8.480691909790039, | |
| "learning_rate": 0.00019424379232505645, | |
| "loss": 0.6834, | |
| "mean_token_accuracy": 0.9743446367127555, | |
| "num_tokens": 1365874.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.12108677258625553, | |
| "grad_norm": 9.879121780395508, | |
| "learning_rate": 0.00019398043641835969, | |
| "loss": 0.9584, | |
| "mean_token_accuracy": 0.9579050487705639, | |
| "num_tokens": 1440177.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.12635141487261445, | |
| "grad_norm": 6.907696723937988, | |
| "learning_rate": 0.00019371708051166292, | |
| "loss": 0.3608, | |
| "mean_token_accuracy": 0.9884324765631131, | |
| "num_tokens": 1500840.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.1316160571589734, | |
| "grad_norm": 8.442209243774414, | |
| "learning_rate": 0.00019345372460496615, | |
| "loss": 0.6454, | |
| "mean_token_accuracy": 0.9759144516927856, | |
| "num_tokens": 1562349.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.13688069944533232, | |
| "grad_norm": 3.8133952617645264, | |
| "learning_rate": 0.00019319036869826938, | |
| "loss": 0.5659, | |
| "mean_token_accuracy": 0.9775396840912955, | |
| "num_tokens": 1622203.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.14214534173169127, | |
| "grad_norm": 6.373013019561768, | |
| "learning_rate": 0.00019292701279157262, | |
| "loss": 0.4503, | |
| "mean_token_accuracy": 0.9862660957234246, | |
| "num_tokens": 1684790.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.1474099840180502, | |
| "grad_norm": 7.824089050292969, | |
| "learning_rate": 0.00019266365688487588, | |
| "loss": 0.502, | |
| "mean_token_accuracy": 0.9827718670879092, | |
| "num_tokens": 1750877.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.15267462630440914, | |
| "grad_norm": 3.7179150581359863, | |
| "learning_rate": 0.00019240030097817908, | |
| "loss": 0.4627, | |
| "mean_token_accuracy": 0.9793707515512194, | |
| "num_tokens": 1813575.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.15793926859076807, | |
| "grad_norm": 3.6602864265441895, | |
| "learning_rate": 0.0001921369450714823, | |
| "loss": 0.4731, | |
| "mean_token_accuracy": 0.9835461378097534, | |
| "num_tokens": 1873725.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.16320391087712702, | |
| "grad_norm": 2.5665481090545654, | |
| "learning_rate": 0.00019187358916478557, | |
| "loss": 0.5558, | |
| "mean_token_accuracy": 0.9767591782978603, | |
| "num_tokens": 1937688.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.16846855316348594, | |
| "grad_norm": 6.829793930053711, | |
| "learning_rate": 0.0001916102332580888, | |
| "loss": 0.5528, | |
| "mean_token_accuracy": 0.979142793587276, | |
| "num_tokens": 2002761.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.1737331954498449, | |
| "grad_norm": 4.044713020324707, | |
| "learning_rate": 0.00019134687735139204, | |
| "loss": 0.4845, | |
| "mean_token_accuracy": 0.982716862644468, | |
| "num_tokens": 2059723.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.1789978377362038, | |
| "grad_norm": 11.39633560180664, | |
| "learning_rate": 0.00019108352144469527, | |
| "loss": 0.5202, | |
| "mean_token_accuracy": 0.9801062345504761, | |
| "num_tokens": 2122504.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.18426248002256276, | |
| "grad_norm": 3.029054880142212, | |
| "learning_rate": 0.0001908201655379985, | |
| "loss": 0.4843, | |
| "mean_token_accuracy": 0.9803362818700927, | |
| "num_tokens": 2185406.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.18952712230892169, | |
| "grad_norm": 3.5702171325683594, | |
| "learning_rate": 0.00019055680963130174, | |
| "loss": 0.8407, | |
| "mean_token_accuracy": 0.9690316447189876, | |
| "num_tokens": 2245561.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.19479176459528064, | |
| "grad_norm": 4.639045715332031, | |
| "learning_rate": 0.00019029345372460497, | |
| "loss": 0.5357, | |
| "mean_token_accuracy": 0.9732738541705268, | |
| "num_tokens": 2306315.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.20005640688163956, | |
| "grad_norm": 2.1046903133392334, | |
| "learning_rate": 0.00019003009781790823, | |
| "loss": 0.4489, | |
| "mean_token_accuracy": 0.9792884513735771, | |
| "num_tokens": 2365821.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.2053210491679985, | |
| "grad_norm": 1.8757308721542358, | |
| "learning_rate": 0.00018976674191121143, | |
| "loss": 0.3658, | |
| "mean_token_accuracy": 0.9825292006134987, | |
| "num_tokens": 2427193.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.21058569145435743, | |
| "grad_norm": 11.889435768127441, | |
| "learning_rate": 0.00018950338600451467, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.9884258146796908, | |
| "num_tokens": 2492899.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.21585033374071635, | |
| "grad_norm": 5.143967151641846, | |
| "learning_rate": 0.00018924003009781793, | |
| "loss": 0.4093, | |
| "mean_token_accuracy": 0.979480759373733, | |
| "num_tokens": 2552913.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.2211149760270753, | |
| "grad_norm": 6.770453453063965, | |
| "learning_rate": 0.00018897667419112116, | |
| "loss": 0.5634, | |
| "mean_token_accuracy": 0.9833344774586814, | |
| "num_tokens": 2610484.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.22637961831343423, | |
| "grad_norm": 11.86190128326416, | |
| "learning_rate": 0.0001887133182844244, | |
| "loss": 0.4431, | |
| "mean_token_accuracy": 0.9816493753876004, | |
| "num_tokens": 2676514.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.23164426059979318, | |
| "grad_norm": 13.320977210998535, | |
| "learning_rate": 0.00018844996237772762, | |
| "loss": 0.5942, | |
| "mean_token_accuracy": 0.9759356592382703, | |
| "num_tokens": 2737171.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.2369089028861521, | |
| "grad_norm": 3.966480016708374, | |
| "learning_rate": 0.00018818660647103086, | |
| "loss": 0.5571, | |
| "mean_token_accuracy": 0.9825907392161233, | |
| "num_tokens": 2798583.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.24217354517251105, | |
| "grad_norm": 4.871067047119141, | |
| "learning_rate": 0.0001879232505643341, | |
| "loss": 0.4242, | |
| "mean_token_accuracy": 0.9838380888104439, | |
| "num_tokens": 2863816.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.24743818745886997, | |
| "grad_norm": 13.88951301574707, | |
| "learning_rate": 0.00018765989465763735, | |
| "loss": 0.3687, | |
| "mean_token_accuracy": 0.9865035521132606, | |
| "num_tokens": 2922479.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.2527028297452289, | |
| "grad_norm": 3.8883068561553955, | |
| "learning_rate": 0.00018739653875094055, | |
| "loss": 0.3592, | |
| "mean_token_accuracy": 0.9850486857550484, | |
| "num_tokens": 2983955.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.2579674720315879, | |
| "grad_norm": 9.375052452087402, | |
| "learning_rate": 0.00018713318284424379, | |
| "loss": 0.6825, | |
| "mean_token_accuracy": 0.9810899102262088, | |
| "num_tokens": 3049631.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.2632321143179468, | |
| "grad_norm": 3.961210012435913, | |
| "learning_rate": 0.00018686982693754705, | |
| "loss": 0.4222, | |
| "mean_token_accuracy": 0.9809500447341374, | |
| "num_tokens": 3109415.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2684967566043057, | |
| "grad_norm": 5.651891708374023, | |
| "learning_rate": 0.00018660647103085028, | |
| "loss": 0.5125, | |
| "mean_token_accuracy": 0.9830025434494019, | |
| "num_tokens": 3171068.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.27376139889066464, | |
| "grad_norm": 7.219700336456299, | |
| "learning_rate": 0.0001863431151241535, | |
| "loss": 0.2511, | |
| "mean_token_accuracy": 0.9893663280776569, | |
| "num_tokens": 3233528.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.2790260411770236, | |
| "grad_norm": 6.468575477600098, | |
| "learning_rate": 0.00018607975921745674, | |
| "loss": 0.3758, | |
| "mean_token_accuracy": 0.9862235305564744, | |
| "num_tokens": 3299894.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.28429068346338254, | |
| "grad_norm": 4.638998031616211, | |
| "learning_rate": 0.00018581640331075998, | |
| "loss": 0.6791, | |
| "mean_token_accuracy": 0.9745566131813186, | |
| "num_tokens": 3361996.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.28955532574974147, | |
| "grad_norm": 15.218616485595703, | |
| "learning_rate": 0.0001855530474040632, | |
| "loss": 0.797, | |
| "mean_token_accuracy": 0.9799516892858914, | |
| "num_tokens": 3423605.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.2948199680361004, | |
| "grad_norm": 10.657114028930664, | |
| "learning_rate": 0.00018528969149736647, | |
| "loss": 0.4383, | |
| "mean_token_accuracy": 0.982287561254842, | |
| "num_tokens": 3494913.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.30008461032245937, | |
| "grad_norm": 5.565535068511963, | |
| "learning_rate": 0.00018502633559066967, | |
| "loss": 0.4929, | |
| "mean_token_accuracy": 0.9823328203388623, | |
| "num_tokens": 3555741.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.3053492526088183, | |
| "grad_norm": 11.824295997619629, | |
| "learning_rate": 0.0001847629796839729, | |
| "loss": 0.4915, | |
| "mean_token_accuracy": 0.9843728531684194, | |
| "num_tokens": 3617067.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.3106138948951772, | |
| "grad_norm": 3.0792489051818848, | |
| "learning_rate": 0.00018449962377727617, | |
| "loss": 0.3579, | |
| "mean_token_accuracy": 0.985108135002, | |
| "num_tokens": 3679159.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.31587853718153613, | |
| "grad_norm": 4.57699728012085, | |
| "learning_rate": 0.0001842362678705794, | |
| "loss": 0.5547, | |
| "mean_token_accuracy": 0.9762242904731205, | |
| "num_tokens": 3738635.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.32114317946789506, | |
| "grad_norm": 2.911987066268921, | |
| "learning_rate": 0.00018397291196388263, | |
| "loss": 0.4215, | |
| "mean_token_accuracy": 0.9774217935545104, | |
| "num_tokens": 3800036.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.32640782175425403, | |
| "grad_norm": 0.9452666640281677, | |
| "learning_rate": 0.00018370955605718586, | |
| "loss": 0.3826, | |
| "mean_token_accuracy": 0.9784759293709483, | |
| "num_tokens": 3860759.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.33167246404061296, | |
| "grad_norm": 9.154603004455566, | |
| "learning_rate": 0.0001834462001504891, | |
| "loss": 0.8441, | |
| "mean_token_accuracy": 0.9765165512050901, | |
| "num_tokens": 3919165.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.3369371063269719, | |
| "grad_norm": 5.021421909332275, | |
| "learning_rate": 0.00018318284424379233, | |
| "loss": 0.4031, | |
| "mean_token_accuracy": 0.980344859617097, | |
| "num_tokens": 3981920.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.3422017486133308, | |
| "grad_norm": 3.1184449195861816, | |
| "learning_rate": 0.0001829194883370956, | |
| "loss": 0.208, | |
| "mean_token_accuracy": 0.9901526123285294, | |
| "num_tokens": 4045797.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.3474663908996898, | |
| "grad_norm": 1.3984616994857788, | |
| "learning_rate": 0.0001826561324303988, | |
| "loss": 0.4599, | |
| "mean_token_accuracy": 0.9853011731590543, | |
| "num_tokens": 4110637.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.3527310331860487, | |
| "grad_norm": 5.359049320220947, | |
| "learning_rate": 0.00018239277652370203, | |
| "loss": 0.3354, | |
| "mean_token_accuracy": 0.9861837976745197, | |
| "num_tokens": 4175019.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.3579956754724076, | |
| "grad_norm": 1.1707082986831665, | |
| "learning_rate": 0.0001821294206170053, | |
| "loss": 0.4807, | |
| "mean_token_accuracy": 0.9810051141040665, | |
| "num_tokens": 4235468.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.36326031775876655, | |
| "grad_norm": 1.4972728490829468, | |
| "learning_rate": 0.00018186606471030852, | |
| "loss": 0.3913, | |
| "mean_token_accuracy": 0.9782918233956609, | |
| "num_tokens": 4292871.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.3685249600451255, | |
| "grad_norm": 1.478913426399231, | |
| "learning_rate": 0.00018160270880361175, | |
| "loss": 0.2305, | |
| "mean_token_accuracy": 0.989903599023819, | |
| "num_tokens": 4352192.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.37378960233148445, | |
| "grad_norm": 1.8192346096038818, | |
| "learning_rate": 0.00018133935289691498, | |
| "loss": 0.5126, | |
| "mean_token_accuracy": 0.9809825590678624, | |
| "num_tokens": 4409310.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.37905424461784337, | |
| "grad_norm": 16.94274139404297, | |
| "learning_rate": 0.00018107599699021822, | |
| "loss": 0.5592, | |
| "mean_token_accuracy": 0.983451705958162, | |
| "num_tokens": 4474435.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.3843188869042023, | |
| "grad_norm": 6.541215896606445, | |
| "learning_rate": 0.00018081264108352145, | |
| "loss": 0.5675, | |
| "mean_token_accuracy": 0.9818735644221306, | |
| "num_tokens": 4534354.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 0.3895835291905613, | |
| "grad_norm": 4.328241348266602, | |
| "learning_rate": 0.00018054928517682468, | |
| "loss": 0.6128, | |
| "mean_token_accuracy": 0.9794616454413959, | |
| "num_tokens": 4601079.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.3948481714769202, | |
| "grad_norm": 6.170828342437744, | |
| "learning_rate": 0.00018028592927012794, | |
| "loss": 0.5336, | |
| "mean_token_accuracy": 0.9807506884847369, | |
| "num_tokens": 4663000.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.4001128137632791, | |
| "grad_norm": 1.9526742696762085, | |
| "learning_rate": 0.00018002257336343115, | |
| "loss": 0.5157, | |
| "mean_token_accuracy": 0.9808543622493744, | |
| "num_tokens": 4725037.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.40537745604963804, | |
| "grad_norm": 2.270033597946167, | |
| "learning_rate": 0.00017975921745673438, | |
| "loss": 0.5037, | |
| "mean_token_accuracy": 0.9824284464120865, | |
| "num_tokens": 4785585.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 0.410642098335997, | |
| "grad_norm": 1.9409046173095703, | |
| "learning_rate": 0.00017949586155003764, | |
| "loss": 0.2311, | |
| "mean_token_accuracy": 0.9882342325789588, | |
| "num_tokens": 4846950.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.41590674062235594, | |
| "grad_norm": 7.580455780029297, | |
| "learning_rate": 0.00017923250564334087, | |
| "loss": 0.3169, | |
| "mean_token_accuracy": 0.9886538844023433, | |
| "num_tokens": 4905467.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 0.42117138290871486, | |
| "grad_norm": 0.8075457215309143, | |
| "learning_rate": 0.0001789691497366441, | |
| "loss": 0.4776, | |
| "mean_token_accuracy": 0.9876182835016932, | |
| "num_tokens": 4966809.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4264360251950738, | |
| "grad_norm": 4.91216516494751, | |
| "learning_rate": 0.00017870579382994734, | |
| "loss": 0.544, | |
| "mean_token_accuracy": 0.9781337806156704, | |
| "num_tokens": 5027758.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 0.4317006674814327, | |
| "grad_norm": 3.2579822540283203, | |
| "learning_rate": 0.00017844243792325057, | |
| "loss": 0.4504, | |
| "mean_token_accuracy": 0.9829125489507403, | |
| "num_tokens": 5091525.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.4369653097677917, | |
| "grad_norm": 3.100821018218994, | |
| "learning_rate": 0.0001781790820165538, | |
| "loss": 0.41, | |
| "mean_token_accuracy": 0.981966610465731, | |
| "num_tokens": 5154269.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 0.4422299520541506, | |
| "grad_norm": 4.908782958984375, | |
| "learning_rate": 0.00017791572610985706, | |
| "loss": 0.4313, | |
| "mean_token_accuracy": 0.9870239070483616, | |
| "num_tokens": 5217416.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.44749459434050953, | |
| "grad_norm": 3.5250935554504395, | |
| "learning_rate": 0.00017765237020316027, | |
| "loss": 0.3928, | |
| "mean_token_accuracy": 0.9802717332329068, | |
| "num_tokens": 5274838.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.45275923662686846, | |
| "grad_norm": 4.7593889236450195, | |
| "learning_rate": 0.0001773890142964635, | |
| "loss": 0.4205, | |
| "mean_token_accuracy": 0.9890570204172816, | |
| "num_tokens": 5335614.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.45802387891322743, | |
| "grad_norm": 7.824479103088379, | |
| "learning_rate": 0.00017712565838976676, | |
| "loss": 0.413, | |
| "mean_token_accuracy": 0.9809066823550633, | |
| "num_tokens": 5395539.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 0.46328852119958636, | |
| "grad_norm": 4.103699684143066, | |
| "learning_rate": 0.00017686230248307, | |
| "loss": 0.4272, | |
| "mean_token_accuracy": 0.9875162328992572, | |
| "num_tokens": 5456180.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.4685531634859453, | |
| "grad_norm": 3.346459150314331, | |
| "learning_rate": 0.00017659894657637323, | |
| "loss": 0.3928, | |
| "mean_token_accuracy": 0.9815813388143267, | |
| "num_tokens": 5520087.0, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 0.4738178057723042, | |
| "grad_norm": 32.111202239990234, | |
| "learning_rate": 0.00017633559066967646, | |
| "loss": 0.3948, | |
| "mean_token_accuracy": 0.9823211793388639, | |
| "num_tokens": 5581277.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4790824480586632, | |
| "grad_norm": 10.885830879211426, | |
| "learning_rate": 0.0001760722347629797, | |
| "loss": 0.5193, | |
| "mean_token_accuracy": 0.9819483490926879, | |
| "num_tokens": 5640568.0, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 0.4843470903450221, | |
| "grad_norm": 2.700174331665039, | |
| "learning_rate": 0.00017580887885628292, | |
| "loss": 0.3007, | |
| "mean_token_accuracy": 0.9868088832923344, | |
| "num_tokens": 5706780.0, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.489611732631381, | |
| "grad_norm": 1.4396345615386963, | |
| "learning_rate": 0.00017554552294958618, | |
| "loss": 0.3752, | |
| "mean_token_accuracy": 0.9875728871141162, | |
| "num_tokens": 5767062.0, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 0.49487637491773995, | |
| "grad_norm": 3.3768303394317627, | |
| "learning_rate": 0.0001752821670428894, | |
| "loss": 0.5468, | |
| "mean_token_accuracy": 0.9852647270475116, | |
| "num_tokens": 5829744.0, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.5001410172040989, | |
| "grad_norm": 3.7110822200775146, | |
| "learning_rate": 0.00017501881113619262, | |
| "loss": 0.4116, | |
| "mean_token_accuracy": 0.9863748614277158, | |
| "num_tokens": 5892897.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.5054056594904578, | |
| "grad_norm": 7.288447856903076, | |
| "learning_rate": 0.00017475545522949588, | |
| "loss": 0.3753, | |
| "mean_token_accuracy": 0.9877457331333842, | |
| "num_tokens": 5956343.0, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.5106703017768168, | |
| "grad_norm": 5.511970043182373, | |
| "learning_rate": 0.0001744920993227991, | |
| "loss": 0.5806, | |
| "mean_token_accuracy": 0.9813916959932872, | |
| "num_tokens": 6015280.0, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 0.5159349440631757, | |
| "grad_norm": 2.800093650817871, | |
| "learning_rate": 0.00017422874341610235, | |
| "loss": 0.4278, | |
| "mean_token_accuracy": 0.9831469027059418, | |
| "num_tokens": 6073627.0, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.5211995863495347, | |
| "grad_norm": 4.792038440704346, | |
| "learning_rate": 0.00017396538750940558, | |
| "loss": 0.3394, | |
| "mean_token_accuracy": 0.9836582307304654, | |
| "num_tokens": 6136535.0, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 0.5264642286358936, | |
| "grad_norm": 1.4135181903839111, | |
| "learning_rate": 0.0001737020316027088, | |
| "loss": 0.2978, | |
| "mean_token_accuracy": 0.9872254133224487, | |
| "num_tokens": 6195695.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5317288709222525, | |
| "grad_norm": 1.5809706449508667, | |
| "learning_rate": 0.00017343867569601204, | |
| "loss": 0.3772, | |
| "mean_token_accuracy": 0.9874131977558136, | |
| "num_tokens": 6259179.0, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 0.5369935132086114, | |
| "grad_norm": 4.83329439163208, | |
| "learning_rate": 0.0001731753197893153, | |
| "loss": 0.3513, | |
| "mean_token_accuracy": 0.9856318128960473, | |
| "num_tokens": 6318980.0, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.5422581554949704, | |
| "grad_norm": 4.257256507873535, | |
| "learning_rate": 0.0001729119638826185, | |
| "loss": 0.3748, | |
| "mean_token_accuracy": 0.9859073907136917, | |
| "num_tokens": 6377902.0, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 0.5475227977813293, | |
| "grad_norm": 2.0148396492004395, | |
| "learning_rate": 0.00017264860797592174, | |
| "loss": 0.4356, | |
| "mean_token_accuracy": 0.9887675217219761, | |
| "num_tokens": 6441010.0, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.5527874400676882, | |
| "grad_norm": 12.911645889282227, | |
| "learning_rate": 0.00017238525206922497, | |
| "loss": 0.5471, | |
| "mean_token_accuracy": 0.9760884706463132, | |
| "num_tokens": 6502476.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.5580520823540472, | |
| "grad_norm": 5.5378737449646, | |
| "learning_rate": 0.00017212189616252823, | |
| "loss": 0.4676, | |
| "mean_token_accuracy": 0.9874839474047933, | |
| "num_tokens": 6567291.0, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.5633167246404062, | |
| "grad_norm": 6.218775272369385, | |
| "learning_rate": 0.00017185854025583147, | |
| "loss": 0.3836, | |
| "mean_token_accuracy": 0.985468885728291, | |
| "num_tokens": 6625578.0, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 0.5685813669267651, | |
| "grad_norm": 3.6012978553771973, | |
| "learning_rate": 0.0001715951843491347, | |
| "loss": 0.4657, | |
| "mean_token_accuracy": 0.9746146191443715, | |
| "num_tokens": 6684441.0, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.573846009213124, | |
| "grad_norm": 6.051623821258545, | |
| "learning_rate": 0.00017133182844243793, | |
| "loss": 0.2821, | |
| "mean_token_accuracy": 0.986162223986217, | |
| "num_tokens": 6747208.0, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 0.5791106514994829, | |
| "grad_norm": 6.143646240234375, | |
| "learning_rate": 0.00017106847253574116, | |
| "loss": 0.2502, | |
| "mean_token_accuracy": 0.9908400624990463, | |
| "num_tokens": 6809941.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.5843752937858419, | |
| "grad_norm": 3.526141881942749, | |
| "learning_rate": 0.0001708051166290444, | |
| "loss": 0.4888, | |
| "mean_token_accuracy": 0.9807415125625474, | |
| "num_tokens": 6871591.0, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 0.5896399360722008, | |
| "grad_norm": 6.934294700622559, | |
| "learning_rate": 0.00017054176072234766, | |
| "loss": 0.5856, | |
| "mean_token_accuracy": 0.9890980411853109, | |
| "num_tokens": 6933894.0, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.5949045783585597, | |
| "grad_norm": 3.7759432792663574, | |
| "learning_rate": 0.00017027840481565086, | |
| "loss": 0.6127, | |
| "mean_token_accuracy": 0.9797992152827126, | |
| "num_tokens": 6996862.0, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 0.6001692206449187, | |
| "grad_norm": 1.14512300491333, | |
| "learning_rate": 0.0001700150489089541, | |
| "loss": 0.5505, | |
| "mean_token_accuracy": 0.9833417545471873, | |
| "num_tokens": 7055168.0, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.6054338629312777, | |
| "grad_norm": 10.16712474822998, | |
| "learning_rate": 0.00016975169300225735, | |
| "loss": 0.205, | |
| "mean_token_accuracy": 0.99239603217159, | |
| "num_tokens": 7117841.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 0.6106985052176366, | |
| "grad_norm": 4.889142990112305, | |
| "learning_rate": 0.00016948833709556059, | |
| "loss": 0.5831, | |
| "mean_token_accuracy": 0.9765042832919529, | |
| "num_tokens": 7179097.0, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.6159631475039955, | |
| "grad_norm": 12.288302421569824, | |
| "learning_rate": 0.00016922498118886382, | |
| "loss": 0.3002, | |
| "mean_token_accuracy": 0.9869685673287937, | |
| "num_tokens": 7239704.0, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 0.6212277897903544, | |
| "grad_norm": 8.271245956420898, | |
| "learning_rate": 0.00016896162528216705, | |
| "loss": 0.2886, | |
| "mean_token_accuracy": 0.9869830970253263, | |
| "num_tokens": 7294327.0, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 0.6264924320767133, | |
| "grad_norm": 5.007772922515869, | |
| "learning_rate": 0.00016869826937547028, | |
| "loss": 0.3947, | |
| "mean_token_accuracy": 0.9871470332145691, | |
| "num_tokens": 7359189.0, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 0.6317570743630723, | |
| "grad_norm": 2.422032356262207, | |
| "learning_rate": 0.00016843491346877352, | |
| "loss": 0.3749, | |
| "mean_token_accuracy": 0.9881211455379214, | |
| "num_tokens": 7414653.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.6370217166494312, | |
| "grad_norm": 3.326697826385498, | |
| "learning_rate": 0.00016817155756207678, | |
| "loss": 0.3625, | |
| "mean_token_accuracy": 0.984813029212611, | |
| "num_tokens": 7479068.0, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 0.6422863589357901, | |
| "grad_norm": 2.6617870330810547, | |
| "learning_rate": 0.00016790820165537998, | |
| "loss": 0.243, | |
| "mean_token_accuracy": 0.9900081945317132, | |
| "num_tokens": 7539732.0, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 0.6475510012221491, | |
| "grad_norm": 5.4873552322387695, | |
| "learning_rate": 0.00016764484574868321, | |
| "loss": 0.3848, | |
| "mean_token_accuracy": 0.9846341354506356, | |
| "num_tokens": 7604810.0, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 0.6528156435085081, | |
| "grad_norm": 4.150392055511475, | |
| "learning_rate": 0.00016738148984198647, | |
| "loss": 0.5153, | |
| "mean_token_accuracy": 0.9825600971068654, | |
| "num_tokens": 7667204.0, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 0.658080285794867, | |
| "grad_norm": 3.989720344543457, | |
| "learning_rate": 0.0001671181339352897, | |
| "loss": 0.3861, | |
| "mean_token_accuracy": 0.9847170146448272, | |
| "num_tokens": 7730916.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 0.6633449280812259, | |
| "grad_norm": 4.056293964385986, | |
| "learning_rate": 0.00016685477802859294, | |
| "loss": 0.4463, | |
| "mean_token_accuracy": 0.9862849723015513, | |
| "num_tokens": 7794002.0, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 0.6686095703675848, | |
| "grad_norm": 8.739585876464844, | |
| "learning_rate": 0.00016659142212189617, | |
| "loss": 0.4678, | |
| "mean_token_accuracy": 0.9864655403154237, | |
| "num_tokens": 7854934.0, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 0.6738742126539438, | |
| "grad_norm": 0.729511559009552, | |
| "learning_rate": 0.0001663280662151994, | |
| "loss": 0.2778, | |
| "mean_token_accuracy": 0.9883942146386419, | |
| "num_tokens": 7916943.0, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.6791388549403027, | |
| "grad_norm": 7.289599895477295, | |
| "learning_rate": 0.00016606471030850264, | |
| "loss": 0.3439, | |
| "mean_token_accuracy": 0.9849940932222775, | |
| "num_tokens": 7980989.0, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 0.6844034972266616, | |
| "grad_norm": 3.160820722579956, | |
| "learning_rate": 0.0001658013544018059, | |
| "loss": 0.3914, | |
| "mean_token_accuracy": 0.9837205186486244, | |
| "num_tokens": 8040014.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.6896681395130206, | |
| "grad_norm": 6.414876937866211, | |
| "learning_rate": 0.0001655379984951091, | |
| "loss": 0.5513, | |
| "mean_token_accuracy": 0.9829057125108582, | |
| "num_tokens": 8104908.0, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 0.6949327817993796, | |
| "grad_norm": 2.485938787460327, | |
| "learning_rate": 0.00016527464258841233, | |
| "loss": 0.2446, | |
| "mean_token_accuracy": 0.9906999223998615, | |
| "num_tokens": 8163961.0, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 0.7001974240857385, | |
| "grad_norm": 10.552153587341309, | |
| "learning_rate": 0.0001650112866817156, | |
| "loss": 0.3388, | |
| "mean_token_accuracy": 0.9885075858661106, | |
| "num_tokens": 8226031.0, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 0.7054620663720974, | |
| "grad_norm": 5.112913608551025, | |
| "learning_rate": 0.00016474793077501883, | |
| "loss": 0.3849, | |
| "mean_token_accuracy": 0.9874296316078731, | |
| "num_tokens": 8284558.0, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 0.7107267086584563, | |
| "grad_norm": 3.028000593185425, | |
| "learning_rate": 0.00016448457486832206, | |
| "loss": 0.2842, | |
| "mean_token_accuracy": 0.9855868539639882, | |
| "num_tokens": 8345944.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 0.7159913509448153, | |
| "grad_norm": 7.714634895324707, | |
| "learning_rate": 0.0001642212189616253, | |
| "loss": 0.5605, | |
| "mean_token_accuracy": 0.9805406683257648, | |
| "num_tokens": 8407295.0, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 0.7212559932311742, | |
| "grad_norm": 5.177945613861084, | |
| "learning_rate": 0.00016395786305492852, | |
| "loss": 0.5029, | |
| "mean_token_accuracy": 0.9816750564745494, | |
| "num_tokens": 8471038.0, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 0.7265206355175331, | |
| "grad_norm": 1.9517511129379272, | |
| "learning_rate": 0.00016369450714823176, | |
| "loss": 0.2681, | |
| "mean_token_accuracy": 0.9889151624270848, | |
| "num_tokens": 8532994.0, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 0.731785277803892, | |
| "grad_norm": 4.55762243270874, | |
| "learning_rate": 0.000163431151241535, | |
| "loss": 0.2467, | |
| "mean_token_accuracy": 0.9897659484829221, | |
| "num_tokens": 8595693.0, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 0.737049920090251, | |
| "grad_norm": 4.429182052612305, | |
| "learning_rate": 0.00016316779533483825, | |
| "loss": 0.4387, | |
| "mean_token_accuracy": 0.9855312353798321, | |
| "num_tokens": 8654482.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.74231456237661, | |
| "grad_norm": 7.037961959838867, | |
| "learning_rate": 0.00016290443942814145, | |
| "loss": 0.3473, | |
| "mean_token_accuracy": 0.9889997733490807, | |
| "num_tokens": 8714030.0, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 0.7475792046629689, | |
| "grad_norm": 3.0206305980682373, | |
| "learning_rate": 0.0001626410835214447, | |
| "loss": 0.2398, | |
| "mean_token_accuracy": 0.9872059928519386, | |
| "num_tokens": 8778073.0, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 0.7528438469493278, | |
| "grad_norm": 1.796095848083496, | |
| "learning_rate": 0.00016237772761474795, | |
| "loss": 0.3203, | |
| "mean_token_accuracy": 0.98711987052645, | |
| "num_tokens": 8843739.0, | |
| "step": 1001 | |
| }, | |
| { | |
| "epoch": 0.7581084892356867, | |
| "grad_norm": 3.418198347091675, | |
| "learning_rate": 0.00016211437170805118, | |
| "loss": 0.4184, | |
| "mean_token_accuracy": 0.984727283673627, | |
| "num_tokens": 8907228.0, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 0.7633731315220457, | |
| "grad_norm": 4.245222568511963, | |
| "learning_rate": 0.0001618510158013544, | |
| "loss": 0.2533, | |
| "mean_token_accuracy": 0.9887216197592872, | |
| "num_tokens": 8963623.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 0.7686377738084046, | |
| "grad_norm": 2.08406138420105, | |
| "learning_rate": 0.00016158765989465764, | |
| "loss": 0.2624, | |
| "mean_token_accuracy": 0.9887272076947349, | |
| "num_tokens": 9024728.0, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 0.7739024160947635, | |
| "grad_norm": 2.813343048095703, | |
| "learning_rate": 0.00016132430398796088, | |
| "loss": 0.397, | |
| "mean_token_accuracy": 0.987725670848574, | |
| "num_tokens": 9088032.0, | |
| "step": 1029 | |
| }, | |
| { | |
| "epoch": 0.7791670583811225, | |
| "grad_norm": 3.6622424125671387, | |
| "learning_rate": 0.0001610609480812641, | |
| "loss": 0.3668, | |
| "mean_token_accuracy": 0.9874992828283992, | |
| "num_tokens": 9148588.0, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 0.7844317006674815, | |
| "grad_norm": 2.5884439945220947, | |
| "learning_rate": 0.00016079759217456737, | |
| "loss": 0.4092, | |
| "mean_token_accuracy": 0.9852293921368462, | |
| "num_tokens": 9214674.0, | |
| "step": 1043 | |
| }, | |
| { | |
| "epoch": 0.7896963429538404, | |
| "grad_norm": 3.1072351932525635, | |
| "learning_rate": 0.00016053423626787058, | |
| "loss": 0.5184, | |
| "mean_token_accuracy": 0.980239770242146, | |
| "num_tokens": 9276035.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.7949609852401993, | |
| "grad_norm": 4.759675979614258, | |
| "learning_rate": 0.0001602708803611738, | |
| "loss": 0.3336, | |
| "mean_token_accuracy": 0.9884161640490804, | |
| "num_tokens": 9336657.0, | |
| "step": 1057 | |
| }, | |
| { | |
| "epoch": 0.8002256275265582, | |
| "grad_norm": 6.43472957611084, | |
| "learning_rate": 0.00016000752445447707, | |
| "loss": 0.3132, | |
| "mean_token_accuracy": 0.9860205660973277, | |
| "num_tokens": 9399352.0, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 0.8054902698129172, | |
| "grad_norm": 7.920260906219482, | |
| "learning_rate": 0.0001597441685477803, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.9875276652829987, | |
| "num_tokens": 9459142.0, | |
| "step": 1071 | |
| }, | |
| { | |
| "epoch": 0.8107549120992761, | |
| "grad_norm": 1.4461262226104736, | |
| "learning_rate": 0.00015948081264108353, | |
| "loss": 0.2892, | |
| "mean_token_accuracy": 0.9903218214000974, | |
| "num_tokens": 9518675.0, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 0.816019554385635, | |
| "grad_norm": 1.212272047996521, | |
| "learning_rate": 0.00015921745673438677, | |
| "loss": 0.2628, | |
| "mean_token_accuracy": 0.9898819465722356, | |
| "num_tokens": 9580940.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 0.821284196671994, | |
| "grad_norm": 6.286269664764404, | |
| "learning_rate": 0.00015895410082769, | |
| "loss": 0.4574, | |
| "mean_token_accuracy": 0.9877909858311925, | |
| "num_tokens": 9644375.0, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 0.826548838958353, | |
| "grad_norm": 3.9918956756591797, | |
| "learning_rate": 0.00015869074492099323, | |
| "loss": 0.4851, | |
| "mean_token_accuracy": 0.9860655079994883, | |
| "num_tokens": 9704997.0, | |
| "step": 1099 | |
| }, | |
| { | |
| "epoch": 0.8318134812447119, | |
| "grad_norm": 6.517810821533203, | |
| "learning_rate": 0.0001584273890142965, | |
| "loss": 0.3021, | |
| "mean_token_accuracy": 0.9874418069209371, | |
| "num_tokens": 9763654.0, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 0.8370781235310708, | |
| "grad_norm": 17.5554141998291, | |
| "learning_rate": 0.0001581640331075997, | |
| "loss": 0.5429, | |
| "mean_token_accuracy": 0.9781381785869598, | |
| "num_tokens": 9828703.0, | |
| "step": 1113 | |
| }, | |
| { | |
| "epoch": 0.8423427658174297, | |
| "grad_norm": 7.6073431968688965, | |
| "learning_rate": 0.00015790067720090293, | |
| "loss": 0.425, | |
| "mean_token_accuracy": 0.983854914350169, | |
| "num_tokens": 9888934.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.8476074081037887, | |
| "grad_norm": 2.790940523147583, | |
| "learning_rate": 0.0001576373212942062, | |
| "loss": 0.3905, | |
| "mean_token_accuracy": 0.9857985803059169, | |
| "num_tokens": 9954096.0, | |
| "step": 1127 | |
| }, | |
| { | |
| "epoch": 0.8528720503901476, | |
| "grad_norm": 3.5584349632263184, | |
| "learning_rate": 0.00015737396538750942, | |
| "loss": 0.3152, | |
| "mean_token_accuracy": 0.9878693467804364, | |
| "num_tokens": 10013567.0, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 0.8581366926765065, | |
| "grad_norm": 3.4691076278686523, | |
| "learning_rate": 0.00015711060948081265, | |
| "loss": 0.288, | |
| "mean_token_accuracy": 0.9880775543195861, | |
| "num_tokens": 10074992.0, | |
| "step": 1141 | |
| }, | |
| { | |
| "epoch": 0.8634013349628654, | |
| "grad_norm": 2.822650671005249, | |
| "learning_rate": 0.00015684725357411589, | |
| "loss": 0.322, | |
| "mean_token_accuracy": 0.9853216132947377, | |
| "num_tokens": 10141510.0, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 0.8686659772492245, | |
| "grad_norm": 5.202315807342529, | |
| "learning_rate": 0.00015658389766741912, | |
| "loss": 0.3071, | |
| "mean_token_accuracy": 0.9883281514048576, | |
| "num_tokens": 10205618.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 0.8739306195355834, | |
| "grad_norm": 6.426777362823486, | |
| "learning_rate": 0.00015632054176072235, | |
| "loss": 0.3244, | |
| "mean_token_accuracy": 0.9853953857507024, | |
| "num_tokens": 10268922.0, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 0.8791952618219423, | |
| "grad_norm": 2.698624849319458, | |
| "learning_rate": 0.0001560571858540256, | |
| "loss": 0.3603, | |
| "mean_token_accuracy": 0.9864558428525925, | |
| "num_tokens": 10332735.0, | |
| "step": 1169 | |
| }, | |
| { | |
| "epoch": 0.8844599041083012, | |
| "grad_norm": 1.197142243385315, | |
| "learning_rate": 0.00015579382994732882, | |
| "loss": 0.4445, | |
| "mean_token_accuracy": 0.9851714053324291, | |
| "num_tokens": 10392571.0, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 0.8897245463946601, | |
| "grad_norm": 6.625059127807617, | |
| "learning_rate": 0.00015553047404063205, | |
| "loss": 0.3128, | |
| "mean_token_accuracy": 0.9871101187808173, | |
| "num_tokens": 10451734.0, | |
| "step": 1183 | |
| }, | |
| { | |
| "epoch": 0.8949891886810191, | |
| "grad_norm": 5.674603462219238, | |
| "learning_rate": 0.0001552671181339353, | |
| "loss": 0.4159, | |
| "mean_token_accuracy": 0.9857557533042771, | |
| "num_tokens": 10513544.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.900253830967378, | |
| "grad_norm": 9.796287536621094, | |
| "learning_rate": 0.00015500376222723854, | |
| "loss": 0.2514, | |
| "mean_token_accuracy": 0.9887904301285744, | |
| "num_tokens": 10581197.0, | |
| "step": 1197 | |
| }, | |
| { | |
| "epoch": 0.9055184732537369, | |
| "grad_norm": 2.4740524291992188, | |
| "learning_rate": 0.00015474040632054177, | |
| "loss": 0.2875, | |
| "mean_token_accuracy": 0.9888958718095507, | |
| "num_tokens": 10649527.0, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 0.9107831155400959, | |
| "grad_norm": 1.7939389944076538, | |
| "learning_rate": 0.000154477050413845, | |
| "loss": 0.3445, | |
| "mean_token_accuracy": 0.9881748471941266, | |
| "num_tokens": 10709393.0, | |
| "step": 1211 | |
| }, | |
| { | |
| "epoch": 0.9160477578264549, | |
| "grad_norm": 5.42411470413208, | |
| "learning_rate": 0.00015421369450714824, | |
| "loss": 0.2091, | |
| "mean_token_accuracy": 0.991015836596489, | |
| "num_tokens": 10771667.0, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 0.9213124001128138, | |
| "grad_norm": 10.44039535522461, | |
| "learning_rate": 0.00015395033860045147, | |
| "loss": 0.2356, | |
| "mean_token_accuracy": 0.9929288434130805, | |
| "num_tokens": 10830996.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 0.9265770423991727, | |
| "grad_norm": 1.3637195825576782, | |
| "learning_rate": 0.0001536869826937547, | |
| "loss": 0.3027, | |
| "mean_token_accuracy": 0.9890089812023299, | |
| "num_tokens": 10891742.0, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 0.9318416846855316, | |
| "grad_norm": 9.645872116088867, | |
| "learning_rate": 0.00015342362678705796, | |
| "loss": 0.2897, | |
| "mean_token_accuracy": 0.9920872087989535, | |
| "num_tokens": 10951665.0, | |
| "step": 1239 | |
| }, | |
| { | |
| "epoch": 0.9371063269718906, | |
| "grad_norm": 4.489771366119385, | |
| "learning_rate": 0.00015316027088036117, | |
| "loss": 0.2861, | |
| "mean_token_accuracy": 0.9874759646398681, | |
| "num_tokens": 11012175.0, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 0.9423709692582495, | |
| "grad_norm": 3.856848955154419, | |
| "learning_rate": 0.0001528969149736644, | |
| "loss": 0.369, | |
| "mean_token_accuracy": 0.9833821335009166, | |
| "num_tokens": 11076432.0, | |
| "step": 1253 | |
| }, | |
| { | |
| "epoch": 0.9476356115446084, | |
| "grad_norm": 2.9677295684814453, | |
| "learning_rate": 0.00015263355906696766, | |
| "loss": 0.4231, | |
| "mean_token_accuracy": 0.9856973545891898, | |
| "num_tokens": 11140014.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.9529002538309673, | |
| "grad_norm": 5.096759796142578, | |
| "learning_rate": 0.0001523702031602709, | |
| "loss": 0.2352, | |
| "mean_token_accuracy": 0.9874877844538007, | |
| "num_tokens": 11203786.0, | |
| "step": 1267 | |
| }, | |
| { | |
| "epoch": 0.9581648961173264, | |
| "grad_norm": 1.4774101972579956, | |
| "learning_rate": 0.00015210684725357413, | |
| "loss": 0.446, | |
| "mean_token_accuracy": 0.9873196259140968, | |
| "num_tokens": 11262354.0, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 0.9634295384036853, | |
| "grad_norm": 3.908897638320923, | |
| "learning_rate": 0.00015184349134687736, | |
| "loss": 0.4166, | |
| "mean_token_accuracy": 0.9863720110484532, | |
| "num_tokens": 11327850.0, | |
| "step": 1281 | |
| }, | |
| { | |
| "epoch": 0.9686941806900442, | |
| "grad_norm": 1.6400980949401855, | |
| "learning_rate": 0.0001515801354401806, | |
| "loss": 0.443, | |
| "mean_token_accuracy": 0.9865829348564148, | |
| "num_tokens": 11389888.0, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 0.9739588229764031, | |
| "grad_norm": 2.286592721939087, | |
| "learning_rate": 0.00015131677953348382, | |
| "loss": 0.1958, | |
| "mean_token_accuracy": 0.9906766563653946, | |
| "num_tokens": 11448617.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 0.979223465262762, | |
| "grad_norm": 1.3930996656417847, | |
| "learning_rate": 0.00015105342362678708, | |
| "loss": 0.3306, | |
| "mean_token_accuracy": 0.989928315792765, | |
| "num_tokens": 11504992.0, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 0.984488107549121, | |
| "grad_norm": 3.627218723297119, | |
| "learning_rate": 0.0001507900677200903, | |
| "loss": 0.5234, | |
| "mean_token_accuracy": 0.9835356186543193, | |
| "num_tokens": 11567427.0, | |
| "step": 1309 | |
| }, | |
| { | |
| "epoch": 0.9897527498354799, | |
| "grad_norm": 0.9143754839897156, | |
| "learning_rate": 0.00015052671181339352, | |
| "loss": 0.1712, | |
| "mean_token_accuracy": 0.992523233805384, | |
| "num_tokens": 11633019.0, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 0.9950173921218388, | |
| "grad_norm": 4.690375804901123, | |
| "learning_rate": 0.00015026335590669678, | |
| "loss": 0.4812, | |
| "mean_token_accuracy": 0.9837278393762452, | |
| "num_tokens": 11699005.0, | |
| "step": 1323 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.6737651824951172, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 0.3557, | |
| "mean_token_accuracy": 0.9865049416164182, | |
| "num_tokens": 11757868.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.005264642286359, | |
| "grad_norm": 3.408721685409546, | |
| "learning_rate": 0.00014973664409330325, | |
| "loss": 0.325, | |
| "mean_token_accuracy": 0.9867892999734197, | |
| "num_tokens": 11818912.0, | |
| "step": 1337 | |
| }, | |
| { | |
| "epoch": 1.0105292845727178, | |
| "grad_norm": 1.5289093255996704, | |
| "learning_rate": 0.00014947328818660648, | |
| "loss": 0.3636, | |
| "mean_token_accuracy": 0.9868262644324984, | |
| "num_tokens": 11878588.0, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 1.0157939268590768, | |
| "grad_norm": 3.5710256099700928, | |
| "learning_rate": 0.0001492099322799097, | |
| "loss": 0.3356, | |
| "mean_token_accuracy": 0.9858624967081207, | |
| "num_tokens": 11937855.0, | |
| "step": 1351 | |
| }, | |
| { | |
| "epoch": 1.0210585691454357, | |
| "grad_norm": 2.1009042263031006, | |
| "learning_rate": 0.00014894657637321294, | |
| "loss": 0.1162, | |
| "mean_token_accuracy": 0.9943831084030015, | |
| "num_tokens": 11998055.0, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 1.0263232114317946, | |
| "grad_norm": 2.673617124557495, | |
| "learning_rate": 0.0001486832204665162, | |
| "loss": 0.2241, | |
| "mean_token_accuracy": 0.9894337984068053, | |
| "num_tokens": 12056178.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.0315878537181535, | |
| "grad_norm": 0.8607624173164368, | |
| "learning_rate": 0.0001484198645598194, | |
| "loss": 0.3341, | |
| "mean_token_accuracy": 0.9911763125232288, | |
| "num_tokens": 12115760.0, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 1.0368524960045125, | |
| "grad_norm": 5.409027576446533, | |
| "learning_rate": 0.00014815650865312264, | |
| "loss": 0.2366, | |
| "mean_token_accuracy": 0.9923227142010417, | |
| "num_tokens": 12180157.0, | |
| "step": 1379 | |
| }, | |
| { | |
| "epoch": 1.0421171382908714, | |
| "grad_norm": 5.06238317489624, | |
| "learning_rate": 0.0001478931527464259, | |
| "loss": 0.1718, | |
| "mean_token_accuracy": 0.9912227732794625, | |
| "num_tokens": 12245068.0, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 1.0473817805772305, | |
| "grad_norm": 4.035508632659912, | |
| "learning_rate": 0.00014762979683972913, | |
| "loss": 0.3846, | |
| "mean_token_accuracy": 0.9892959339278085, | |
| "num_tokens": 12305501.0, | |
| "step": 1393 | |
| }, | |
| { | |
| "epoch": 1.0526464228635894, | |
| "grad_norm": 1.7580567598342896, | |
| "learning_rate": 0.00014736644093303237, | |
| "loss": 0.2004, | |
| "mean_token_accuracy": 0.9921416980879647, | |
| "num_tokens": 12365127.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.0579110651499484, | |
| "grad_norm": 9.857763290405273, | |
| "learning_rate": 0.0001471030850263356, | |
| "loss": 0.4326, | |
| "mean_token_accuracy": 0.9865811956780297, | |
| "num_tokens": 12426111.0, | |
| "step": 1407 | |
| }, | |
| { | |
| "epoch": 1.0631757074363073, | |
| "grad_norm": 1.851088523864746, | |
| "learning_rate": 0.00014683972911963883, | |
| "loss": 0.2996, | |
| "mean_token_accuracy": 0.9852778209107262, | |
| "num_tokens": 12488034.0, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 1.0684403497226662, | |
| "grad_norm": 3.0664172172546387, | |
| "learning_rate": 0.00014657637321294206, | |
| "loss": 0.137, | |
| "mean_token_accuracy": 0.9941991950784411, | |
| "num_tokens": 12549856.0, | |
| "step": 1421 | |
| }, | |
| { | |
| "epoch": 1.0737049920090251, | |
| "grad_norm": 2.5788049697875977, | |
| "learning_rate": 0.00014631301730624532, | |
| "loss": 0.4694, | |
| "mean_token_accuracy": 0.9857755867498261, | |
| "num_tokens": 12616177.0, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 1.078969634295384, | |
| "grad_norm": 1.363619089126587, | |
| "learning_rate": 0.00014604966139954853, | |
| "loss": 0.2557, | |
| "mean_token_accuracy": 0.9904255930866513, | |
| "num_tokens": 12682645.0, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 1.084234276581743, | |
| "grad_norm": 3.019524574279785, | |
| "learning_rate": 0.00014578630549285176, | |
| "loss": 0.2535, | |
| "mean_token_accuracy": 0.9911204991596085, | |
| "num_tokens": 12740377.0, | |
| "step": 1442 | |
| }, | |
| { | |
| "epoch": 1.089498918868102, | |
| "grad_norm": 2.251373291015625, | |
| "learning_rate": 0.00014552294958615502, | |
| "loss": 0.188, | |
| "mean_token_accuracy": 0.9905956621680941, | |
| "num_tokens": 12797549.0, | |
| "step": 1449 | |
| }, | |
| { | |
| "epoch": 1.0947635611544608, | |
| "grad_norm": 2.197495222091675, | |
| "learning_rate": 0.00014525959367945825, | |
| "loss": 0.1749, | |
| "mean_token_accuracy": 0.9949398615530559, | |
| "num_tokens": 12858027.0, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 1.1000282034408198, | |
| "grad_norm": 4.309481143951416, | |
| "learning_rate": 0.0001449962377727615, | |
| "loss": 0.4218, | |
| "mean_token_accuracy": 0.982670929815088, | |
| "num_tokens": 12925847.0, | |
| "step": 1463 | |
| }, | |
| { | |
| "epoch": 1.1052928457271787, | |
| "grad_norm": 9.219832420349121, | |
| "learning_rate": 0.00014473288186606472, | |
| "loss": 0.1959, | |
| "mean_token_accuracy": 0.9937684174094882, | |
| "num_tokens": 12989703.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.1105574880135376, | |
| "grad_norm": 9.140572547912598, | |
| "learning_rate": 0.00014446952595936795, | |
| "loss": 0.3885, | |
| "mean_token_accuracy": 0.9879737379295486, | |
| "num_tokens": 13056094.0, | |
| "step": 1477 | |
| }, | |
| { | |
| "epoch": 1.1158221302998965, | |
| "grad_norm": 3.9148099422454834, | |
| "learning_rate": 0.00014420617005267118, | |
| "loss": 0.1446, | |
| "mean_token_accuracy": 0.9924962978277888, | |
| "num_tokens": 13112903.0, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 1.1210867725862554, | |
| "grad_norm": 0.3744335174560547, | |
| "learning_rate": 0.00014394281414597442, | |
| "loss": 0.2441, | |
| "mean_token_accuracy": 0.9917006982224328, | |
| "num_tokens": 13178395.0, | |
| "step": 1491 | |
| }, | |
| { | |
| "epoch": 1.1263514148726144, | |
| "grad_norm": 0.3156713843345642, | |
| "learning_rate": 0.00014367945823927768, | |
| "loss": 0.0885, | |
| "mean_token_accuracy": 0.9961930917842048, | |
| "num_tokens": 13241079.0, | |
| "step": 1498 | |
| }, | |
| { | |
| "epoch": 1.1316160571589733, | |
| "grad_norm": 0.5497475266456604, | |
| "learning_rate": 0.00014341610233258088, | |
| "loss": 0.147, | |
| "mean_token_accuracy": 0.9939297661185265, | |
| "num_tokens": 13304781.0, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 1.1368806994453324, | |
| "grad_norm": 0.22629879415035248, | |
| "learning_rate": 0.00014315274642588412, | |
| "loss": 0.3622, | |
| "mean_token_accuracy": 0.989835972232478, | |
| "num_tokens": 13360706.0, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 1.1421453417316914, | |
| "grad_norm": 1.5972377061843872, | |
| "learning_rate": 0.00014288939051918737, | |
| "loss": 0.2685, | |
| "mean_token_accuracy": 0.9913925932986396, | |
| "num_tokens": 13423752.0, | |
| "step": 1519 | |
| }, | |
| { | |
| "epoch": 1.1474099840180503, | |
| "grad_norm": 2.174511194229126, | |
| "learning_rate": 0.0001426260346124906, | |
| "loss": 0.2304, | |
| "mean_token_accuracy": 0.9914330286639077, | |
| "num_tokens": 13487380.0, | |
| "step": 1526 | |
| }, | |
| { | |
| "epoch": 1.1526746263044092, | |
| "grad_norm": 2.944056510925293, | |
| "learning_rate": 0.00014236267870579384, | |
| "loss": 0.3604, | |
| "mean_token_accuracy": 0.9890576909695353, | |
| "num_tokens": 13548212.0, | |
| "step": 1533 | |
| }, | |
| { | |
| "epoch": 1.1579392685907681, | |
| "grad_norm": 1.1673675775527954, | |
| "learning_rate": 0.00014209932279909707, | |
| "loss": 0.2486, | |
| "mean_token_accuracy": 0.9902358885322299, | |
| "num_tokens": 13611622.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.163203910877127, | |
| "grad_norm": 2.2113983631134033, | |
| "learning_rate": 0.0001418359668924003, | |
| "loss": 0.2813, | |
| "mean_token_accuracy": 0.9884150496550969, | |
| "num_tokens": 13673519.0, | |
| "step": 1547 | |
| }, | |
| { | |
| "epoch": 1.168468553163486, | |
| "grad_norm": 1.3646266460418701, | |
| "learning_rate": 0.00014157261098570354, | |
| "loss": 0.2837, | |
| "mean_token_accuracy": 0.9899180467639651, | |
| "num_tokens": 13740594.0, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 1.173733195449845, | |
| "grad_norm": 1.47178316116333, | |
| "learning_rate": 0.0001413092550790068, | |
| "loss": 0.4969, | |
| "mean_token_accuracy": 0.9861957728862762, | |
| "num_tokens": 13800981.0, | |
| "step": 1561 | |
| }, | |
| { | |
| "epoch": 1.1789978377362038, | |
| "grad_norm": 4.644566535949707, | |
| "learning_rate": 0.00014104589917231, | |
| "loss": 0.1929, | |
| "mean_token_accuracy": 0.9940712121980531, | |
| "num_tokens": 13859133.0, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 1.1842624800225627, | |
| "grad_norm": 2.941558361053467, | |
| "learning_rate": 0.00014078254326561324, | |
| "loss": 0.2933, | |
| "mean_token_accuracy": 0.9869024146880422, | |
| "num_tokens": 13925916.0, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 1.1895271223089217, | |
| "grad_norm": 3.543349504470825, | |
| "learning_rate": 0.0001405191873589165, | |
| "loss": 0.151, | |
| "mean_token_accuracy": 0.9912234033857074, | |
| "num_tokens": 13989378.0, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 1.1947917645952806, | |
| "grad_norm": 0.26421740651130676, | |
| "learning_rate": 0.00014025583145221973, | |
| "loss": 0.1316, | |
| "mean_token_accuracy": 0.9948174442563739, | |
| "num_tokens": 14048690.0, | |
| "step": 1589 | |
| }, | |
| { | |
| "epoch": 1.2000564068816395, | |
| "grad_norm": 3.704618215560913, | |
| "learning_rate": 0.00013999247554552296, | |
| "loss": 0.2967, | |
| "mean_token_accuracy": 0.9912898987531662, | |
| "num_tokens": 14105350.0, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 1.2053210491679984, | |
| "grad_norm": 9.820611953735352, | |
| "learning_rate": 0.0001397291196388262, | |
| "loss": 0.4279, | |
| "mean_token_accuracy": 0.9896314931767327, | |
| "num_tokens": 14166475.0, | |
| "step": 1603 | |
| }, | |
| { | |
| "epoch": 1.2105856914543573, | |
| "grad_norm": 4.219690322875977, | |
| "learning_rate": 0.00013946576373212943, | |
| "loss": 0.2052, | |
| "mean_token_accuracy": 0.9916545652917453, | |
| "num_tokens": 14230233.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.2158503337407163, | |
| "grad_norm": 8.233049392700195, | |
| "learning_rate": 0.00013920240782543266, | |
| "loss": 0.2305, | |
| "mean_token_accuracy": 0.9910402031881469, | |
| "num_tokens": 14293031.0, | |
| "step": 1617 | |
| }, | |
| { | |
| "epoch": 1.2211149760270752, | |
| "grad_norm": 3.9408421516418457, | |
| "learning_rate": 0.00013893905191873592, | |
| "loss": 0.1533, | |
| "mean_token_accuracy": 0.9919589406677655, | |
| "num_tokens": 14356077.0, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 1.2263796183134343, | |
| "grad_norm": 4.8626861572265625, | |
| "learning_rate": 0.00013867569601203912, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.9872687991176333, | |
| "num_tokens": 14412514.0, | |
| "step": 1631 | |
| }, | |
| { | |
| "epoch": 1.2316442605997933, | |
| "grad_norm": 0.6099812388420105, | |
| "learning_rate": 0.00013841234010534236, | |
| "loss": 0.157, | |
| "mean_token_accuracy": 0.9954733816640717, | |
| "num_tokens": 14478745.0, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 1.2369089028861522, | |
| "grad_norm": 5.712614059448242, | |
| "learning_rate": 0.00013814898419864562, | |
| "loss": 0.1737, | |
| "mean_token_accuracy": 0.991803133061954, | |
| "num_tokens": 14537007.0, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 1.242173545172511, | |
| "grad_norm": 2.5407156944274902, | |
| "learning_rate": 0.00013788562829194885, | |
| "loss": 0.2482, | |
| "mean_token_accuracy": 0.9922906564814704, | |
| "num_tokens": 14597887.0, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 1.24743818745887, | |
| "grad_norm": 16.862178802490234, | |
| "learning_rate": 0.00013762227238525208, | |
| "loss": 0.2099, | |
| "mean_token_accuracy": 0.9912843480706215, | |
| "num_tokens": 14660075.0, | |
| "step": 1659 | |
| }, | |
| { | |
| "epoch": 1.252702829745229, | |
| "grad_norm": 4.577484130859375, | |
| "learning_rate": 0.0001373589164785553, | |
| "loss": 0.2114, | |
| "mean_token_accuracy": 0.9898757732340268, | |
| "num_tokens": 14720427.0, | |
| "step": 1666 | |
| }, | |
| { | |
| "epoch": 1.2579674720315879, | |
| "grad_norm": 5.4580078125, | |
| "learning_rate": 0.00013709556057185855, | |
| "loss": 0.1987, | |
| "mean_token_accuracy": 0.99252992442676, | |
| "num_tokens": 14780598.0, | |
| "step": 1673 | |
| }, | |
| { | |
| "epoch": 1.2632321143179468, | |
| "grad_norm": 2.710749864578247, | |
| "learning_rate": 0.00013683220466516178, | |
| "loss": 0.2673, | |
| "mean_token_accuracy": 0.9886703054819789, | |
| "num_tokens": 14842301.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.2684967566043057, | |
| "grad_norm": 0.7136022448539734, | |
| "learning_rate": 0.00013656884875846504, | |
| "loss": 0.1207, | |
| "mean_token_accuracy": 0.9936908494148936, | |
| "num_tokens": 14906315.0, | |
| "step": 1687 | |
| }, | |
| { | |
| "epoch": 1.2737613988906646, | |
| "grad_norm": 10.184287071228027, | |
| "learning_rate": 0.00013630549285176824, | |
| "loss": 0.302, | |
| "mean_token_accuracy": 0.9861791431903839, | |
| "num_tokens": 14964107.0, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 1.2790260411770236, | |
| "grad_norm": 2.538411855697632, | |
| "learning_rate": 0.00013604213694507148, | |
| "loss": 0.1692, | |
| "mean_token_accuracy": 0.9949883254511016, | |
| "num_tokens": 15025560.0, | |
| "step": 1701 | |
| }, | |
| { | |
| "epoch": 1.2842906834633825, | |
| "grad_norm": 1.0461792945861816, | |
| "learning_rate": 0.0001357787810383747, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9950532178793635, | |
| "num_tokens": 15090883.0, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 1.2895553257497414, | |
| "grad_norm": 0.796981930732727, | |
| "learning_rate": 0.00013551542513167797, | |
| "loss": 0.2213, | |
| "mean_token_accuracy": 0.9920940707836833, | |
| "num_tokens": 15150881.0, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 1.2948199680361003, | |
| "grad_norm": 3.015265703201294, | |
| "learning_rate": 0.0001352520692249812, | |
| "loss": 0.321, | |
| "mean_token_accuracy": 0.9910285813467843, | |
| "num_tokens": 15209552.0, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 1.3000846103224593, | |
| "grad_norm": 1.5789178609848022, | |
| "learning_rate": 0.00013498871331828443, | |
| "loss": 0.1814, | |
| "mean_token_accuracy": 0.991893857717514, | |
| "num_tokens": 15269074.0, | |
| "step": 1729 | |
| }, | |
| { | |
| "epoch": 1.3053492526088184, | |
| "grad_norm": 6.411360740661621, | |
| "learning_rate": 0.00013472535741158767, | |
| "loss": 0.2012, | |
| "mean_token_accuracy": 0.9942227602005005, | |
| "num_tokens": 15334515.0, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 1.310613894895177, | |
| "grad_norm": 15.531742095947266, | |
| "learning_rate": 0.0001344620015048909, | |
| "loss": 0.3704, | |
| "mean_token_accuracy": 0.9869026861020497, | |
| "num_tokens": 15399359.0, | |
| "step": 1743 | |
| }, | |
| { | |
| "epoch": 1.3158785371815362, | |
| "grad_norm": 6.059346675872803, | |
| "learning_rate": 0.00013419864559819413, | |
| "loss": 0.3227, | |
| "mean_token_accuracy": 0.9877499725137439, | |
| "num_tokens": 15460292.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.321143179467895, | |
| "grad_norm": 2.9363033771514893, | |
| "learning_rate": 0.0001339352896914974, | |
| "loss": 0.2628, | |
| "mean_token_accuracy": 0.9876537759389196, | |
| "num_tokens": 15520040.0, | |
| "step": 1757 | |
| }, | |
| { | |
| "epoch": 1.326407821754254, | |
| "grad_norm": 7.257687091827393, | |
| "learning_rate": 0.0001336719337848006, | |
| "loss": 0.2037, | |
| "mean_token_accuracy": 0.9923295655420848, | |
| "num_tokens": 15580776.0, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 1.331672464040613, | |
| "grad_norm": 16.092811584472656, | |
| "learning_rate": 0.00013340857787810383, | |
| "loss": 0.2106, | |
| "mean_token_accuracy": 0.9921471721359661, | |
| "num_tokens": 15643162.0, | |
| "step": 1771 | |
| }, | |
| { | |
| "epoch": 1.336937106326972, | |
| "grad_norm": 18.12074851989746, | |
| "learning_rate": 0.0001331452219714071, | |
| "loss": 0.2578, | |
| "mean_token_accuracy": 0.9924836467419352, | |
| "num_tokens": 15704193.0, | |
| "step": 1778 | |
| }, | |
| { | |
| "epoch": 1.3422017486133309, | |
| "grad_norm": 5.772324562072754, | |
| "learning_rate": 0.00013288186606471032, | |
| "loss": 0.199, | |
| "mean_token_accuracy": 0.9923907539674214, | |
| "num_tokens": 15770711.0, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 1.3474663908996898, | |
| "grad_norm": 0.9476026296615601, | |
| "learning_rate": 0.00013261851015801355, | |
| "loss": 0.2339, | |
| "mean_token_accuracy": 0.9915089160203934, | |
| "num_tokens": 15833257.0, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 1.3527310331860487, | |
| "grad_norm": 2.190275192260742, | |
| "learning_rate": 0.0001323551542513168, | |
| "loss": 0.3767, | |
| "mean_token_accuracy": 0.9895979432123048, | |
| "num_tokens": 15894005.0, | |
| "step": 1799 | |
| }, | |
| { | |
| "epoch": 1.3579956754724076, | |
| "grad_norm": 4.70867919921875, | |
| "learning_rate": 0.00013209179834462002, | |
| "loss": 0.1757, | |
| "mean_token_accuracy": 0.9928706181900842, | |
| "num_tokens": 15958463.0, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 1.3632603177587665, | |
| "grad_norm": 0.7507139444351196, | |
| "learning_rate": 0.00013182844243792325, | |
| "loss": 0.2775, | |
| "mean_token_accuracy": 0.9922192469239235, | |
| "num_tokens": 16017151.0, | |
| "step": 1813 | |
| }, | |
| { | |
| "epoch": 1.3685249600451255, | |
| "grad_norm": 3.018205165863037, | |
| "learning_rate": 0.0001315650865312265, | |
| "loss": 0.2266, | |
| "mean_token_accuracy": 0.9921427177531379, | |
| "num_tokens": 16079391.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.3737896023314844, | |
| "grad_norm": 1.5593658685684204, | |
| "learning_rate": 0.00013130173062452972, | |
| "loss": 0.2111, | |
| "mean_token_accuracy": 0.9927768015435764, | |
| "num_tokens": 16139695.0, | |
| "step": 1827 | |
| }, | |
| { | |
| "epoch": 1.3790542446178433, | |
| "grad_norm": 4.4447832107543945, | |
| "learning_rate": 0.00013103837471783295, | |
| "loss": 0.2209, | |
| "mean_token_accuracy": 0.9891561652932849, | |
| "num_tokens": 16200383.0, | |
| "step": 1834 | |
| }, | |
| { | |
| "epoch": 1.3843188869042022, | |
| "grad_norm": 10.619449615478516, | |
| "learning_rate": 0.0001307750188111362, | |
| "loss": 0.3289, | |
| "mean_token_accuracy": 0.9891869202256203, | |
| "num_tokens": 16263088.0, | |
| "step": 1841 | |
| }, | |
| { | |
| "epoch": 1.3895835291905612, | |
| "grad_norm": 6.483500957489014, | |
| "learning_rate": 0.00013051166290443944, | |
| "loss": 0.1773, | |
| "mean_token_accuracy": 0.9928955733776093, | |
| "num_tokens": 16325903.0, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 1.3948481714769203, | |
| "grad_norm": 4.723738670349121, | |
| "learning_rate": 0.00013024830699774267, | |
| "loss": 0.163, | |
| "mean_token_accuracy": 0.9920609401805061, | |
| "num_tokens": 16385398.0, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 1.400112813763279, | |
| "grad_norm": 3.299731492996216, | |
| "learning_rate": 0.0001299849510910459, | |
| "loss": 0.1281, | |
| "mean_token_accuracy": 0.9930625600474221, | |
| "num_tokens": 16446499.0, | |
| "step": 1862 | |
| }, | |
| { | |
| "epoch": 1.4053774560496382, | |
| "grad_norm": 3.176826000213623, | |
| "learning_rate": 0.00012972159518434914, | |
| "loss": 0.2056, | |
| "mean_token_accuracy": 0.9950633091585976, | |
| "num_tokens": 16508603.0, | |
| "step": 1869 | |
| }, | |
| { | |
| "epoch": 1.410642098335997, | |
| "grad_norm": 1.713958740234375, | |
| "learning_rate": 0.00012945823927765237, | |
| "loss": 0.2887, | |
| "mean_token_accuracy": 0.9888357828770366, | |
| "num_tokens": 16566765.0, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 1.415906740622356, | |
| "grad_norm": 60.53436279296875, | |
| "learning_rate": 0.00012919488337095563, | |
| "loss": 0.2608, | |
| "mean_token_accuracy": 0.9921398173485484, | |
| "num_tokens": 16627495.0, | |
| "step": 1883 | |
| }, | |
| { | |
| "epoch": 1.421171382908715, | |
| "grad_norm": 3.4057421684265137, | |
| "learning_rate": 0.00012893152746425884, | |
| "loss": 0.2358, | |
| "mean_token_accuracy": 0.9934137729661805, | |
| "num_tokens": 16685505.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.4264360251950738, | |
| "grad_norm": 8.682483673095703, | |
| "learning_rate": 0.00012866817155756207, | |
| "loss": 0.3047, | |
| "mean_token_accuracy": 0.9872814172080585, | |
| "num_tokens": 16746158.0, | |
| "step": 1897 | |
| }, | |
| { | |
| "epoch": 1.4317006674814328, | |
| "grad_norm": 6.995699882507324, | |
| "learning_rate": 0.00012840481565086533, | |
| "loss": 0.3074, | |
| "mean_token_accuracy": 0.9887890996677535, | |
| "num_tokens": 16805118.0, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 1.4369653097677917, | |
| "grad_norm": 3.943237066268921, | |
| "learning_rate": 0.00012814145974416856, | |
| "loss": 0.2609, | |
| "mean_token_accuracy": 0.9892492177230972, | |
| "num_tokens": 16865629.0, | |
| "step": 1911 | |
| }, | |
| { | |
| "epoch": 1.4422299520541506, | |
| "grad_norm": 0.8239724636077881, | |
| "learning_rate": 0.0001278781038374718, | |
| "loss": 0.1276, | |
| "mean_token_accuracy": 0.9948458277753421, | |
| "num_tokens": 16929681.0, | |
| "step": 1918 | |
| }, | |
| { | |
| "epoch": 1.4474945943405095, | |
| "grad_norm": 10.795239448547363, | |
| "learning_rate": 0.00012761474793077503, | |
| "loss": 0.2547, | |
| "mean_token_accuracy": 0.9911726553525243, | |
| "num_tokens": 16990985.0, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 1.4527592366268685, | |
| "grad_norm": 22.453044891357422, | |
| "learning_rate": 0.00012735139202407826, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.9873178665127073, | |
| "num_tokens": 17053547.0, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 1.4580238789132274, | |
| "grad_norm": 1.572399377822876, | |
| "learning_rate": 0.0001270880361173815, | |
| "loss": 0.2506, | |
| "mean_token_accuracy": 0.9925586645092283, | |
| "num_tokens": 17120873.0, | |
| "step": 1939 | |
| }, | |
| { | |
| "epoch": 1.4632885211995863, | |
| "grad_norm": 1.8852821588516235, | |
| "learning_rate": 0.00012682468021068473, | |
| "loss": 0.1817, | |
| "mean_token_accuracy": 0.9927859018955912, | |
| "num_tokens": 17180785.0, | |
| "step": 1946 | |
| }, | |
| { | |
| "epoch": 1.4685531634859452, | |
| "grad_norm": 1.3453786373138428, | |
| "learning_rate": 0.00012656132430398798, | |
| "loss": 0.2057, | |
| "mean_token_accuracy": 0.9911574646830559, | |
| "num_tokens": 17239360.0, | |
| "step": 1953 | |
| }, | |
| { | |
| "epoch": 1.4738178057723041, | |
| "grad_norm": 11.025494575500488, | |
| "learning_rate": 0.0001262979683972912, | |
| "loss": 0.1727, | |
| "mean_token_accuracy": 0.9909827506967953, | |
| "num_tokens": 17300223.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.479082448058663, | |
| "grad_norm": 1.8808586597442627, | |
| "learning_rate": 0.00012603461249059442, | |
| "loss": 0.2561, | |
| "mean_token_accuracy": 0.9896975191576141, | |
| "num_tokens": 17365607.0, | |
| "step": 1967 | |
| }, | |
| { | |
| "epoch": 1.4843470903450222, | |
| "grad_norm": 1.883882761001587, | |
| "learning_rate": 0.00012577125658389768, | |
| "loss": 0.2375, | |
| "mean_token_accuracy": 0.9916414086307798, | |
| "num_tokens": 17428427.0, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 1.489611732631381, | |
| "grad_norm": 28.188386917114258, | |
| "learning_rate": 0.00012550790067720092, | |
| "loss": 0.2718, | |
| "mean_token_accuracy": 0.989918270281383, | |
| "num_tokens": 17492942.0, | |
| "step": 1981 | |
| }, | |
| { | |
| "epoch": 1.49487637491774, | |
| "grad_norm": 11.462503433227539, | |
| "learning_rate": 0.00012524454477050415, | |
| "loss": 0.1901, | |
| "mean_token_accuracy": 0.9914419033697673, | |
| "num_tokens": 17559266.0, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 1.5001410172040988, | |
| "grad_norm": 0.8864675164222717, | |
| "learning_rate": 0.00012498118886380738, | |
| "loss": 0.1183, | |
| "mean_token_accuracy": 0.9930080260549273, | |
| "num_tokens": 17626562.0, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 1.505405659490458, | |
| "grad_norm": 1.2817803621292114, | |
| "learning_rate": 0.0001247178329571106, | |
| "loss": 0.219, | |
| "mean_token_accuracy": 0.9926515775067466, | |
| "num_tokens": 17688339.0, | |
| "step": 2002 | |
| }, | |
| { | |
| "epoch": 1.5106703017768168, | |
| "grad_norm": 3.6939096450805664, | |
| "learning_rate": 0.00012445447705041385, | |
| "loss": 0.2883, | |
| "mean_token_accuracy": 0.9864205483879361, | |
| "num_tokens": 17750888.0, | |
| "step": 2009 | |
| }, | |
| { | |
| "epoch": 1.5159349440631757, | |
| "grad_norm": 8.68113899230957, | |
| "learning_rate": 0.0001241911211437171, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9931251076715333, | |
| "num_tokens": 17811077.0, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 1.5211995863495347, | |
| "grad_norm": 1.8664782047271729, | |
| "learning_rate": 0.0001239277652370203, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9920256978699139, | |
| "num_tokens": 17873882.0, | |
| "step": 2023 | |
| }, | |
| { | |
| "epoch": 1.5264642286358936, | |
| "grad_norm": 9.23595905303955, | |
| "learning_rate": 0.00012366440933032354, | |
| "loss": 0.2279, | |
| "mean_token_accuracy": 0.9934034773281643, | |
| "num_tokens": 17934458.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.5317288709222525, | |
| "grad_norm": 0.5371968746185303, | |
| "learning_rate": 0.0001234010534236268, | |
| "loss": 0.3383, | |
| "mean_token_accuracy": 0.992760239967278, | |
| "num_tokens": 18000388.0, | |
| "step": 2037 | |
| }, | |
| { | |
| "epoch": 1.5369935132086114, | |
| "grad_norm": 6.717473983764648, | |
| "learning_rate": 0.00012313769751693004, | |
| "loss": 0.2705, | |
| "mean_token_accuracy": 0.9929484373756817, | |
| "num_tokens": 18062251.0, | |
| "step": 2044 | |
| }, | |
| { | |
| "epoch": 1.5422581554949704, | |
| "grad_norm": 1.2476862668991089, | |
| "learning_rate": 0.00012287434161023327, | |
| "loss": 0.2027, | |
| "mean_token_accuracy": 0.9934172172631536, | |
| "num_tokens": 18132332.0, | |
| "step": 2051 | |
| }, | |
| { | |
| "epoch": 1.5475227977813293, | |
| "grad_norm": 0.7605043053627014, | |
| "learning_rate": 0.0001226109857035365, | |
| "loss": 0.5097, | |
| "mean_token_accuracy": 0.981914051941463, | |
| "num_tokens": 18193186.0, | |
| "step": 2058 | |
| }, | |
| { | |
| "epoch": 1.5527874400676882, | |
| "grad_norm": 0.6618887186050415, | |
| "learning_rate": 0.00012234762979683973, | |
| "loss": 0.2604, | |
| "mean_token_accuracy": 0.988840958901814, | |
| "num_tokens": 18253081.0, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 1.5580520823540471, | |
| "grad_norm": 1.6868842840194702, | |
| "learning_rate": 0.00012208427389014297, | |
| "loss": 0.1247, | |
| "mean_token_accuracy": 0.9945448264479637, | |
| "num_tokens": 18320322.0, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 1.5633167246404063, | |
| "grad_norm": 1.528398871421814, | |
| "learning_rate": 0.00012182091798344621, | |
| "loss": 0.1558, | |
| "mean_token_accuracy": 0.9914160083447184, | |
| "num_tokens": 18381386.0, | |
| "step": 2079 | |
| }, | |
| { | |
| "epoch": 1.568581366926765, | |
| "grad_norm": 0.9244933128356934, | |
| "learning_rate": 0.00012155756207674944, | |
| "loss": 0.289, | |
| "mean_token_accuracy": 0.9896547826273101, | |
| "num_tokens": 18443269.0, | |
| "step": 2086 | |
| }, | |
| { | |
| "epoch": 1.5738460092131241, | |
| "grad_norm": 1.664876103401184, | |
| "learning_rate": 0.00012129420617005268, | |
| "loss": 0.2955, | |
| "mean_token_accuracy": 0.9906592624528068, | |
| "num_tokens": 18508043.0, | |
| "step": 2093 | |
| }, | |
| { | |
| "epoch": 1.5791106514994828, | |
| "grad_norm": 0.8348265886306763, | |
| "learning_rate": 0.00012103085026335592, | |
| "loss": 0.1227, | |
| "mean_token_accuracy": 0.9941448430929866, | |
| "num_tokens": 18567771.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.584375293785842, | |
| "grad_norm": 12.391520500183105, | |
| "learning_rate": 0.00012076749435665916, | |
| "loss": 0.2727, | |
| "mean_token_accuracy": 0.9927672892808914, | |
| "num_tokens": 18629149.0, | |
| "step": 2107 | |
| }, | |
| { | |
| "epoch": 1.5896399360722007, | |
| "grad_norm": 1.7599389553070068, | |
| "learning_rate": 0.00012050413844996237, | |
| "loss": 0.1956, | |
| "mean_token_accuracy": 0.9940993743283408, | |
| "num_tokens": 18689785.0, | |
| "step": 2114 | |
| }, | |
| { | |
| "epoch": 1.5949045783585598, | |
| "grad_norm": 3.6339666843414307, | |
| "learning_rate": 0.00012024078254326563, | |
| "loss": 0.1044, | |
| "mean_token_accuracy": 0.9948881470731327, | |
| "num_tokens": 18748306.0, | |
| "step": 2121 | |
| }, | |
| { | |
| "epoch": 1.6001692206449187, | |
| "grad_norm": 4.629573345184326, | |
| "learning_rate": 0.00011997742663656885, | |
| "loss": 0.1948, | |
| "mean_token_accuracy": 0.9910960016506059, | |
| "num_tokens": 18810181.0, | |
| "step": 2128 | |
| }, | |
| { | |
| "epoch": 1.6054338629312777, | |
| "grad_norm": 3.4737141132354736, | |
| "learning_rate": 0.00011971407072987209, | |
| "loss": 0.2604, | |
| "mean_token_accuracy": 0.9872141799756459, | |
| "num_tokens": 18867349.0, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 1.6106985052176366, | |
| "grad_norm": 1.6179413795471191, | |
| "learning_rate": 0.00011945071482317533, | |
| "loss": 0.2287, | |
| "mean_token_accuracy": 0.9942372579659734, | |
| "num_tokens": 18927420.0, | |
| "step": 2142 | |
| }, | |
| { | |
| "epoch": 1.6159631475039955, | |
| "grad_norm": 2.1788082122802734, | |
| "learning_rate": 0.00011918735891647856, | |
| "loss": 0.1952, | |
| "mean_token_accuracy": 0.9907448110835892, | |
| "num_tokens": 18990709.0, | |
| "step": 2149 | |
| }, | |
| { | |
| "epoch": 1.6212277897903544, | |
| "grad_norm": 1.523470401763916, | |
| "learning_rate": 0.0001189240030097818, | |
| "loss": 0.3125, | |
| "mean_token_accuracy": 0.989886862891061, | |
| "num_tokens": 19049861.0, | |
| "step": 2156 | |
| }, | |
| { | |
| "epoch": 1.6264924320767133, | |
| "grad_norm": 2.8504085540771484, | |
| "learning_rate": 0.00011866064710308504, | |
| "loss": 0.2237, | |
| "mean_token_accuracy": 0.9880131227629525, | |
| "num_tokens": 19118045.0, | |
| "step": 2163 | |
| }, | |
| { | |
| "epoch": 1.6317570743630723, | |
| "grad_norm": 0.6833459734916687, | |
| "learning_rate": 0.00011839729119638828, | |
| "loss": 0.2131, | |
| "mean_token_accuracy": 0.9924259898918015, | |
| "num_tokens": 19177671.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.6370217166494312, | |
| "grad_norm": 1.0239827632904053, | |
| "learning_rate": 0.0001181339352896915, | |
| "loss": 0.2055, | |
| "mean_token_accuracy": 0.9910915142723492, | |
| "num_tokens": 19237591.0, | |
| "step": 2177 | |
| }, | |
| { | |
| "epoch": 1.6422863589357901, | |
| "grad_norm": 1.6337671279907227, | |
| "learning_rate": 0.00011787057938299473, | |
| "loss": 0.2029, | |
| "mean_token_accuracy": 0.9929652927177293, | |
| "num_tokens": 19303521.0, | |
| "step": 2184 | |
| }, | |
| { | |
| "epoch": 1.647551001222149, | |
| "grad_norm": 5.557865142822266, | |
| "learning_rate": 0.00011760722347629797, | |
| "loss": 0.2295, | |
| "mean_token_accuracy": 0.9890761173197201, | |
| "num_tokens": 19365368.0, | |
| "step": 2191 | |
| }, | |
| { | |
| "epoch": 1.6528156435085082, | |
| "grad_norm": 14.096709251403809, | |
| "learning_rate": 0.0001173438675696012, | |
| "loss": 0.312, | |
| "mean_token_accuracy": 0.9904469975403377, | |
| "num_tokens": 19428075.0, | |
| "step": 2198 | |
| }, | |
| { | |
| "epoch": 1.6580802857948669, | |
| "grad_norm": 2.606497049331665, | |
| "learning_rate": 0.00011708051166290444, | |
| "loss": 0.1295, | |
| "mean_token_accuracy": 0.9957098939589092, | |
| "num_tokens": 19492747.0, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 1.663344928081226, | |
| "grad_norm": 0.12460053712129593, | |
| "learning_rate": 0.00011681715575620769, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9944380170532635, | |
| "num_tokens": 19561523.0, | |
| "step": 2212 | |
| }, | |
| { | |
| "epoch": 1.6686095703675847, | |
| "grad_norm": 4.21990966796875, | |
| "learning_rate": 0.00011655379984951092, | |
| "loss": 0.3116, | |
| "mean_token_accuracy": 0.991795012993472, | |
| "num_tokens": 19626048.0, | |
| "step": 2219 | |
| }, | |
| { | |
| "epoch": 1.6738742126539439, | |
| "grad_norm": 4.622595310211182, | |
| "learning_rate": 0.00011629044394281414, | |
| "loss": 0.2398, | |
| "mean_token_accuracy": 0.9911801485078675, | |
| "num_tokens": 19687886.0, | |
| "step": 2226 | |
| }, | |
| { | |
| "epoch": 1.6791388549403026, | |
| "grad_norm": 0.5433699488639832, | |
| "learning_rate": 0.0001160270880361174, | |
| "loss": 0.2419, | |
| "mean_token_accuracy": 0.9909999359931264, | |
| "num_tokens": 19743942.0, | |
| "step": 2233 | |
| }, | |
| { | |
| "epoch": 1.6844034972266617, | |
| "grad_norm": 6.0370306968688965, | |
| "learning_rate": 0.00011576373212942062, | |
| "loss": 0.2358, | |
| "mean_token_accuracy": 0.9919255109769958, | |
| "num_tokens": 19809298.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.6896681395130206, | |
| "grad_norm": 5.089265823364258, | |
| "learning_rate": 0.00011550037622272385, | |
| "loss": 0.2458, | |
| "mean_token_accuracy": 0.9876088744827679, | |
| "num_tokens": 19871461.0, | |
| "step": 2247 | |
| }, | |
| { | |
| "epoch": 1.6949327817993796, | |
| "grad_norm": 3.583998680114746, | |
| "learning_rate": 0.0001152370203160271, | |
| "loss": 0.3835, | |
| "mean_token_accuracy": 0.9862523589815412, | |
| "num_tokens": 19935070.0, | |
| "step": 2254 | |
| }, | |
| { | |
| "epoch": 1.7001974240857385, | |
| "grad_norm": 3.997304916381836, | |
| "learning_rate": 0.00011497366440933033, | |
| "loss": 0.2579, | |
| "mean_token_accuracy": 0.9909880150641713, | |
| "num_tokens": 20001908.0, | |
| "step": 2261 | |
| }, | |
| { | |
| "epoch": 1.7054620663720974, | |
| "grad_norm": 0.4542715549468994, | |
| "learning_rate": 0.00011471030850263356, | |
| "loss": 0.1683, | |
| "mean_token_accuracy": 0.9938086505447116, | |
| "num_tokens": 20061957.0, | |
| "step": 2268 | |
| }, | |
| { | |
| "epoch": 1.7107267086584563, | |
| "grad_norm": 2.5569732189178467, | |
| "learning_rate": 0.0001144469525959368, | |
| "loss": 0.2912, | |
| "mean_token_accuracy": 0.9882578796574047, | |
| "num_tokens": 20120969.0, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 1.7159913509448153, | |
| "grad_norm": 1.954804539680481, | |
| "learning_rate": 0.00011418359668924004, | |
| "loss": 0.1393, | |
| "mean_token_accuracy": 0.993672323014055, | |
| "num_tokens": 20182041.0, | |
| "step": 2282 | |
| }, | |
| { | |
| "epoch": 1.7212559932311742, | |
| "grad_norm": 5.033205986022949, | |
| "learning_rate": 0.00011392024078254326, | |
| "loss": 0.2673, | |
| "mean_token_accuracy": 0.9904912710189819, | |
| "num_tokens": 20241585.0, | |
| "step": 2289 | |
| }, | |
| { | |
| "epoch": 1.726520635517533, | |
| "grad_norm": 4.316547870635986, | |
| "learning_rate": 0.00011365688487584652, | |
| "loss": 0.1753, | |
| "mean_token_accuracy": 0.995313975427832, | |
| "num_tokens": 20300988.0, | |
| "step": 2296 | |
| }, | |
| { | |
| "epoch": 1.731785277803892, | |
| "grad_norm": 0.4150407612323761, | |
| "learning_rate": 0.00011339352896914975, | |
| "loss": 0.0651, | |
| "mean_token_accuracy": 0.997933919940676, | |
| "num_tokens": 20363951.0, | |
| "step": 2303 | |
| }, | |
| { | |
| "epoch": 1.737049920090251, | |
| "grad_norm": 3.377779006958008, | |
| "learning_rate": 0.00011313017306245297, | |
| "loss": 0.2292, | |
| "mean_token_accuracy": 0.9914287626743317, | |
| "num_tokens": 20424684.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.74231456237661, | |
| "grad_norm": 8.437454223632812, | |
| "learning_rate": 0.00011286681715575623, | |
| "loss": 0.347, | |
| "mean_token_accuracy": 0.9894164843218667, | |
| "num_tokens": 20484766.0, | |
| "step": 2317 | |
| }, | |
| { | |
| "epoch": 1.7475792046629688, | |
| "grad_norm": 0.48844149708747864, | |
| "learning_rate": 0.00011260346124905945, | |
| "loss": 0.3996, | |
| "mean_token_accuracy": 0.9900390367422786, | |
| "num_tokens": 20545965.0, | |
| "step": 2324 | |
| }, | |
| { | |
| "epoch": 1.752843846949328, | |
| "grad_norm": 2.870230197906494, | |
| "learning_rate": 0.00011234010534236268, | |
| "loss": 0.1182, | |
| "mean_token_accuracy": 0.9935800592814173, | |
| "num_tokens": 20611006.0, | |
| "step": 2331 | |
| }, | |
| { | |
| "epoch": 1.7581084892356866, | |
| "grad_norm": 6.676984786987305, | |
| "learning_rate": 0.00011207674943566593, | |
| "loss": 0.2172, | |
| "mean_token_accuracy": 0.9920763533030238, | |
| "num_tokens": 20669278.0, | |
| "step": 2338 | |
| }, | |
| { | |
| "epoch": 1.7633731315220458, | |
| "grad_norm": 2.5986998081207275, | |
| "learning_rate": 0.00011181339352896916, | |
| "loss": 0.1656, | |
| "mean_token_accuracy": 0.9913375792758805, | |
| "num_tokens": 20732752.0, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 1.7686377738084045, | |
| "grad_norm": 8.234833717346191, | |
| "learning_rate": 0.00011155003762227239, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9890632097210202, | |
| "num_tokens": 20800800.0, | |
| "step": 2352 | |
| }, | |
| { | |
| "epoch": 1.7739024160947636, | |
| "grad_norm": 3.0257232189178467, | |
| "learning_rate": 0.00011128668171557564, | |
| "loss": 0.2297, | |
| "mean_token_accuracy": 0.9877320943134171, | |
| "num_tokens": 20862976.0, | |
| "step": 2359 | |
| }, | |
| { | |
| "epoch": 1.7791670583811225, | |
| "grad_norm": 0.9968982338905334, | |
| "learning_rate": 0.00011102332580887887, | |
| "loss": 0.2005, | |
| "mean_token_accuracy": 0.99253696841853, | |
| "num_tokens": 20922277.0, | |
| "step": 2366 | |
| }, | |
| { | |
| "epoch": 1.7844317006674815, | |
| "grad_norm": 0.2723258137702942, | |
| "learning_rate": 0.00011075996990218209, | |
| "loss": 0.1197, | |
| "mean_token_accuracy": 0.9950262244258609, | |
| "num_tokens": 20984667.0, | |
| "step": 2373 | |
| }, | |
| { | |
| "epoch": 1.7896963429538404, | |
| "grad_norm": 1.3707334995269775, | |
| "learning_rate": 0.00011049661399548535, | |
| "loss": 0.2367, | |
| "mean_token_accuracy": 0.9931219986506871, | |
| "num_tokens": 21048933.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.7949609852401993, | |
| "grad_norm": 1.147256851196289, | |
| "learning_rate": 0.00011023325808878857, | |
| "loss": 0.1998, | |
| "mean_token_accuracy": 0.9940363253865924, | |
| "num_tokens": 21113869.0, | |
| "step": 2387 | |
| }, | |
| { | |
| "epoch": 1.8002256275265582, | |
| "grad_norm": 4.579329490661621, | |
| "learning_rate": 0.0001099699021820918, | |
| "loss": 0.2075, | |
| "mean_token_accuracy": 0.9920091213924545, | |
| "num_tokens": 21175277.0, | |
| "step": 2394 | |
| }, | |
| { | |
| "epoch": 1.8054902698129172, | |
| "grad_norm": 4.437446117401123, | |
| "learning_rate": 0.00010970654627539505, | |
| "loss": 0.1684, | |
| "mean_token_accuracy": 0.9928834150944438, | |
| "num_tokens": 21236427.0, | |
| "step": 2401 | |
| }, | |
| { | |
| "epoch": 1.810754912099276, | |
| "grad_norm": 1.5089209079742432, | |
| "learning_rate": 0.00010944319036869828, | |
| "loss": 0.3311, | |
| "mean_token_accuracy": 0.9877726499523435, | |
| "num_tokens": 21299560.0, | |
| "step": 2408 | |
| }, | |
| { | |
| "epoch": 1.816019554385635, | |
| "grad_norm": 6.240835666656494, | |
| "learning_rate": 0.00010917983446200151, | |
| "loss": 0.2478, | |
| "mean_token_accuracy": 0.9924420320561954, | |
| "num_tokens": 21360062.0, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 1.8212841966719941, | |
| "grad_norm": 0.8350119590759277, | |
| "learning_rate": 0.00010891647855530473, | |
| "loss": 0.1657, | |
| "mean_token_accuracy": 0.9924463789377894, | |
| "num_tokens": 21423904.0, | |
| "step": 2422 | |
| }, | |
| { | |
| "epoch": 1.8265488389583528, | |
| "grad_norm": 0.32087522745132446, | |
| "learning_rate": 0.00010865312264860799, | |
| "loss": 0.1635, | |
| "mean_token_accuracy": 0.9925666440810476, | |
| "num_tokens": 21483842.0, | |
| "step": 2429 | |
| }, | |
| { | |
| "epoch": 1.831813481244712, | |
| "grad_norm": 4.448061466217041, | |
| "learning_rate": 0.00010838976674191121, | |
| "loss": 0.1931, | |
| "mean_token_accuracy": 0.9920029448611396, | |
| "num_tokens": 21546669.0, | |
| "step": 2436 | |
| }, | |
| { | |
| "epoch": 1.8370781235310707, | |
| "grad_norm": 7.151405334472656, | |
| "learning_rate": 0.00010812641083521444, | |
| "loss": 0.1717, | |
| "mean_token_accuracy": 0.9925154458199229, | |
| "num_tokens": 21608479.0, | |
| "step": 2443 | |
| }, | |
| { | |
| "epoch": 1.8423427658174298, | |
| "grad_norm": 1.6041553020477295, | |
| "learning_rate": 0.00010786305492851769, | |
| "loss": 0.1677, | |
| "mean_token_accuracy": 0.9963443108967373, | |
| "num_tokens": 21673360.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.8476074081037885, | |
| "grad_norm": 0.9680352807044983, | |
| "learning_rate": 0.00010759969902182092, | |
| "loss": 0.1759, | |
| "mean_token_accuracy": 0.9928226534809385, | |
| "num_tokens": 21734047.0, | |
| "step": 2457 | |
| }, | |
| { | |
| "epoch": 1.8528720503901477, | |
| "grad_norm": 2.509469509124756, | |
| "learning_rate": 0.00010733634311512415, | |
| "loss": 0.1713, | |
| "mean_token_accuracy": 0.9933863058686256, | |
| "num_tokens": 21793912.0, | |
| "step": 2464 | |
| }, | |
| { | |
| "epoch": 1.8581366926765064, | |
| "grad_norm": 8.227444648742676, | |
| "learning_rate": 0.0001070729872084274, | |
| "loss": 0.2798, | |
| "mean_token_accuracy": 0.9916236741202218, | |
| "num_tokens": 21851698.0, | |
| "step": 2471 | |
| }, | |
| { | |
| "epoch": 1.8634013349628655, | |
| "grad_norm": 5.465874195098877, | |
| "learning_rate": 0.00010680963130173063, | |
| "loss": 0.2805, | |
| "mean_token_accuracy": 0.989004268177918, | |
| "num_tokens": 21911083.0, | |
| "step": 2478 | |
| }, | |
| { | |
| "epoch": 1.8686659772492245, | |
| "grad_norm": 0.6405107378959656, | |
| "learning_rate": 0.00010654627539503385, | |
| "loss": 0.3395, | |
| "mean_token_accuracy": 0.9919030549270766, | |
| "num_tokens": 21968631.0, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 1.8739306195355834, | |
| "grad_norm": 5.960853576660156, | |
| "learning_rate": 0.00010628291948833711, | |
| "loss": 0.1636, | |
| "mean_token_accuracy": 0.9951228105596134, | |
| "num_tokens": 22033866.0, | |
| "step": 2492 | |
| }, | |
| { | |
| "epoch": 1.8791952618219423, | |
| "grad_norm": 0.32846036553382874, | |
| "learning_rate": 0.00010601956358164033, | |
| "loss": 0.2903, | |
| "mean_token_accuracy": 0.9904723476086345, | |
| "num_tokens": 22098938.0, | |
| "step": 2499 | |
| }, | |
| { | |
| "epoch": 1.8844599041083012, | |
| "grad_norm": 2.838965892791748, | |
| "learning_rate": 0.00010575620767494356, | |
| "loss": 0.1872, | |
| "mean_token_accuracy": 0.9935757315584591, | |
| "num_tokens": 22165302.0, | |
| "step": 2506 | |
| }, | |
| { | |
| "epoch": 1.8897245463946601, | |
| "grad_norm": 0.23618347942829132, | |
| "learning_rate": 0.00010549285176824681, | |
| "loss": 0.2624, | |
| "mean_token_accuracy": 0.9916282934801919, | |
| "num_tokens": 22227071.0, | |
| "step": 2513 | |
| }, | |
| { | |
| "epoch": 1.894989188681019, | |
| "grad_norm": 25.132312774658203, | |
| "learning_rate": 0.00010522949586155004, | |
| "loss": 0.1328, | |
| "mean_token_accuracy": 0.9940263128706387, | |
| "num_tokens": 22291046.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.900253830967378, | |
| "grad_norm": 0.8783767223358154, | |
| "learning_rate": 0.00010496613995485327, | |
| "loss": 0.1317, | |
| "mean_token_accuracy": 0.9955138234155518, | |
| "num_tokens": 22353600.0, | |
| "step": 2527 | |
| }, | |
| { | |
| "epoch": 1.905518473253737, | |
| "grad_norm": 1.8539149761199951, | |
| "learning_rate": 0.00010470278404815652, | |
| "loss": 0.1141, | |
| "mean_token_accuracy": 0.9950261414051056, | |
| "num_tokens": 22412744.0, | |
| "step": 2534 | |
| }, | |
| { | |
| "epoch": 1.910783115540096, | |
| "grad_norm": 5.939787864685059, | |
| "learning_rate": 0.00010443942814145975, | |
| "loss": 0.2325, | |
| "mean_token_accuracy": 0.9890741993274007, | |
| "num_tokens": 22479122.0, | |
| "step": 2541 | |
| }, | |
| { | |
| "epoch": 1.9160477578264548, | |
| "grad_norm": 8.830121994018555, | |
| "learning_rate": 0.00010417607223476298, | |
| "loss": 0.3206, | |
| "mean_token_accuracy": 0.9917307570576668, | |
| "num_tokens": 22538723.0, | |
| "step": 2548 | |
| }, | |
| { | |
| "epoch": 1.921312400112814, | |
| "grad_norm": 2.3582096099853516, | |
| "learning_rate": 0.00010391271632806623, | |
| "loss": 0.2429, | |
| "mean_token_accuracy": 0.9904759057930538, | |
| "num_tokens": 22597256.0, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 1.9265770423991726, | |
| "grad_norm": 2.018383264541626, | |
| "learning_rate": 0.00010364936042136946, | |
| "loss": 0.1421, | |
| "mean_token_accuracy": 0.9935697657721383, | |
| "num_tokens": 22655270.0, | |
| "step": 2562 | |
| }, | |
| { | |
| "epoch": 1.9318416846855317, | |
| "grad_norm": 2.1314425468444824, | |
| "learning_rate": 0.00010338600451467268, | |
| "loss": 0.1593, | |
| "mean_token_accuracy": 0.9924792807017054, | |
| "num_tokens": 22723308.0, | |
| "step": 2569 | |
| }, | |
| { | |
| "epoch": 1.9371063269718904, | |
| "grad_norm": 0.9874346852302551, | |
| "learning_rate": 0.00010312264860797594, | |
| "loss": 0.0878, | |
| "mean_token_accuracy": 0.9949095217244965, | |
| "num_tokens": 22783701.0, | |
| "step": 2576 | |
| }, | |
| { | |
| "epoch": 1.9423709692582496, | |
| "grad_norm": 6.960457801818848, | |
| "learning_rate": 0.00010285929270127916, | |
| "loss": 0.1411, | |
| "mean_token_accuracy": 0.9944907841937882, | |
| "num_tokens": 22844579.0, | |
| "step": 2583 | |
| }, | |
| { | |
| "epoch": 1.9476356115446083, | |
| "grad_norm": 1.4458867311477661, | |
| "learning_rate": 0.0001025959367945824, | |
| "loss": 0.282, | |
| "mean_token_accuracy": 0.9925453599010196, | |
| "num_tokens": 22906270.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 1.9529002538309674, | |
| "grad_norm": 3.4999163150787354, | |
| "learning_rate": 0.00010233258088788564, | |
| "loss": 0.1703, | |
| "mean_token_accuracy": 0.9936967215367726, | |
| "num_tokens": 22964452.0, | |
| "step": 2597 | |
| }, | |
| { | |
| "epoch": 1.9581648961173264, | |
| "grad_norm": 1.28566312789917, | |
| "learning_rate": 0.00010206922498118887, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.9904211834073067, | |
| "num_tokens": 23024591.0, | |
| "step": 2604 | |
| }, | |
| { | |
| "epoch": 1.9634295384036853, | |
| "grad_norm": 2.335764169692993, | |
| "learning_rate": 0.0001018058690744921, | |
| "loss": 0.1865, | |
| "mean_token_accuracy": 0.9936867643679891, | |
| "num_tokens": 23084175.0, | |
| "step": 2611 | |
| }, | |
| { | |
| "epoch": 1.9686941806900442, | |
| "grad_norm": 0.8927153944969177, | |
| "learning_rate": 0.00010154251316779535, | |
| "loss": 0.1548, | |
| "mean_token_accuracy": 0.9943993336388043, | |
| "num_tokens": 23146803.0, | |
| "step": 2618 | |
| }, | |
| { | |
| "epoch": 1.9739588229764031, | |
| "grad_norm": 1.256751537322998, | |
| "learning_rate": 0.00010127915726109858, | |
| "loss": 0.2889, | |
| "mean_token_accuracy": 0.9925849459000996, | |
| "num_tokens": 23207236.0, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 1.979223465262762, | |
| "grad_norm": 3.8248612880706787, | |
| "learning_rate": 0.0001010158013544018, | |
| "loss": 0.1334, | |
| "mean_token_accuracy": 0.9925147891044617, | |
| "num_tokens": 23272939.0, | |
| "step": 2632 | |
| }, | |
| { | |
| "epoch": 1.984488107549121, | |
| "grad_norm": 3.764105796813965, | |
| "learning_rate": 0.00010075244544770506, | |
| "loss": 0.2558, | |
| "mean_token_accuracy": 0.9914434796997479, | |
| "num_tokens": 23330565.0, | |
| "step": 2639 | |
| }, | |
| { | |
| "epoch": 1.98975274983548, | |
| "grad_norm": 3.760984182357788, | |
| "learning_rate": 0.00010048908954100828, | |
| "loss": 0.0973, | |
| "mean_token_accuracy": 0.9952294485909599, | |
| "num_tokens": 23393377.0, | |
| "step": 2646 | |
| }, | |
| { | |
| "epoch": 1.9950173921218388, | |
| "grad_norm": 3.424152135848999, | |
| "learning_rate": 0.00010022573363431151, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.9893047543508666, | |
| "num_tokens": 23453009.0, | |
| "step": 2653 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.3334497213363647, | |
| "learning_rate": 9.996237772761476e-05, | |
| "loss": 0.1392, | |
| "mean_token_accuracy": 0.9946929070184816, | |
| "num_tokens": 23515736.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.005264642286359, | |
| "grad_norm": 0.4988378882408142, | |
| "learning_rate": 9.969902182091799e-05, | |
| "loss": 0.0606, | |
| "mean_token_accuracy": 0.9959707015327045, | |
| "num_tokens": 23571933.0, | |
| "step": 2667 | |
| }, | |
| { | |
| "epoch": 2.010529284572718, | |
| "grad_norm": 5.9730095863342285, | |
| "learning_rate": 9.943566591422123e-05, | |
| "loss": 0.1725, | |
| "mean_token_accuracy": 0.9923502142940249, | |
| "num_tokens": 23640038.0, | |
| "step": 2674 | |
| }, | |
| { | |
| "epoch": 2.015793926859077, | |
| "grad_norm": 0.20172494649887085, | |
| "learning_rate": 9.917231000752446e-05, | |
| "loss": 0.1574, | |
| "mean_token_accuracy": 0.9921838481511388, | |
| "num_tokens": 23702291.0, | |
| "step": 2681 | |
| }, | |
| { | |
| "epoch": 2.0210585691454357, | |
| "grad_norm": 1.6150028705596924, | |
| "learning_rate": 9.89089541008277e-05, | |
| "loss": 0.1025, | |
| "mean_token_accuracy": 0.9949785162295613, | |
| "num_tokens": 23765057.0, | |
| "step": 2688 | |
| }, | |
| { | |
| "epoch": 2.026323211431795, | |
| "grad_norm": 2.3926284313201904, | |
| "learning_rate": 9.864559819413092e-05, | |
| "loss": 0.1626, | |
| "mean_token_accuracy": 0.9933823996356556, | |
| "num_tokens": 23824138.0, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 2.0315878537181535, | |
| "grad_norm": 1.6534227132797241, | |
| "learning_rate": 9.838224228743417e-05, | |
| "loss": 0.1641, | |
| "mean_token_accuracy": 0.9940249738948685, | |
| "num_tokens": 23881746.0, | |
| "step": 2702 | |
| }, | |
| { | |
| "epoch": 2.0368524960045127, | |
| "grad_norm": 0.3249776065349579, | |
| "learning_rate": 9.81188863807374e-05, | |
| "loss": 0.1329, | |
| "mean_token_accuracy": 0.9961700716188976, | |
| "num_tokens": 23941313.0, | |
| "step": 2709 | |
| }, | |
| { | |
| "epoch": 2.0421171382908714, | |
| "grad_norm": 2.7090892791748047, | |
| "learning_rate": 9.785553047404063e-05, | |
| "loss": 0.1073, | |
| "mean_token_accuracy": 0.9949856783662524, | |
| "num_tokens": 24000859.0, | |
| "step": 2716 | |
| }, | |
| { | |
| "epoch": 2.0473817805772305, | |
| "grad_norm": 7.788426876068115, | |
| "learning_rate": 9.759217456734388e-05, | |
| "loss": 0.327, | |
| "mean_token_accuracy": 0.9870751915233476, | |
| "num_tokens": 24061983.0, | |
| "step": 2723 | |
| }, | |
| { | |
| "epoch": 2.0526464228635892, | |
| "grad_norm": 6.587625503540039, | |
| "learning_rate": 9.73288186606471e-05, | |
| "loss": 0.1927, | |
| "mean_token_accuracy": 0.9948137562189784, | |
| "num_tokens": 24120562.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.0579110651499484, | |
| "grad_norm": 1.3047232627868652, | |
| "learning_rate": 9.706546275395035e-05, | |
| "loss": 0.0827, | |
| "mean_token_accuracy": 0.995783129973071, | |
| "num_tokens": 24181195.0, | |
| "step": 2737 | |
| }, | |
| { | |
| "epoch": 2.063175707436307, | |
| "grad_norm": 0.513001024723053, | |
| "learning_rate": 9.680210684725358e-05, | |
| "loss": 0.0955, | |
| "mean_token_accuracy": 0.9948273886527333, | |
| "num_tokens": 24243524.0, | |
| "step": 2744 | |
| }, | |
| { | |
| "epoch": 2.068440349722666, | |
| "grad_norm": 3.26157808303833, | |
| "learning_rate": 9.653875094055681e-05, | |
| "loss": 0.1578, | |
| "mean_token_accuracy": 0.9953958849821772, | |
| "num_tokens": 24305643.0, | |
| "step": 2751 | |
| }, | |
| { | |
| "epoch": 2.073704992009025, | |
| "grad_norm": 1.1425021886825562, | |
| "learning_rate": 9.627539503386004e-05, | |
| "loss": 0.1372, | |
| "mean_token_accuracy": 0.9954785364014762, | |
| "num_tokens": 24370302.0, | |
| "step": 2758 | |
| }, | |
| { | |
| "epoch": 2.078969634295384, | |
| "grad_norm": 0.40805739164352417, | |
| "learning_rate": 9.601203912716329e-05, | |
| "loss": 0.0305, | |
| "mean_token_accuracy": 0.9983581410987037, | |
| "num_tokens": 24430994.0, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 2.0842342765817428, | |
| "grad_norm": 1.9562276601791382, | |
| "learning_rate": 9.574868322046652e-05, | |
| "loss": 0.1345, | |
| "mean_token_accuracy": 0.9939369314483234, | |
| "num_tokens": 24495619.0, | |
| "step": 2772 | |
| }, | |
| { | |
| "epoch": 2.089498918868102, | |
| "grad_norm": 2.0199332237243652, | |
| "learning_rate": 9.548532731376975e-05, | |
| "loss": 0.1521, | |
| "mean_token_accuracy": 0.9944149521844727, | |
| "num_tokens": 24554278.0, | |
| "step": 2779 | |
| }, | |
| { | |
| "epoch": 2.094763561154461, | |
| "grad_norm": 1.2957112789154053, | |
| "learning_rate": 9.5221971407073e-05, | |
| "loss": 0.2164, | |
| "mean_token_accuracy": 0.9946900489074844, | |
| "num_tokens": 24618087.0, | |
| "step": 2786 | |
| }, | |
| { | |
| "epoch": 2.1000282034408198, | |
| "grad_norm": 1.5025010108947754, | |
| "learning_rate": 9.495861550037622e-05, | |
| "loss": 0.1432, | |
| "mean_token_accuracy": 0.9932070214833532, | |
| "num_tokens": 24677298.0, | |
| "step": 2793 | |
| }, | |
| { | |
| "epoch": 2.105292845727179, | |
| "grad_norm": 8.254350662231445, | |
| "learning_rate": 9.469525959367947e-05, | |
| "loss": 0.1458, | |
| "mean_token_accuracy": 0.9960162724767413, | |
| "num_tokens": 24744174.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.1105574880135376, | |
| "grad_norm": 0.6135422587394714, | |
| "learning_rate": 9.44319036869827e-05, | |
| "loss": 0.0433, | |
| "mean_token_accuracy": 0.9986263726438794, | |
| "num_tokens": 24807651.0, | |
| "step": 2807 | |
| }, | |
| { | |
| "epoch": 2.1158221302998967, | |
| "grad_norm": 5.919787406921387, | |
| "learning_rate": 9.416854778028593e-05, | |
| "loss": 0.1659, | |
| "mean_token_accuracy": 0.9945694825478962, | |
| "num_tokens": 24870759.0, | |
| "step": 2814 | |
| }, | |
| { | |
| "epoch": 2.1210867725862554, | |
| "grad_norm": 0.5038792490959167, | |
| "learning_rate": 9.390519187358918e-05, | |
| "loss": 0.2204, | |
| "mean_token_accuracy": 0.9929102073822703, | |
| "num_tokens": 24936210.0, | |
| "step": 2821 | |
| }, | |
| { | |
| "epoch": 2.1263514148726146, | |
| "grad_norm": 0.3141685426235199, | |
| "learning_rate": 9.36418359668924e-05, | |
| "loss": 0.1366, | |
| "mean_token_accuracy": 0.9947129456060273, | |
| "num_tokens": 24997521.0, | |
| "step": 2828 | |
| }, | |
| { | |
| "epoch": 2.1316160571589733, | |
| "grad_norm": 52.698875427246094, | |
| "learning_rate": 9.337848006019564e-05, | |
| "loss": 0.2059, | |
| "mean_token_accuracy": 0.9949604836957795, | |
| "num_tokens": 25056423.0, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 2.1368806994453324, | |
| "grad_norm": 2.740635633468628, | |
| "learning_rate": 9.311512415349887e-05, | |
| "loss": 0.0923, | |
| "mean_token_accuracy": 0.9957705182688577, | |
| "num_tokens": 25117526.0, | |
| "step": 2842 | |
| }, | |
| { | |
| "epoch": 2.142145341731691, | |
| "grad_norm": 2.0479421615600586, | |
| "learning_rate": 9.285176824680211e-05, | |
| "loss": 0.0776, | |
| "mean_token_accuracy": 0.995914882847241, | |
| "num_tokens": 25183637.0, | |
| "step": 2849 | |
| }, | |
| { | |
| "epoch": 2.1474099840180503, | |
| "grad_norm": 12.468928337097168, | |
| "learning_rate": 9.258841234010534e-05, | |
| "loss": 0.1264, | |
| "mean_token_accuracy": 0.9953470283320972, | |
| "num_tokens": 25246913.0, | |
| "step": 2856 | |
| }, | |
| { | |
| "epoch": 2.152674626304409, | |
| "grad_norm": 0.6688656210899353, | |
| "learning_rate": 9.232505643340859e-05, | |
| "loss": 0.0904, | |
| "mean_token_accuracy": 0.9969109371304512, | |
| "num_tokens": 25310757.0, | |
| "step": 2863 | |
| }, | |
| { | |
| "epoch": 2.157939268590768, | |
| "grad_norm": 2.0603487491607666, | |
| "learning_rate": 9.206170052671182e-05, | |
| "loss": 0.09, | |
| "mean_token_accuracy": 0.9956592119165829, | |
| "num_tokens": 25371077.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.163203910877127, | |
| "grad_norm": 3.3515512943267822, | |
| "learning_rate": 9.179834462001505e-05, | |
| "loss": 0.1076, | |
| "mean_token_accuracy": 0.9962447775261742, | |
| "num_tokens": 25427971.0, | |
| "step": 2877 | |
| }, | |
| { | |
| "epoch": 2.168468553163486, | |
| "grad_norm": 1.0916286706924438, | |
| "learning_rate": 9.15349887133183e-05, | |
| "loss": 0.1053, | |
| "mean_token_accuracy": 0.9969915130308696, | |
| "num_tokens": 25491902.0, | |
| "step": 2884 | |
| }, | |
| { | |
| "epoch": 2.173733195449845, | |
| "grad_norm": 4.432465076446533, | |
| "learning_rate": 9.127163280662152e-05, | |
| "loss": 0.2201, | |
| "mean_token_accuracy": 0.9930048095328468, | |
| "num_tokens": 25558533.0, | |
| "step": 2891 | |
| }, | |
| { | |
| "epoch": 2.178997837736204, | |
| "grad_norm": 4.1964921951293945, | |
| "learning_rate": 9.100827689992476e-05, | |
| "loss": 0.1796, | |
| "mean_token_accuracy": 0.9948009103536606, | |
| "num_tokens": 25620668.0, | |
| "step": 2898 | |
| }, | |
| { | |
| "epoch": 2.184262480022563, | |
| "grad_norm": 7.1684489250183105, | |
| "learning_rate": 9.0744920993228e-05, | |
| "loss": 0.1968, | |
| "mean_token_accuracy": 0.9917373220835414, | |
| "num_tokens": 25685782.0, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 2.1895271223089217, | |
| "grad_norm": 0.7233154773712158, | |
| "learning_rate": 9.048156508653123e-05, | |
| "loss": 0.0875, | |
| "mean_token_accuracy": 0.9959975065929549, | |
| "num_tokens": 25752376.0, | |
| "step": 2912 | |
| }, | |
| { | |
| "epoch": 2.194791764595281, | |
| "grad_norm": 6.503742694854736, | |
| "learning_rate": 9.021820917983447e-05, | |
| "loss": 0.1752, | |
| "mean_token_accuracy": 0.9938797716583524, | |
| "num_tokens": 25816570.0, | |
| "step": 2919 | |
| }, | |
| { | |
| "epoch": 2.2000564068816395, | |
| "grad_norm": 7.96094274520874, | |
| "learning_rate": 8.99548532731377e-05, | |
| "loss": 0.1672, | |
| "mean_token_accuracy": 0.994648206446852, | |
| "num_tokens": 25876303.0, | |
| "step": 2926 | |
| }, | |
| { | |
| "epoch": 2.2053210491679986, | |
| "grad_norm": 1.0175583362579346, | |
| "learning_rate": 8.969149736644094e-05, | |
| "loss": 0.1783, | |
| "mean_token_accuracy": 0.9924747060452189, | |
| "num_tokens": 25934518.0, | |
| "step": 2933 | |
| }, | |
| { | |
| "epoch": 2.2105856914543573, | |
| "grad_norm": 6.466376781463623, | |
| "learning_rate": 8.942814145974417e-05, | |
| "loss": 0.1552, | |
| "mean_token_accuracy": 0.9944330666746412, | |
| "num_tokens": 25994862.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.2158503337407165, | |
| "grad_norm": 5.298808574676514, | |
| "learning_rate": 8.91647855530474e-05, | |
| "loss": 0.0565, | |
| "mean_token_accuracy": 0.9965382622820991, | |
| "num_tokens": 26059184.0, | |
| "step": 2947 | |
| }, | |
| { | |
| "epoch": 2.221114976027075, | |
| "grad_norm": 7.090015411376953, | |
| "learning_rate": 8.890142964635064e-05, | |
| "loss": 0.1446, | |
| "mean_token_accuracy": 0.9944667699081557, | |
| "num_tokens": 26121401.0, | |
| "step": 2954 | |
| }, | |
| { | |
| "epoch": 2.2263796183134343, | |
| "grad_norm": 1.296852469444275, | |
| "learning_rate": 8.863807373965388e-05, | |
| "loss": 0.1097, | |
| "mean_token_accuracy": 0.9948959595390728, | |
| "num_tokens": 26184115.0, | |
| "step": 2961 | |
| }, | |
| { | |
| "epoch": 2.231644260599793, | |
| "grad_norm": 5.6778883934021, | |
| "learning_rate": 8.837471783295712e-05, | |
| "loss": 0.1632, | |
| "mean_token_accuracy": 0.99403044794287, | |
| "num_tokens": 26243245.0, | |
| "step": 2968 | |
| }, | |
| { | |
| "epoch": 2.236908902886152, | |
| "grad_norm": 2.0528903007507324, | |
| "learning_rate": 8.811136192626035e-05, | |
| "loss": 0.2464, | |
| "mean_token_accuracy": 0.9938805997371674, | |
| "num_tokens": 26304480.0, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 2.242173545172511, | |
| "grad_norm": 1.7901172637939453, | |
| "learning_rate": 8.78480060195636e-05, | |
| "loss": 0.157, | |
| "mean_token_accuracy": 0.9940778953688485, | |
| "num_tokens": 26366887.0, | |
| "step": 2982 | |
| }, | |
| { | |
| "epoch": 2.24743818745887, | |
| "grad_norm": 0.48134365677833557, | |
| "learning_rate": 8.758465011286681e-05, | |
| "loss": 0.0863, | |
| "mean_token_accuracy": 0.9968163360442434, | |
| "num_tokens": 26430388.0, | |
| "step": 2989 | |
| }, | |
| { | |
| "epoch": 2.2527028297452287, | |
| "grad_norm": 2.1100070476531982, | |
| "learning_rate": 8.732129420617006e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9938609706504005, | |
| "num_tokens": 26489167.0, | |
| "step": 2996 | |
| }, | |
| { | |
| "epoch": 2.257967472031588, | |
| "grad_norm": 1.791630506515503, | |
| "learning_rate": 8.705793829947329e-05, | |
| "loss": 0.0884, | |
| "mean_token_accuracy": 0.9960729309490749, | |
| "num_tokens": 26552066.0, | |
| "step": 3003 | |
| }, | |
| { | |
| "epoch": 2.2632321143179466, | |
| "grad_norm": 3.231057643890381, | |
| "learning_rate": 8.679458239277652e-05, | |
| "loss": 0.155, | |
| "mean_token_accuracy": 0.9938994752509254, | |
| "num_tokens": 26609884.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.2684967566043057, | |
| "grad_norm": 2.787071466445923, | |
| "learning_rate": 8.653122648607976e-05, | |
| "loss": 0.1268, | |
| "mean_token_accuracy": 0.9960082045623234, | |
| "num_tokens": 26672894.0, | |
| "step": 3017 | |
| }, | |
| { | |
| "epoch": 2.273761398890665, | |
| "grad_norm": 3.5303075313568115, | |
| "learning_rate": 8.6267870579383e-05, | |
| "loss": 0.2094, | |
| "mean_token_accuracy": 0.9927326611110142, | |
| "num_tokens": 26732486.0, | |
| "step": 3024 | |
| }, | |
| { | |
| "epoch": 2.2790260411770236, | |
| "grad_norm": 1.8545262813568115, | |
| "learning_rate": 8.600451467268624e-05, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9955382794141769, | |
| "num_tokens": 26796774.0, | |
| "step": 3031 | |
| }, | |
| { | |
| "epoch": 2.2842906834633827, | |
| "grad_norm": 11.009295463562012, | |
| "learning_rate": 8.574115876598947e-05, | |
| "loss": 0.0957, | |
| "mean_token_accuracy": 0.9960797908050674, | |
| "num_tokens": 26856305.0, | |
| "step": 3038 | |
| }, | |
| { | |
| "epoch": 2.2895553257497414, | |
| "grad_norm": 5.196146011352539, | |
| "learning_rate": 8.547780285929271e-05, | |
| "loss": 0.1109, | |
| "mean_token_accuracy": 0.9938612484506198, | |
| "num_tokens": 26919374.0, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 2.2948199680361006, | |
| "grad_norm": 0.10862071812152863, | |
| "learning_rate": 8.521444695259593e-05, | |
| "loss": 0.1145, | |
| "mean_token_accuracy": 0.9933103120752743, | |
| "num_tokens": 26979071.0, | |
| "step": 3052 | |
| }, | |
| { | |
| "epoch": 2.3000846103224593, | |
| "grad_norm": 1.7097374200820923, | |
| "learning_rate": 8.495109104589918e-05, | |
| "loss": 0.1492, | |
| "mean_token_accuracy": 0.9936210161873272, | |
| "num_tokens": 27038851.0, | |
| "step": 3059 | |
| }, | |
| { | |
| "epoch": 2.3053492526088184, | |
| "grad_norm": 6.656505584716797, | |
| "learning_rate": 8.468773513920241e-05, | |
| "loss": 0.1534, | |
| "mean_token_accuracy": 0.9932264674987111, | |
| "num_tokens": 27100103.0, | |
| "step": 3066 | |
| }, | |
| { | |
| "epoch": 2.310613894895177, | |
| "grad_norm": 4.5811309814453125, | |
| "learning_rate": 8.442437923250564e-05, | |
| "loss": 0.1881, | |
| "mean_token_accuracy": 0.9944715765970094, | |
| "num_tokens": 27158572.0, | |
| "step": 3073 | |
| }, | |
| { | |
| "epoch": 2.3158785371815362, | |
| "grad_norm": 0.6880609393119812, | |
| "learning_rate": 8.416102332580889e-05, | |
| "loss": 0.0596, | |
| "mean_token_accuracy": 0.997020538364138, | |
| "num_tokens": 27213274.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.321143179467895, | |
| "grad_norm": 2.9210703372955322, | |
| "learning_rate": 8.389766741911211e-05, | |
| "loss": 0.2002, | |
| "mean_token_accuracy": 0.9929726900798934, | |
| "num_tokens": 27274381.0, | |
| "step": 3087 | |
| }, | |
| { | |
| "epoch": 2.326407821754254, | |
| "grad_norm": 2.0303795337677, | |
| "learning_rate": 8.363431151241536e-05, | |
| "loss": 0.065, | |
| "mean_token_accuracy": 0.9971029268843787, | |
| "num_tokens": 27337577.0, | |
| "step": 3094 | |
| }, | |
| { | |
| "epoch": 2.331672464040613, | |
| "grad_norm": 7.503274440765381, | |
| "learning_rate": 8.337095560571859e-05, | |
| "loss": 0.192, | |
| "mean_token_accuracy": 0.9927912588630404, | |
| "num_tokens": 27399255.0, | |
| "step": 3101 | |
| }, | |
| { | |
| "epoch": 2.336937106326972, | |
| "grad_norm": 0.6785974502563477, | |
| "learning_rate": 8.310759969902182e-05, | |
| "loss": 0.0789, | |
| "mean_token_accuracy": 0.9970830136111805, | |
| "num_tokens": 27458839.0, | |
| "step": 3108 | |
| }, | |
| { | |
| "epoch": 2.3422017486133306, | |
| "grad_norm": 0.4219534695148468, | |
| "learning_rate": 8.284424379232505e-05, | |
| "loss": 0.0194, | |
| "mean_token_accuracy": 0.9990826717444828, | |
| "num_tokens": 27517257.0, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 2.34746639089969, | |
| "grad_norm": 2.2100822925567627, | |
| "learning_rate": 8.25808878856283e-05, | |
| "loss": 0.1194, | |
| "mean_token_accuracy": 0.9947898355977876, | |
| "num_tokens": 27581805.0, | |
| "step": 3122 | |
| }, | |
| { | |
| "epoch": 2.352731033186049, | |
| "grad_norm": 0.08652421087026596, | |
| "learning_rate": 8.231753197893153e-05, | |
| "loss": 0.1811, | |
| "mean_token_accuracy": 0.9949411132505962, | |
| "num_tokens": 27647409.0, | |
| "step": 3129 | |
| }, | |
| { | |
| "epoch": 2.3579956754724076, | |
| "grad_norm": 0.47360947728157043, | |
| "learning_rate": 8.205417607223477e-05, | |
| "loss": 0.2265, | |
| "mean_token_accuracy": 0.9947876089385578, | |
| "num_tokens": 27708176.0, | |
| "step": 3136 | |
| }, | |
| { | |
| "epoch": 2.3632603177587663, | |
| "grad_norm": 0.6750448346138, | |
| "learning_rate": 8.179082016553801e-05, | |
| "loss": 0.0542, | |
| "mean_token_accuracy": 0.997616748724665, | |
| "num_tokens": 27767388.0, | |
| "step": 3143 | |
| }, | |
| { | |
| "epoch": 2.3685249600451255, | |
| "grad_norm": 2.9133658409118652, | |
| "learning_rate": 8.152746425884123e-05, | |
| "loss": 0.1551, | |
| "mean_token_accuracy": 0.9941770743046489, | |
| "num_tokens": 27831534.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.3737896023314846, | |
| "grad_norm": 0.8766928315162659, | |
| "learning_rate": 8.126410835214448e-05, | |
| "loss": 0.1773, | |
| "mean_token_accuracy": 0.9957404945577893, | |
| "num_tokens": 27891017.0, | |
| "step": 3157 | |
| }, | |
| { | |
| "epoch": 2.3790542446178433, | |
| "grad_norm": 0.7604002952575684, | |
| "learning_rate": 8.100075244544771e-05, | |
| "loss": 0.103, | |
| "mean_token_accuracy": 0.9960931933351925, | |
| "num_tokens": 27954163.0, | |
| "step": 3164 | |
| }, | |
| { | |
| "epoch": 2.3843188869042025, | |
| "grad_norm": 9.492481231689453, | |
| "learning_rate": 8.073739653875094e-05, | |
| "loss": 0.0592, | |
| "mean_token_accuracy": 0.9977307968905994, | |
| "num_tokens": 28016806.0, | |
| "step": 3171 | |
| }, | |
| { | |
| "epoch": 2.389583529190561, | |
| "grad_norm": 3.001394033432007, | |
| "learning_rate": 8.047404063205419e-05, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.9949189679963248, | |
| "num_tokens": 28081447.0, | |
| "step": 3178 | |
| }, | |
| { | |
| "epoch": 2.3948481714769203, | |
| "grad_norm": 11.754488945007324, | |
| "learning_rate": 8.021068472535741e-05, | |
| "loss": 0.136, | |
| "mean_token_accuracy": 0.9951529172914368, | |
| "num_tokens": 28147031.0, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 2.400112813763279, | |
| "grad_norm": 0.9783135652542114, | |
| "learning_rate": 7.994732881866065e-05, | |
| "loss": 0.1096, | |
| "mean_token_accuracy": 0.9952867382339069, | |
| "num_tokens": 28213547.0, | |
| "step": 3192 | |
| }, | |
| { | |
| "epoch": 2.405377456049638, | |
| "grad_norm": 2.644630193710327, | |
| "learning_rate": 7.968397291196389e-05, | |
| "loss": 0.2359, | |
| "mean_token_accuracy": 0.9948877319693565, | |
| "num_tokens": 28271806.0, | |
| "step": 3199 | |
| }, | |
| { | |
| "epoch": 2.410642098335997, | |
| "grad_norm": 4.04617166519165, | |
| "learning_rate": 7.942061700526712e-05, | |
| "loss": 0.0999, | |
| "mean_token_accuracy": 0.9952200927904674, | |
| "num_tokens": 28333501.0, | |
| "step": 3206 | |
| }, | |
| { | |
| "epoch": 2.415906740622356, | |
| "grad_norm": 0.1013299897313118, | |
| "learning_rate": 7.915726109857035e-05, | |
| "loss": 0.1039, | |
| "mean_token_accuracy": 0.9962451394115176, | |
| "num_tokens": 28393500.0, | |
| "step": 3213 | |
| }, | |
| { | |
| "epoch": 2.4211713829087147, | |
| "grad_norm": 0.9739857912063599, | |
| "learning_rate": 7.88939051918736e-05, | |
| "loss": 0.0987, | |
| "mean_token_accuracy": 0.9960404485464096, | |
| "num_tokens": 28454156.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.426436025195074, | |
| "grad_norm": 2.094162940979004, | |
| "learning_rate": 7.863054928517683e-05, | |
| "loss": 0.1173, | |
| "mean_token_accuracy": 0.9950579064232963, | |
| "num_tokens": 28514798.0, | |
| "step": 3227 | |
| }, | |
| { | |
| "epoch": 2.4317006674814325, | |
| "grad_norm": 0.6073592305183411, | |
| "learning_rate": 7.836719337848006e-05, | |
| "loss": 0.1377, | |
| "mean_token_accuracy": 0.9945566068802562, | |
| "num_tokens": 28580408.0, | |
| "step": 3234 | |
| }, | |
| { | |
| "epoch": 2.4369653097677917, | |
| "grad_norm": 1.9101344347000122, | |
| "learning_rate": 7.810383747178331e-05, | |
| "loss": 0.07, | |
| "mean_token_accuracy": 0.995871941958155, | |
| "num_tokens": 28642729.0, | |
| "step": 3241 | |
| }, | |
| { | |
| "epoch": 2.4422299520541504, | |
| "grad_norm": 0.7469014525413513, | |
| "learning_rate": 7.784048156508653e-05, | |
| "loss": 0.0916, | |
| "mean_token_accuracy": 0.9964812376669475, | |
| "num_tokens": 28700127.0, | |
| "step": 3248 | |
| }, | |
| { | |
| "epoch": 2.4474945943405095, | |
| "grad_norm": 2.5793464183807373, | |
| "learning_rate": 7.757712565838977e-05, | |
| "loss": 0.1088, | |
| "mean_token_accuracy": 0.9954137855342456, | |
| "num_tokens": 28758436.0, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 2.4527592366268687, | |
| "grad_norm": 2.6391632556915283, | |
| "learning_rate": 7.7313769751693e-05, | |
| "loss": 0.1774, | |
| "mean_token_accuracy": 0.993948587349483, | |
| "num_tokens": 28820402.0, | |
| "step": 3262 | |
| }, | |
| { | |
| "epoch": 2.4580238789132274, | |
| "grad_norm": 0.27291956543922424, | |
| "learning_rate": 7.705041384499624e-05, | |
| "loss": 0.0769, | |
| "mean_token_accuracy": 0.9961353284972054, | |
| "num_tokens": 28880755.0, | |
| "step": 3269 | |
| }, | |
| { | |
| "epoch": 2.4632885211995865, | |
| "grad_norm": 2.6168487071990967, | |
| "learning_rate": 7.678705793829948e-05, | |
| "loss": 0.1177, | |
| "mean_token_accuracy": 0.9948775491544178, | |
| "num_tokens": 28944490.0, | |
| "step": 3276 | |
| }, | |
| { | |
| "epoch": 2.4685531634859452, | |
| "grad_norm": 0.30814939737319946, | |
| "learning_rate": 7.652370203160272e-05, | |
| "loss": 0.0893, | |
| "mean_token_accuracy": 0.9961133790867669, | |
| "num_tokens": 29004641.0, | |
| "step": 3283 | |
| }, | |
| { | |
| "epoch": 2.4738178057723044, | |
| "grad_norm": 5.610361099243164, | |
| "learning_rate": 7.626034612490595e-05, | |
| "loss": 0.1238, | |
| "mean_token_accuracy": 0.9973568096756935, | |
| "num_tokens": 29067350.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.479082448058663, | |
| "grad_norm": 0.8641747832298279, | |
| "learning_rate": 7.599699021820918e-05, | |
| "loss": 0.0758, | |
| "mean_token_accuracy": 0.9984217445765223, | |
| "num_tokens": 29135078.0, | |
| "step": 3297 | |
| }, | |
| { | |
| "epoch": 2.484347090345022, | |
| "grad_norm": 1.6216466426849365, | |
| "learning_rate": 7.573363431151242e-05, | |
| "loss": 0.1882, | |
| "mean_token_accuracy": 0.995362747992788, | |
| "num_tokens": 29202741.0, | |
| "step": 3304 | |
| }, | |
| { | |
| "epoch": 2.489611732631381, | |
| "grad_norm": 1.475360631942749, | |
| "learning_rate": 7.547027840481565e-05, | |
| "loss": 0.1453, | |
| "mean_token_accuracy": 0.9946168033140046, | |
| "num_tokens": 29265443.0, | |
| "step": 3311 | |
| }, | |
| { | |
| "epoch": 2.49487637491774, | |
| "grad_norm": 3.3166816234588623, | |
| "learning_rate": 7.52069224981189e-05, | |
| "loss": 0.1369, | |
| "mean_token_accuracy": 0.9960187503269741, | |
| "num_tokens": 29328040.0, | |
| "step": 3318 | |
| }, | |
| { | |
| "epoch": 2.5001410172040988, | |
| "grad_norm": 0.7593300938606262, | |
| "learning_rate": 7.494356659142213e-05, | |
| "loss": 0.0738, | |
| "mean_token_accuracy": 0.997723214328289, | |
| "num_tokens": 29387131.0, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 2.505405659490458, | |
| "grad_norm": 0.220882847905159, | |
| "learning_rate": 7.468021068472536e-05, | |
| "loss": 0.0733, | |
| "mean_token_accuracy": 0.9957305842212268, | |
| "num_tokens": 29449037.0, | |
| "step": 3332 | |
| }, | |
| { | |
| "epoch": 2.510670301776817, | |
| "grad_norm": 2.9052233695983887, | |
| "learning_rate": 7.44168547780286e-05, | |
| "loss": 0.1608, | |
| "mean_token_accuracy": 0.9953676845346179, | |
| "num_tokens": 29512257.0, | |
| "step": 3339 | |
| }, | |
| { | |
| "epoch": 2.5159349440631757, | |
| "grad_norm": 1.5184600353240967, | |
| "learning_rate": 7.415349887133182e-05, | |
| "loss": 0.1091, | |
| "mean_token_accuracy": 0.9970807571496282, | |
| "num_tokens": 29571450.0, | |
| "step": 3346 | |
| }, | |
| { | |
| "epoch": 2.5211995863495344, | |
| "grad_norm": 1.856492042541504, | |
| "learning_rate": 7.389014296463507e-05, | |
| "loss": 0.1173, | |
| "mean_token_accuracy": 0.9951877625925201, | |
| "num_tokens": 29631144.0, | |
| "step": 3353 | |
| }, | |
| { | |
| "epoch": 2.5264642286358936, | |
| "grad_norm": 1.3158390522003174, | |
| "learning_rate": 7.36267870579383e-05, | |
| "loss": 0.1148, | |
| "mean_token_accuracy": 0.9937653030667987, | |
| "num_tokens": 29689035.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.5317288709222527, | |
| "grad_norm": 2.1421892642974854, | |
| "learning_rate": 7.336343115124154e-05, | |
| "loss": 0.1688, | |
| "mean_token_accuracy": 0.9960722933922496, | |
| "num_tokens": 29751756.0, | |
| "step": 3367 | |
| }, | |
| { | |
| "epoch": 2.5369935132086114, | |
| "grad_norm": 2.0183699131011963, | |
| "learning_rate": 7.310007524454477e-05, | |
| "loss": 0.0863, | |
| "mean_token_accuracy": 0.9955445549317768, | |
| "num_tokens": 29816194.0, | |
| "step": 3374 | |
| }, | |
| { | |
| "epoch": 2.54225815549497, | |
| "grad_norm": 1.0039271116256714, | |
| "learning_rate": 7.283671933784801e-05, | |
| "loss": 0.2236, | |
| "mean_token_accuracy": 0.9936398097446987, | |
| "num_tokens": 29876603.0, | |
| "step": 3381 | |
| }, | |
| { | |
| "epoch": 2.5475227977813293, | |
| "grad_norm": 4.0505452156066895, | |
| "learning_rate": 7.257336343115125e-05, | |
| "loss": 0.0836, | |
| "mean_token_accuracy": 0.9964388534426689, | |
| "num_tokens": 29936365.0, | |
| "step": 3388 | |
| }, | |
| { | |
| "epoch": 2.5527874400676884, | |
| "grad_norm": 1.4060059785842896, | |
| "learning_rate": 7.231000752445448e-05, | |
| "loss": 0.1119, | |
| "mean_token_accuracy": 0.9964170062116214, | |
| "num_tokens": 29997244.0, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 2.558052082354047, | |
| "grad_norm": 3.394388437271118, | |
| "learning_rate": 7.204665161775773e-05, | |
| "loss": 0.1724, | |
| "mean_token_accuracy": 0.9946589767932892, | |
| "num_tokens": 30057794.0, | |
| "step": 3402 | |
| }, | |
| { | |
| "epoch": 2.5633167246404063, | |
| "grad_norm": 0.78196781873703, | |
| "learning_rate": 7.178329571106094e-05, | |
| "loss": 0.1592, | |
| "mean_token_accuracy": 0.9961164561765534, | |
| "num_tokens": 30119507.0, | |
| "step": 3409 | |
| }, | |
| { | |
| "epoch": 2.568581366926765, | |
| "grad_norm": 0.3891119360923767, | |
| "learning_rate": 7.151993980436419e-05, | |
| "loss": 0.1355, | |
| "mean_token_accuracy": 0.9941473879984447, | |
| "num_tokens": 30181060.0, | |
| "step": 3416 | |
| }, | |
| { | |
| "epoch": 2.573846009213124, | |
| "grad_norm": 5.231655597686768, | |
| "learning_rate": 7.125658389766742e-05, | |
| "loss": 0.0626, | |
| "mean_token_accuracy": 0.9970739568982806, | |
| "num_tokens": 30238290.0, | |
| "step": 3423 | |
| }, | |
| { | |
| "epoch": 2.579110651499483, | |
| "grad_norm": 15.069188117980957, | |
| "learning_rate": 7.099322799097066e-05, | |
| "loss": 0.1599, | |
| "mean_token_accuracy": 0.9956659791725022, | |
| "num_tokens": 30300698.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.584375293785842, | |
| "grad_norm": 7.821376800537109, | |
| "learning_rate": 7.07298720842739e-05, | |
| "loss": 0.1033, | |
| "mean_token_accuracy": 0.9969842966113772, | |
| "num_tokens": 30367327.0, | |
| "step": 3437 | |
| }, | |
| { | |
| "epoch": 2.5896399360722007, | |
| "grad_norm": 0.07698877155780792, | |
| "learning_rate": 7.046651617757712e-05, | |
| "loss": 0.0867, | |
| "mean_token_accuracy": 0.9960279879825455, | |
| "num_tokens": 30432719.0, | |
| "step": 3444 | |
| }, | |
| { | |
| "epoch": 2.59490457835856, | |
| "grad_norm": 0.38316383957862854, | |
| "learning_rate": 7.020316027088037e-05, | |
| "loss": 0.1367, | |
| "mean_token_accuracy": 0.9958215983850616, | |
| "num_tokens": 30491113.0, | |
| "step": 3451 | |
| }, | |
| { | |
| "epoch": 2.6001692206449185, | |
| "grad_norm": 9.539909362792969, | |
| "learning_rate": 6.99398043641836e-05, | |
| "loss": 0.1478, | |
| "mean_token_accuracy": 0.9933771437832287, | |
| "num_tokens": 30549788.0, | |
| "step": 3458 | |
| }, | |
| { | |
| "epoch": 2.6054338629312777, | |
| "grad_norm": 3.2880895137786865, | |
| "learning_rate": 6.967644845748683e-05, | |
| "loss": 0.1082, | |
| "mean_token_accuracy": 0.9973175621458462, | |
| "num_tokens": 30612839.0, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 2.610698505217637, | |
| "grad_norm": 2.825924873352051, | |
| "learning_rate": 6.941309255079006e-05, | |
| "loss": 0.202, | |
| "mean_token_accuracy": 0.9937642472130912, | |
| "num_tokens": 30673599.0, | |
| "step": 3472 | |
| }, | |
| { | |
| "epoch": 2.6159631475039955, | |
| "grad_norm": 1.3274503946304321, | |
| "learning_rate": 6.914973664409331e-05, | |
| "loss": 0.0773, | |
| "mean_token_accuracy": 0.9975179433822632, | |
| "num_tokens": 30737614.0, | |
| "step": 3479 | |
| }, | |
| { | |
| "epoch": 2.621227789790354, | |
| "grad_norm": 0.2665782868862152, | |
| "learning_rate": 6.888638073739654e-05, | |
| "loss": 0.0694, | |
| "mean_token_accuracy": 0.9972060258899417, | |
| "num_tokens": 30801843.0, | |
| "step": 3486 | |
| }, | |
| { | |
| "epoch": 2.6264924320767133, | |
| "grad_norm": 6.859877586364746, | |
| "learning_rate": 6.862302483069978e-05, | |
| "loss": 0.1289, | |
| "mean_token_accuracy": 0.9961654150060245, | |
| "num_tokens": 30860977.0, | |
| "step": 3493 | |
| }, | |
| { | |
| "epoch": 2.6317570743630725, | |
| "grad_norm": 0.9966444373130798, | |
| "learning_rate": 6.835966892400302e-05, | |
| "loss": 0.0772, | |
| "mean_token_accuracy": 0.9965699538588524, | |
| "num_tokens": 30923052.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.637021716649431, | |
| "grad_norm": 1.1179418563842773, | |
| "learning_rate": 6.809631301730624e-05, | |
| "loss": 0.0719, | |
| "mean_token_accuracy": 0.9967302520360265, | |
| "num_tokens": 30986133.0, | |
| "step": 3507 | |
| }, | |
| { | |
| "epoch": 2.64228635893579, | |
| "grad_norm": 2.330273151397705, | |
| "learning_rate": 6.783295711060949e-05, | |
| "loss": 0.0627, | |
| "mean_token_accuracy": 0.9968089908361435, | |
| "num_tokens": 31045344.0, | |
| "step": 3514 | |
| }, | |
| { | |
| "epoch": 2.647551001222149, | |
| "grad_norm": 0.2781616449356079, | |
| "learning_rate": 6.756960120391272e-05, | |
| "loss": 0.1229, | |
| "mean_token_accuracy": 0.9937345342976707, | |
| "num_tokens": 31109576.0, | |
| "step": 3521 | |
| }, | |
| { | |
| "epoch": 2.652815643508508, | |
| "grad_norm": 0.6426751613616943, | |
| "learning_rate": 6.730624529721595e-05, | |
| "loss": 0.1242, | |
| "mean_token_accuracy": 0.994049035012722, | |
| "num_tokens": 31171191.0, | |
| "step": 3528 | |
| }, | |
| { | |
| "epoch": 2.658080285794867, | |
| "grad_norm": 1.9291712045669556, | |
| "learning_rate": 6.70428893905192e-05, | |
| "loss": 0.1159, | |
| "mean_token_accuracy": 0.9960754119924137, | |
| "num_tokens": 31230782.0, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 2.663344928081226, | |
| "grad_norm": 5.023970603942871, | |
| "learning_rate": 6.677953348382242e-05, | |
| "loss": 0.1802, | |
| "mean_token_accuracy": 0.993708176272256, | |
| "num_tokens": 31294772.0, | |
| "step": 3542 | |
| }, | |
| { | |
| "epoch": 2.6686095703675847, | |
| "grad_norm": 0.6795799136161804, | |
| "learning_rate": 6.651617757712566e-05, | |
| "loss": 0.1522, | |
| "mean_token_accuracy": 0.9949724322983197, | |
| "num_tokens": 31356199.0, | |
| "step": 3549 | |
| }, | |
| { | |
| "epoch": 2.673874212653944, | |
| "grad_norm": 0.4033326208591461, | |
| "learning_rate": 6.62528216704289e-05, | |
| "loss": 0.0822, | |
| "mean_token_accuracy": 0.997618304831641, | |
| "num_tokens": 31422669.0, | |
| "step": 3556 | |
| }, | |
| { | |
| "epoch": 2.6791388549403026, | |
| "grad_norm": 0.6970848441123962, | |
| "learning_rate": 6.598946576373213e-05, | |
| "loss": 0.109, | |
| "mean_token_accuracy": 0.9952883582030024, | |
| "num_tokens": 31487598.0, | |
| "step": 3563 | |
| }, | |
| { | |
| "epoch": 2.6844034972266617, | |
| "grad_norm": 2.0525119304656982, | |
| "learning_rate": 6.572610985703536e-05, | |
| "loss": 0.0431, | |
| "mean_token_accuracy": 0.9979359933308193, | |
| "num_tokens": 31554596.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 2.689668139513021, | |
| "grad_norm": 1.1314244270324707, | |
| "learning_rate": 6.546275395033861e-05, | |
| "loss": 0.0881, | |
| "mean_token_accuracy": 0.9948847900543895, | |
| "num_tokens": 31618172.0, | |
| "step": 3577 | |
| }, | |
| { | |
| "epoch": 2.6949327817993796, | |
| "grad_norm": 1.205787181854248, | |
| "learning_rate": 6.519939804364184e-05, | |
| "loss": 0.1088, | |
| "mean_token_accuracy": 0.9961233202900205, | |
| "num_tokens": 31677865.0, | |
| "step": 3584 | |
| }, | |
| { | |
| "epoch": 2.7001974240857383, | |
| "grad_norm": 3.819504737854004, | |
| "learning_rate": 6.493604213694507e-05, | |
| "loss": 0.0726, | |
| "mean_token_accuracy": 0.995971353990691, | |
| "num_tokens": 31743550.0, | |
| "step": 3591 | |
| }, | |
| { | |
| "epoch": 2.7054620663720974, | |
| "grad_norm": 1.703169345855713, | |
| "learning_rate": 6.467268623024832e-05, | |
| "loss": 0.1052, | |
| "mean_token_accuracy": 0.9947156554886273, | |
| "num_tokens": 31802765.0, | |
| "step": 3598 | |
| }, | |
| { | |
| "epoch": 2.7107267086584566, | |
| "grad_norm": 2.281057834625244, | |
| "learning_rate": 6.440933032355154e-05, | |
| "loss": 0.1096, | |
| "mean_token_accuracy": 0.9958588747041566, | |
| "num_tokens": 31867762.0, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 2.7159913509448153, | |
| "grad_norm": 0.6781365871429443, | |
| "learning_rate": 6.414597441685478e-05, | |
| "loss": 0.2936, | |
| "mean_token_accuracy": 0.9875736928411892, | |
| "num_tokens": 31929955.0, | |
| "step": 3612 | |
| }, | |
| { | |
| "epoch": 2.721255993231174, | |
| "grad_norm": 2.8714804649353027, | |
| "learning_rate": 6.388261851015802e-05, | |
| "loss": 0.087, | |
| "mean_token_accuracy": 0.996411423598017, | |
| "num_tokens": 31989377.0, | |
| "step": 3619 | |
| }, | |
| { | |
| "epoch": 2.726520635517533, | |
| "grad_norm": 0.3498064875602722, | |
| "learning_rate": 6.361926260346125e-05, | |
| "loss": 0.0625, | |
| "mean_token_accuracy": 0.9963695577212742, | |
| "num_tokens": 32048557.0, | |
| "step": 3626 | |
| }, | |
| { | |
| "epoch": 2.7317852778038922, | |
| "grad_norm": 4.420494079589844, | |
| "learning_rate": 6.33559066967645e-05, | |
| "loss": 0.101, | |
| "mean_token_accuracy": 0.9967913350888661, | |
| "num_tokens": 32114815.0, | |
| "step": 3633 | |
| }, | |
| { | |
| "epoch": 2.737049920090251, | |
| "grad_norm": 2.4144864082336426, | |
| "learning_rate": 6.309255079006773e-05, | |
| "loss": 0.134, | |
| "mean_token_accuracy": 0.994675701217992, | |
| "num_tokens": 32174880.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 2.74231456237661, | |
| "grad_norm": 0.7500758767127991, | |
| "learning_rate": 6.282919488337096e-05, | |
| "loss": 0.1471, | |
| "mean_token_accuracy": 0.9963582541261401, | |
| "num_tokens": 32234915.0, | |
| "step": 3647 | |
| }, | |
| { | |
| "epoch": 2.747579204662969, | |
| "grad_norm": 1.0065075159072876, | |
| "learning_rate": 6.256583897667419e-05, | |
| "loss": 0.104, | |
| "mean_token_accuracy": 0.9962236753531865, | |
| "num_tokens": 32296268.0, | |
| "step": 3654 | |
| }, | |
| { | |
| "epoch": 2.752843846949328, | |
| "grad_norm": 0.6879806518554688, | |
| "learning_rate": 6.230248306997743e-05, | |
| "loss": 0.2152, | |
| "mean_token_accuracy": 0.9945028296538762, | |
| "num_tokens": 32360448.0, | |
| "step": 3661 | |
| }, | |
| { | |
| "epoch": 2.7581084892356866, | |
| "grad_norm": 4.699110507965088, | |
| "learning_rate": 6.203912716328066e-05, | |
| "loss": 0.0538, | |
| "mean_token_accuracy": 0.9969492469515119, | |
| "num_tokens": 32422782.0, | |
| "step": 3668 | |
| }, | |
| { | |
| "epoch": 2.763373131522046, | |
| "grad_norm": 1.8280649185180664, | |
| "learning_rate": 6.17757712565839e-05, | |
| "loss": 0.1131, | |
| "mean_token_accuracy": 0.9955264308622905, | |
| "num_tokens": 32490477.0, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 2.7686377738084045, | |
| "grad_norm": 1.1380226612091064, | |
| "learning_rate": 6.151241534988714e-05, | |
| "loss": 0.0861, | |
| "mean_token_accuracy": 0.9977227577141353, | |
| "num_tokens": 32551893.0, | |
| "step": 3682 | |
| }, | |
| { | |
| "epoch": 2.7739024160947636, | |
| "grad_norm": 6.666886806488037, | |
| "learning_rate": 6.124905944319037e-05, | |
| "loss": 0.1502, | |
| "mean_token_accuracy": 0.9936152824333736, | |
| "num_tokens": 32615118.0, | |
| "step": 3689 | |
| }, | |
| { | |
| "epoch": 2.7791670583811223, | |
| "grad_norm": 3.979558229446411, | |
| "learning_rate": 6.098570353649361e-05, | |
| "loss": 0.1325, | |
| "mean_token_accuracy": 0.9923213102987835, | |
| "num_tokens": 32682333.0, | |
| "step": 3696 | |
| }, | |
| { | |
| "epoch": 2.7844317006674815, | |
| "grad_norm": 1.1557002067565918, | |
| "learning_rate": 6.072234762979684e-05, | |
| "loss": 0.1336, | |
| "mean_token_accuracy": 0.9942744810666356, | |
| "num_tokens": 32744014.0, | |
| "step": 3703 | |
| }, | |
| { | |
| "epoch": 2.7896963429538406, | |
| "grad_norm": 1.1009775400161743, | |
| "learning_rate": 6.045899172310008e-05, | |
| "loss": 0.0925, | |
| "mean_token_accuracy": 0.9959461561271122, | |
| "num_tokens": 32807751.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 2.7949609852401993, | |
| "grad_norm": 10.877095222473145, | |
| "learning_rate": 6.019563581640332e-05, | |
| "loss": 0.1689, | |
| "mean_token_accuracy": 0.9931018746324948, | |
| "num_tokens": 32866934.0, | |
| "step": 3717 | |
| }, | |
| { | |
| "epoch": 2.800225627526558, | |
| "grad_norm": 6.549318313598633, | |
| "learning_rate": 5.9932279909706546e-05, | |
| "loss": 0.0969, | |
| "mean_token_accuracy": 0.9969319147723061, | |
| "num_tokens": 32932072.0, | |
| "step": 3724 | |
| }, | |
| { | |
| "epoch": 2.805490269812917, | |
| "grad_norm": 0.5708068609237671, | |
| "learning_rate": 5.9668924003009785e-05, | |
| "loss": 0.0675, | |
| "mean_token_accuracy": 0.9954135449869292, | |
| "num_tokens": 32992828.0, | |
| "step": 3731 | |
| }, | |
| { | |
| "epoch": 2.8107549120992763, | |
| "grad_norm": 2.0712924003601074, | |
| "learning_rate": 5.9405568096313025e-05, | |
| "loss": 0.2076, | |
| "mean_token_accuracy": 0.9952983504959515, | |
| "num_tokens": 33053902.0, | |
| "step": 3738 | |
| }, | |
| { | |
| "epoch": 2.816019554385635, | |
| "grad_norm": 1.6891502141952515, | |
| "learning_rate": 5.914221218961625e-05, | |
| "loss": 0.1404, | |
| "mean_token_accuracy": 0.9939380894814219, | |
| "num_tokens": 33114709.0, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 2.821284196671994, | |
| "grad_norm": 3.204617977142334, | |
| "learning_rate": 5.887885628291949e-05, | |
| "loss": 0.0801, | |
| "mean_token_accuracy": 0.996879558478083, | |
| "num_tokens": 33182620.0, | |
| "step": 3752 | |
| }, | |
| { | |
| "epoch": 2.826548838958353, | |
| "grad_norm": 0.6853612065315247, | |
| "learning_rate": 5.861550037622273e-05, | |
| "loss": 0.0792, | |
| "mean_token_accuracy": 0.996274471282959, | |
| "num_tokens": 33246004.0, | |
| "step": 3759 | |
| }, | |
| { | |
| "epoch": 2.831813481244712, | |
| "grad_norm": 5.600191593170166, | |
| "learning_rate": 5.835214446952596e-05, | |
| "loss": 0.2204, | |
| "mean_token_accuracy": 0.9915649422577449, | |
| "num_tokens": 33310729.0, | |
| "step": 3766 | |
| }, | |
| { | |
| "epoch": 2.8370781235310707, | |
| "grad_norm": 0.11385060846805573, | |
| "learning_rate": 5.80887885628292e-05, | |
| "loss": 0.0693, | |
| "mean_token_accuracy": 0.9975795469113758, | |
| "num_tokens": 33373693.0, | |
| "step": 3773 | |
| }, | |
| { | |
| "epoch": 2.84234276581743, | |
| "grad_norm": 3.935316324234009, | |
| "learning_rate": 5.782543265613243e-05, | |
| "loss": 0.072, | |
| "mean_token_accuracy": 0.9963005163839885, | |
| "num_tokens": 33434883.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 2.8476074081037885, | |
| "grad_norm": 2.338846206665039, | |
| "learning_rate": 5.7562076749435666e-05, | |
| "loss": 0.0892, | |
| "mean_token_accuracy": 0.9956449451191085, | |
| "num_tokens": 33496857.0, | |
| "step": 3787 | |
| }, | |
| { | |
| "epoch": 2.8528720503901477, | |
| "grad_norm": 1.753577470779419, | |
| "learning_rate": 5.7298720842738906e-05, | |
| "loss": 0.1407, | |
| "mean_token_accuracy": 0.9935056045651436, | |
| "num_tokens": 33560834.0, | |
| "step": 3794 | |
| }, | |
| { | |
| "epoch": 2.8581366926765064, | |
| "grad_norm": 0.09232474118471146, | |
| "learning_rate": 5.703536493604213e-05, | |
| "loss": 0.0663, | |
| "mean_token_accuracy": 0.9963288988385882, | |
| "num_tokens": 33617994.0, | |
| "step": 3801 | |
| }, | |
| { | |
| "epoch": 2.8634013349628655, | |
| "grad_norm": 0.5382691025733948, | |
| "learning_rate": 5.677200902934538e-05, | |
| "loss": 0.1302, | |
| "mean_token_accuracy": 0.9940649994782039, | |
| "num_tokens": 33680204.0, | |
| "step": 3808 | |
| }, | |
| { | |
| "epoch": 2.8686659772492247, | |
| "grad_norm": 0.8829149007797241, | |
| "learning_rate": 5.650865312264862e-05, | |
| "loss": 0.0764, | |
| "mean_token_accuracy": 0.9972236560923713, | |
| "num_tokens": 33742740.0, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 2.8739306195355834, | |
| "grad_norm": 0.5694432854652405, | |
| "learning_rate": 5.624529721595184e-05, | |
| "loss": 0.1618, | |
| "mean_token_accuracy": 0.9941023790410587, | |
| "num_tokens": 33803113.0, | |
| "step": 3822 | |
| }, | |
| { | |
| "epoch": 2.879195261821942, | |
| "grad_norm": 1.8104138374328613, | |
| "learning_rate": 5.598194130925508e-05, | |
| "loss": 0.1307, | |
| "mean_token_accuracy": 0.9938526834760394, | |
| "num_tokens": 33862407.0, | |
| "step": 3829 | |
| }, | |
| { | |
| "epoch": 2.884459904108301, | |
| "grad_norm": 9.209033966064453, | |
| "learning_rate": 5.571858540255832e-05, | |
| "loss": 0.095, | |
| "mean_token_accuracy": 0.995516337454319, | |
| "num_tokens": 33921503.0, | |
| "step": 3836 | |
| }, | |
| { | |
| "epoch": 2.8897245463946604, | |
| "grad_norm": 2.056917190551758, | |
| "learning_rate": 5.545522949586155e-05, | |
| "loss": 0.1082, | |
| "mean_token_accuracy": 0.9934366803084101, | |
| "num_tokens": 33981340.0, | |
| "step": 3843 | |
| }, | |
| { | |
| "epoch": 2.894989188681019, | |
| "grad_norm": 8.990647315979004, | |
| "learning_rate": 5.519187358916479e-05, | |
| "loss": 0.114, | |
| "mean_token_accuracy": 0.9967345467635563, | |
| "num_tokens": 34046990.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.9002538309673778, | |
| "grad_norm": 1.003222942352295, | |
| "learning_rate": 5.4928517682468026e-05, | |
| "loss": 0.1405, | |
| "mean_token_accuracy": 0.9952324394668851, | |
| "num_tokens": 34106859.0, | |
| "step": 3857 | |
| }, | |
| { | |
| "epoch": 2.905518473253737, | |
| "grad_norm": 1.2694677114486694, | |
| "learning_rate": 5.466516177577126e-05, | |
| "loss": 0.0435, | |
| "mean_token_accuracy": 0.9984203289662089, | |
| "num_tokens": 34169207.0, | |
| "step": 3864 | |
| }, | |
| { | |
| "epoch": 2.910783115540096, | |
| "grad_norm": 1.1073271036148071, | |
| "learning_rate": 5.44018058690745e-05, | |
| "loss": 0.0678, | |
| "mean_token_accuracy": 0.9971352975283351, | |
| "num_tokens": 34232528.0, | |
| "step": 3871 | |
| }, | |
| { | |
| "epoch": 2.9160477578264548, | |
| "grad_norm": 0.165752112865448, | |
| "learning_rate": 5.413844996237774e-05, | |
| "loss": 0.0815, | |
| "mean_token_accuracy": 0.9959091256771769, | |
| "num_tokens": 34292644.0, | |
| "step": 3878 | |
| }, | |
| { | |
| "epoch": 2.921312400112814, | |
| "grad_norm": 1.1744028329849243, | |
| "learning_rate": 5.387509405568096e-05, | |
| "loss": 0.1459, | |
| "mean_token_accuracy": 0.9959145986608097, | |
| "num_tokens": 34354187.0, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 2.9265770423991726, | |
| "grad_norm": 1.0740725994110107, | |
| "learning_rate": 5.36117381489842e-05, | |
| "loss": 0.1995, | |
| "mean_token_accuracy": 0.9932695201465062, | |
| "num_tokens": 34415822.0, | |
| "step": 3892 | |
| }, | |
| { | |
| "epoch": 2.9318416846855317, | |
| "grad_norm": 1.9310617446899414, | |
| "learning_rate": 5.334838224228743e-05, | |
| "loss": 0.1579, | |
| "mean_token_accuracy": 0.9933696197611945, | |
| "num_tokens": 34476697.0, | |
| "step": 3899 | |
| }, | |
| { | |
| "epoch": 2.9371063269718904, | |
| "grad_norm": 0.574180543422699, | |
| "learning_rate": 5.308502633559067e-05, | |
| "loss": 0.0823, | |
| "mean_token_accuracy": 0.9970756439226014, | |
| "num_tokens": 34539612.0, | |
| "step": 3906 | |
| }, | |
| { | |
| "epoch": 2.9423709692582496, | |
| "grad_norm": 13.607620239257812, | |
| "learning_rate": 5.282167042889391e-05, | |
| "loss": 0.0932, | |
| "mean_token_accuracy": 0.9977340368287904, | |
| "num_tokens": 34598452.0, | |
| "step": 3913 | |
| }, | |
| { | |
| "epoch": 2.9476356115446083, | |
| "grad_norm": 1.2882604598999023, | |
| "learning_rate": 5.255831452219714e-05, | |
| "loss": 0.1458, | |
| "mean_token_accuracy": 0.9930107263582093, | |
| "num_tokens": 34661727.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 2.9529002538309674, | |
| "grad_norm": 0.943372368812561, | |
| "learning_rate": 5.229495861550038e-05, | |
| "loss": 0.1113, | |
| "mean_token_accuracy": 0.9966319703630039, | |
| "num_tokens": 34729738.0, | |
| "step": 3927 | |
| }, | |
| { | |
| "epoch": 2.958164896117326, | |
| "grad_norm": 0.9916655421257019, | |
| "learning_rate": 5.203160270880362e-05, | |
| "loss": 0.0953, | |
| "mean_token_accuracy": 0.99606361665896, | |
| "num_tokens": 34793897.0, | |
| "step": 3934 | |
| }, | |
| { | |
| "epoch": 2.9634295384036853, | |
| "grad_norm": 2.0784947872161865, | |
| "learning_rate": 5.1768246802106844e-05, | |
| "loss": 0.1981, | |
| "mean_token_accuracy": 0.9954314987574305, | |
| "num_tokens": 34849278.0, | |
| "step": 3941 | |
| }, | |
| { | |
| "epoch": 2.9686941806900444, | |
| "grad_norm": 0.8440742492675781, | |
| "learning_rate": 5.1504890895410084e-05, | |
| "loss": 0.1069, | |
| "mean_token_accuracy": 0.9943101214511054, | |
| "num_tokens": 34907876.0, | |
| "step": 3948 | |
| }, | |
| { | |
| "epoch": 2.973958822976403, | |
| "grad_norm": 0.37809041142463684, | |
| "learning_rate": 5.124153498871332e-05, | |
| "loss": 0.0762, | |
| "mean_token_accuracy": 0.997384498161929, | |
| "num_tokens": 34967018.0, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 2.979223465262762, | |
| "grad_norm": 1.6948941946029663, | |
| "learning_rate": 5.0978179082016555e-05, | |
| "loss": 0.0883, | |
| "mean_token_accuracy": 0.9937601664236614, | |
| "num_tokens": 35025189.0, | |
| "step": 3962 | |
| }, | |
| { | |
| "epoch": 2.984488107549121, | |
| "grad_norm": 0.9532034397125244, | |
| "learning_rate": 5.0714823175319795e-05, | |
| "loss": 0.064, | |
| "mean_token_accuracy": 0.9956995323300362, | |
| "num_tokens": 35085659.0, | |
| "step": 3969 | |
| }, | |
| { | |
| "epoch": 2.98975274983548, | |
| "grad_norm": 3.386885643005371, | |
| "learning_rate": 5.0451467268623034e-05, | |
| "loss": 0.0938, | |
| "mean_token_accuracy": 0.9941990928990501, | |
| "num_tokens": 35150487.0, | |
| "step": 3976 | |
| }, | |
| { | |
| "epoch": 2.995017392121839, | |
| "grad_norm": 0.6338557004928589, | |
| "learning_rate": 5.018811136192626e-05, | |
| "loss": 0.0582, | |
| "mean_token_accuracy": 0.9974785053304264, | |
| "num_tokens": 35216121.0, | |
| "step": 3983 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 2.1497695446014404, | |
| "learning_rate": 4.99247554552295e-05, | |
| "loss": 0.0809, | |
| "mean_token_accuracy": 0.9940776622520303, | |
| "num_tokens": 35273604.0, | |
| "step": 3990 | |
| } | |
| ], | |
| "logging_steps": 7, | |
| "max_steps": 5316, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 333, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.763439039501742e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |