| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.2453354597091675, |
| "epoch": 0.025, |
| "grad_norm": 0.7225268483161926, |
| "learning_rate": 0.0002, |
| "loss": 2.687872886657715, |
| "mean_token_accuracy": 0.514649897813797, |
| "num_tokens": 74890.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.276440054178238, |
| "epoch": 0.05, |
| "grad_norm": 0.4285995066165924, |
| "learning_rate": 0.0002, |
| "loss": 2.2105631828308105, |
| "mean_token_accuracy": 0.5571927130222321, |
| "num_tokens": 149478.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.3720857799053192, |
| "epoch": 0.075, |
| "grad_norm": 0.4019286632537842, |
| "learning_rate": 0.0002, |
| "loss": 1.8746938705444336, |
| "mean_token_accuracy": 0.5803174823522568, |
| "num_tokens": 224127.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.4278347790241241, |
| "epoch": 0.1, |
| "grad_norm": 0.3917113244533539, |
| "learning_rate": 0.0002, |
| "loss": 1.6238957643508911, |
| "mean_token_accuracy": 0.5989710986614227, |
| "num_tokens": 299184.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.408233255147934, |
| "epoch": 0.125, |
| "grad_norm": 0.2560015320777893, |
| "learning_rate": 0.0002, |
| "loss": 1.4765487909317017, |
| "mean_token_accuracy": 0.620598778128624, |
| "num_tokens": 373384.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.4224989414215088, |
| "epoch": 0.15, |
| "grad_norm": 0.17689421772956848, |
| "learning_rate": 0.0002, |
| "loss": 1.358055830001831, |
| "mean_token_accuracy": 0.6330070495605469, |
| "num_tokens": 447785.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.3971222341060638, |
| "epoch": 0.175, |
| "grad_norm": 0.13785144686698914, |
| "learning_rate": 0.0002, |
| "loss": 1.2616052627563477, |
| "mean_token_accuracy": 0.6447271108627319, |
| "num_tokens": 522258.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.340851902961731, |
| "epoch": 0.2, |
| "grad_norm": 0.13835509121418, |
| "learning_rate": 0.0002, |
| "loss": 1.1717238426208496, |
| "mean_token_accuracy": 0.656842052936554, |
| "num_tokens": 597302.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.290471225976944, |
| "epoch": 0.225, |
| "grad_norm": 0.13574543595314026, |
| "learning_rate": 0.0002, |
| "loss": 1.1026949882507324, |
| "mean_token_accuracy": 0.6640458852052689, |
| "num_tokens": 671987.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.2009779810905457, |
| "epoch": 0.25, |
| "grad_norm": 0.1603001058101654, |
| "learning_rate": 0.0002, |
| "loss": 1.021569848060608, |
| "mean_token_accuracy": 0.6854399591684341, |
| "num_tokens": 746652.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.10042205452919, |
| "epoch": 0.275, |
| "grad_norm": 0.1099264919757843, |
| "learning_rate": 0.0002, |
| "loss": 0.9563246369361877, |
| "mean_token_accuracy": 0.692952573299408, |
| "num_tokens": 821844.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.0122379958629608, |
| "epoch": 0.3, |
| "grad_norm": 0.10267413407564163, |
| "learning_rate": 0.0002, |
| "loss": 0.9180705547332764, |
| "mean_token_accuracy": 0.697370782494545, |
| "num_tokens": 896293.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 0.9369151145219803, |
| "epoch": 0.325, |
| "grad_norm": 0.09305275231599808, |
| "learning_rate": 0.0002, |
| "loss": 0.8677215576171875, |
| "mean_token_accuracy": 0.7073450535535812, |
| "num_tokens": 970585.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 0.8721449226140976, |
| "epoch": 0.35, |
| "grad_norm": 0.09551584720611572, |
| "learning_rate": 0.0002, |
| "loss": 0.8373533487319946, |
| "mean_token_accuracy": 0.7065530866384506, |
| "num_tokens": 1045873.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 0.8138987272977829, |
| "epoch": 0.375, |
| "grad_norm": 0.09821188449859619, |
| "learning_rate": 0.0002, |
| "loss": 0.8080641031265259, |
| "mean_token_accuracy": 0.7135835438966751, |
| "num_tokens": 1121090.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.782063752412796, |
| "epoch": 0.4, |
| "grad_norm": 0.09285010397434235, |
| "learning_rate": 0.0002, |
| "loss": 0.7853477001190186, |
| "mean_token_accuracy": 0.7157466858625412, |
| "num_tokens": 1196168.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 0.7581740468740463, |
| "epoch": 0.425, |
| "grad_norm": 0.0761309340596199, |
| "learning_rate": 0.0002, |
| "loss": 0.7540827393531799, |
| "mean_token_accuracy": 0.7216732352972031, |
| "num_tokens": 1271469.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 0.7492464035749435, |
| "epoch": 0.45, |
| "grad_norm": 0.08480065315961838, |
| "learning_rate": 0.0002, |
| "loss": 0.7439934015274048, |
| "mean_token_accuracy": 0.7229771614074707, |
| "num_tokens": 1345721.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 0.7309027314186096, |
| "epoch": 0.475, |
| "grad_norm": 0.09197527915239334, |
| "learning_rate": 0.0002, |
| "loss": 0.7264150381088257, |
| "mean_token_accuracy": 0.7261519432067871, |
| "num_tokens": 1421058.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 0.7090645134449005, |
| "epoch": 0.5, |
| "grad_norm": 0.08530627936124802, |
| "learning_rate": 0.0002, |
| "loss": 0.7123439311981201, |
| "mean_token_accuracy": 0.729744628071785, |
| "num_tokens": 1496487.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.6987305730581284, |
| "epoch": 0.525, |
| "grad_norm": 0.0691930428147316, |
| "learning_rate": 0.0002, |
| "loss": 0.694115400314331, |
| "mean_token_accuracy": 0.7338976860046387, |
| "num_tokens": 1571896.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 0.6976824253797531, |
| "epoch": 0.55, |
| "grad_norm": 0.06659026443958282, |
| "learning_rate": 0.0002, |
| "loss": 0.6916466951370239, |
| "mean_token_accuracy": 0.733501672744751, |
| "num_tokens": 1645975.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 0.6778732389211655, |
| "epoch": 0.575, |
| "grad_norm": 0.06560930609703064, |
| "learning_rate": 0.0002, |
| "loss": 0.6672404408454895, |
| "mean_token_accuracy": 0.7411384731531143, |
| "num_tokens": 1720725.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 0.6772548854351044, |
| "epoch": 0.6, |
| "grad_norm": 0.06431178748607635, |
| "learning_rate": 0.0002, |
| "loss": 0.6628670692443848, |
| "mean_token_accuracy": 0.7417115569114685, |
| "num_tokens": 1795630.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 0.6669440716505051, |
| "epoch": 0.625, |
| "grad_norm": 0.06317117810249329, |
| "learning_rate": 0.0002, |
| "loss": 0.6542543172836304, |
| "mean_token_accuracy": 0.7428397238254547, |
| "num_tokens": 1870446.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.6646556407213211, |
| "epoch": 0.65, |
| "grad_norm": 0.05317490175366402, |
| "learning_rate": 0.0002, |
| "loss": 0.6533851623535156, |
| "mean_token_accuracy": 0.7443498522043228, |
| "num_tokens": 1945732.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 0.6496605724096298, |
| "epoch": 0.675, |
| "grad_norm": 0.059705302119255066, |
| "learning_rate": 0.0002, |
| "loss": 0.6433347463607788, |
| "mean_token_accuracy": 0.7493544220924377, |
| "num_tokens": 2020426.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 0.6413406282663345, |
| "epoch": 0.7, |
| "grad_norm": 0.05553779378533363, |
| "learning_rate": 0.0002, |
| "loss": 0.6379414200782776, |
| "mean_token_accuracy": 0.7490545064210892, |
| "num_tokens": 2094994.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 0.6248471438884735, |
| "epoch": 0.725, |
| "grad_norm": 0.04922964423894882, |
| "learning_rate": 0.0002, |
| "loss": 0.619972825050354, |
| "mean_token_accuracy": 0.757112592458725, |
| "num_tokens": 2169826.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 0.619708925485611, |
| "epoch": 0.75, |
| "grad_norm": 0.05293005332350731, |
| "learning_rate": 0.0002, |
| "loss": 0.6181076765060425, |
| "mean_token_accuracy": 0.7557289451360703, |
| "num_tokens": 2244717.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.6241249591112137, |
| "epoch": 0.775, |
| "grad_norm": 0.05528721585869789, |
| "learning_rate": 0.0002, |
| "loss": 0.6231218576431274, |
| "mean_token_accuracy": 0.7546616345643997, |
| "num_tokens": 2319262.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 0.6172136664390564, |
| "epoch": 0.8, |
| "grad_norm": 0.05522594600915909, |
| "learning_rate": 0.0002, |
| "loss": 0.6161586046218872, |
| "mean_token_accuracy": 0.756885826587677, |
| "num_tokens": 2394260.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 0.6185073405504227, |
| "epoch": 0.825, |
| "grad_norm": 0.05178181454539299, |
| "learning_rate": 0.0002, |
| "loss": 0.615160346031189, |
| "mean_token_accuracy": 0.757054477930069, |
| "num_tokens": 2469143.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 0.6113822758197784, |
| "epoch": 0.85, |
| "grad_norm": 0.047203969210386276, |
| "learning_rate": 0.0002, |
| "loss": 0.6075619459152222, |
| "mean_token_accuracy": 0.7611373513936996, |
| "num_tokens": 2544884.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 0.6107644140720367, |
| "epoch": 0.875, |
| "grad_norm": 0.04446641355752945, |
| "learning_rate": 0.0002, |
| "loss": 0.6088961958885193, |
| "mean_token_accuracy": 0.7600347548723221, |
| "num_tokens": 2619389.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.6040280014276505, |
| "epoch": 0.9, |
| "grad_norm": 0.04550258815288544, |
| "learning_rate": 0.0002, |
| "loss": 0.6014739871025085, |
| "mean_token_accuracy": 0.7626253515481949, |
| "num_tokens": 2694103.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 0.6048439294099808, |
| "epoch": 0.925, |
| "grad_norm": 0.047757431864738464, |
| "learning_rate": 0.0002, |
| "loss": 0.6057171821594238, |
| "mean_token_accuracy": 0.760799378156662, |
| "num_tokens": 2769246.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 0.6022361516952515, |
| "epoch": 0.95, |
| "grad_norm": 0.046528495848178864, |
| "learning_rate": 0.0002, |
| "loss": 0.6049805283546448, |
| "mean_token_accuracy": 0.7588685750961304, |
| "num_tokens": 2843271.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 0.5945823639631271, |
| "epoch": 0.975, |
| "grad_norm": 0.04531135782599449, |
| "learning_rate": 0.0002, |
| "loss": 0.5926676988601685, |
| "mean_token_accuracy": 0.7653662264347076, |
| "num_tokens": 2918891.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 0.5929759591817856, |
| "epoch": 1.0, |
| "grad_norm": 0.04308256506919861, |
| "learning_rate": 0.0002, |
| "loss": 0.5950115323066711, |
| "mean_token_accuracy": 0.7647853493690491, |
| "num_tokens": 2993209.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.5922251790761948, |
| "epoch": 1.025, |
| "grad_norm": 0.03466418385505676, |
| "learning_rate": 0.0002, |
| "loss": 0.5925185084342957, |
| "mean_token_accuracy": 0.7662032544612885, |
| "num_tokens": 3067858.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 0.5973207205533981, |
| "epoch": 1.05, |
| "grad_norm": 0.04712899401783943, |
| "learning_rate": 0.0002, |
| "loss": 0.5978673696517944, |
| "mean_token_accuracy": 0.7634124308824539, |
| "num_tokens": 3142219.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 0.6021459102630615, |
| "epoch": 1.075, |
| "grad_norm": 0.038375336676836014, |
| "learning_rate": 0.0002, |
| "loss": 0.598951518535614, |
| "mean_token_accuracy": 0.7614264935255051, |
| "num_tokens": 3217036.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 0.6001265943050385, |
| "epoch": 1.1, |
| "grad_norm": 0.03852194547653198, |
| "learning_rate": 0.0002, |
| "loss": 0.596366286277771, |
| "mean_token_accuracy": 0.761825367808342, |
| "num_tokens": 3292192.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 0.5948344469070435, |
| "epoch": 1.125, |
| "grad_norm": 0.03576741740107536, |
| "learning_rate": 0.0002, |
| "loss": 0.5903418660163879, |
| "mean_token_accuracy": 0.7651200741529465, |
| "num_tokens": 3367236.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.5950423181056976, |
| "epoch": 1.15, |
| "grad_norm": 0.04135625809431076, |
| "learning_rate": 0.0002, |
| "loss": 0.5973416566848755, |
| "mean_token_accuracy": 0.7616013288497925, |
| "num_tokens": 3442037.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 0.5873961746692657, |
| "epoch": 1.175, |
| "grad_norm": 0.035329703241586685, |
| "learning_rate": 0.0002, |
| "loss": 0.5871415138244629, |
| "mean_token_accuracy": 0.7657803446054459, |
| "num_tokens": 3516585.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 0.5887223035097122, |
| "epoch": 1.2, |
| "grad_norm": 0.029021920636296272, |
| "learning_rate": 0.0002, |
| "loss": 0.5878927707672119, |
| "mean_token_accuracy": 0.764517068862915, |
| "num_tokens": 3590613.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 0.5823997408151627, |
| "epoch": 1.225, |
| "grad_norm": 0.036641936749219894, |
| "learning_rate": 0.0002, |
| "loss": 0.5807881355285645, |
| "mean_token_accuracy": 0.7683323621749878, |
| "num_tokens": 3665437.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 0.5899570137262344, |
| "epoch": 1.25, |
| "grad_norm": 0.030183902010321617, |
| "learning_rate": 0.0002, |
| "loss": 0.587297797203064, |
| "mean_token_accuracy": 0.7668341845273972, |
| "num_tokens": 3739929.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.5848294198513031, |
| "epoch": 1.275, |
| "grad_norm": 0.031763091683387756, |
| "learning_rate": 0.0002, |
| "loss": 0.5853850245475769, |
| "mean_token_accuracy": 0.7676772177219391, |
| "num_tokens": 3814610.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 0.5903092175722122, |
| "epoch": 1.3, |
| "grad_norm": 0.030446121469140053, |
| "learning_rate": 0.0002, |
| "loss": 0.5921140313148499, |
| "mean_token_accuracy": 0.7648924738168716, |
| "num_tokens": 3889259.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 0.5961786210536957, |
| "epoch": 1.325, |
| "grad_norm": 0.02807600237429142, |
| "learning_rate": 0.0002, |
| "loss": 0.5954646468162537, |
| "mean_token_accuracy": 0.7630998939275742, |
| "num_tokens": 3963853.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 0.584626317024231, |
| "epoch": 1.35, |
| "grad_norm": 0.023483913391828537, |
| "learning_rate": 0.0002, |
| "loss": 0.5831642150878906, |
| "mean_token_accuracy": 0.7670921683311462, |
| "num_tokens": 4038842.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 0.5851259380578995, |
| "epoch": 1.375, |
| "grad_norm": 0.029443850740790367, |
| "learning_rate": 0.0002, |
| "loss": 0.5853797197341919, |
| "mean_token_accuracy": 0.7661719769239426, |
| "num_tokens": 4113390.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.586825504899025, |
| "epoch": 1.4, |
| "grad_norm": 0.027121173217892647, |
| "learning_rate": 0.0002, |
| "loss": 0.587054967880249, |
| "mean_token_accuracy": 0.7664920389652252, |
| "num_tokens": 4188776.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 0.5881156474351883, |
| "epoch": 1.425, |
| "grad_norm": 0.02241705358028412, |
| "learning_rate": 0.0002, |
| "loss": 0.5837085247039795, |
| "mean_token_accuracy": 0.7671953588724136, |
| "num_tokens": 4263443.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 0.5906041860580444, |
| "epoch": 1.45, |
| "grad_norm": 0.024774691089987755, |
| "learning_rate": 0.0002, |
| "loss": 0.58786940574646, |
| "mean_token_accuracy": 0.7648984342813492, |
| "num_tokens": 4337825.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 0.5956196784973145, |
| "epoch": 1.475, |
| "grad_norm": 0.02898634597659111, |
| "learning_rate": 0.0002, |
| "loss": 0.5959815979003906, |
| "mean_token_accuracy": 0.7617602795362473, |
| "num_tokens": 4412117.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 0.5868811905384064, |
| "epoch": 1.5, |
| "grad_norm": 0.024418242275714874, |
| "learning_rate": 0.0002, |
| "loss": 0.5837817192077637, |
| "mean_token_accuracy": 0.7679703235626221, |
| "num_tokens": 4487325.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.5820131897926331, |
| "epoch": 1.525, |
| "grad_norm": 0.018590735271573067, |
| "learning_rate": 0.0002, |
| "loss": 0.5806015729904175, |
| "mean_token_accuracy": 0.7693226784467697, |
| "num_tokens": 4562700.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 0.585755005478859, |
| "epoch": 1.55, |
| "grad_norm": 0.022883402183651924, |
| "learning_rate": 0.0002, |
| "loss": 0.5838991403579712, |
| "mean_token_accuracy": 0.7651054114103317, |
| "num_tokens": 4637209.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 0.584441751241684, |
| "epoch": 1.575, |
| "grad_norm": 0.027678513899445534, |
| "learning_rate": 0.0002, |
| "loss": 0.585010826587677, |
| "mean_token_accuracy": 0.7656246721744537, |
| "num_tokens": 4712394.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 0.600657045841217, |
| "epoch": 1.6, |
| "grad_norm": 0.021883023902773857, |
| "learning_rate": 0.0002, |
| "loss": 0.6001325845718384, |
| "mean_token_accuracy": 0.7614835500717163, |
| "num_tokens": 4787928.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 0.584406390786171, |
| "epoch": 1.625, |
| "grad_norm": 0.02202012576162815, |
| "learning_rate": 0.0002, |
| "loss": 0.5836704969406128, |
| "mean_token_accuracy": 0.7675553858280182, |
| "num_tokens": 4862997.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.5805485397577286, |
| "epoch": 1.65, |
| "grad_norm": 0.02676200121641159, |
| "learning_rate": 0.0002, |
| "loss": 0.5817915201187134, |
| "mean_token_accuracy": 0.7683773785829544, |
| "num_tokens": 4937295.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 0.5826524794101715, |
| "epoch": 1.675, |
| "grad_norm": 0.0226582121104002, |
| "learning_rate": 0.0002, |
| "loss": 0.5793530941009521, |
| "mean_token_accuracy": 0.7675565928220749, |
| "num_tokens": 5012169.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 0.5945204049348831, |
| "epoch": 1.7, |
| "grad_norm": 0.023526955395936966, |
| "learning_rate": 0.0002, |
| "loss": 0.591584324836731, |
| "mean_token_accuracy": 0.7634985893964767, |
| "num_tokens": 5087480.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 0.5845358669757843, |
| "epoch": 1.725, |
| "grad_norm": 0.026141872629523277, |
| "learning_rate": 0.0002, |
| "loss": 0.5840914249420166, |
| "mean_token_accuracy": 0.7668609470129013, |
| "num_tokens": 5161892.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 0.5799373537302017, |
| "epoch": 1.75, |
| "grad_norm": 0.023512404412031174, |
| "learning_rate": 0.0002, |
| "loss": 0.5786521434783936, |
| "mean_token_accuracy": 0.7684901505708694, |
| "num_tokens": 5237346.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.5913780480623245, |
| "epoch": 1.775, |
| "grad_norm": 0.021628571674227715, |
| "learning_rate": 0.0002, |
| "loss": 0.5903242826461792, |
| "mean_token_accuracy": 0.7634606808423996, |
| "num_tokens": 5312258.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 0.5781446248292923, |
| "epoch": 1.8, |
| "grad_norm": 0.025359593331813812, |
| "learning_rate": 0.0002, |
| "loss": 0.5779482126235962, |
| "mean_token_accuracy": 0.7685064077377319, |
| "num_tokens": 5386770.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 0.5885084420442581, |
| "epoch": 1.825, |
| "grad_norm": 0.02480519749224186, |
| "learning_rate": 0.0002, |
| "loss": 0.5897427797317505, |
| "mean_token_accuracy": 0.7633197009563446, |
| "num_tokens": 5461637.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 0.5789479911327362, |
| "epoch": 1.85, |
| "grad_norm": 0.021689681336283684, |
| "learning_rate": 0.0002, |
| "loss": 0.5763558149337769, |
| "mean_token_accuracy": 0.7686384618282318, |
| "num_tokens": 5537141.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 0.5846573114395142, |
| "epoch": 1.875, |
| "grad_norm": 0.023601949214935303, |
| "learning_rate": 0.0002, |
| "loss": 0.585774302482605, |
| "mean_token_accuracy": 0.7660125941038132, |
| "num_tokens": 5611711.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.5856555849313736, |
| "epoch": 1.9, |
| "grad_norm": 0.02879919670522213, |
| "learning_rate": 0.0002, |
| "loss": 0.5847885608673096, |
| "mean_token_accuracy": 0.7661805897951126, |
| "num_tokens": 5687175.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 0.591449961066246, |
| "epoch": 1.925, |
| "grad_norm": 0.023963551968336105, |
| "learning_rate": 0.0002, |
| "loss": 0.5909919738769531, |
| "mean_token_accuracy": 0.7625879198312759, |
| "num_tokens": 5762051.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 0.5732830464839935, |
| "epoch": 1.95, |
| "grad_norm": 0.02373599074780941, |
| "learning_rate": 0.0002, |
| "loss": 0.5746445059776306, |
| "mean_token_accuracy": 0.7692500203847885, |
| "num_tokens": 5836504.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 0.5739967525005341, |
| "epoch": 1.975, |
| "grad_norm": 0.024121304973959923, |
| "learning_rate": 0.0002, |
| "loss": 0.5751599073410034, |
| "mean_token_accuracy": 0.7690701484680176, |
| "num_tokens": 5911400.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 0.5794458240270615, |
| "epoch": 2.0, |
| "grad_norm": 0.023465219885110855, |
| "learning_rate": 0.0002, |
| "loss": 0.5783512592315674, |
| "mean_token_accuracy": 0.7684450447559357, |
| "num_tokens": 5986482.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.5798445045948029, |
| "epoch": 2.025, |
| "grad_norm": 0.0209247674793005, |
| "learning_rate": 0.0002, |
| "loss": 0.5802971124649048, |
| "mean_token_accuracy": 0.7673298269510269, |
| "num_tokens": 6061119.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 0.5746374428272247, |
| "epoch": 2.05, |
| "grad_norm": 0.022763773798942566, |
| "learning_rate": 0.0002, |
| "loss": 0.572521448135376, |
| "mean_token_accuracy": 0.7688614279031754, |
| "num_tokens": 6135886.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 0.5834762305021286, |
| "epoch": 2.075, |
| "grad_norm": 0.024529799818992615, |
| "learning_rate": 0.0002, |
| "loss": 0.5825608968734741, |
| "mean_token_accuracy": 0.7661754339933395, |
| "num_tokens": 6211037.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 0.5820914059877396, |
| "epoch": 2.1, |
| "grad_norm": 0.02345711924135685, |
| "learning_rate": 0.0002, |
| "loss": 0.5759867429733276, |
| "mean_token_accuracy": 0.7681425362825394, |
| "num_tokens": 6285250.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 0.5817355811595917, |
| "epoch": 2.125, |
| "grad_norm": 0.025857318192720413, |
| "learning_rate": 0.0002, |
| "loss": 0.5796504020690918, |
| "mean_token_accuracy": 0.7672218382358551, |
| "num_tokens": 6360531.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.5870300382375717, |
| "epoch": 2.15, |
| "grad_norm": 0.01944359764456749, |
| "learning_rate": 0.0002, |
| "loss": 0.5889190435409546, |
| "mean_token_accuracy": 0.7646535336971283, |
| "num_tokens": 6434545.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 0.5707338750362396, |
| "epoch": 2.175, |
| "grad_norm": 0.022768637165427208, |
| "learning_rate": 0.0002, |
| "loss": 0.5740299224853516, |
| "mean_token_accuracy": 0.7691160142421722, |
| "num_tokens": 6509711.0, |
| "step": 87 |
| }, |
| { |
| "entropy": 0.5874478965997696, |
| "epoch": 2.2, |
| "grad_norm": 0.02508588135242462, |
| "learning_rate": 0.0002, |
| "loss": 0.5900440216064453, |
| "mean_token_accuracy": 0.7628317475318909, |
| "num_tokens": 6584583.0, |
| "step": 88 |
| }, |
| { |
| "entropy": 0.5802857577800751, |
| "epoch": 2.225, |
| "grad_norm": 0.02080141380429268, |
| "learning_rate": 0.0002, |
| "loss": 0.5806664824485779, |
| "mean_token_accuracy": 0.7670494765043259, |
| "num_tokens": 6658937.0, |
| "step": 89 |
| }, |
| { |
| "entropy": 0.5850326269865036, |
| "epoch": 2.25, |
| "grad_norm": 0.020431680604815483, |
| "learning_rate": 0.0002, |
| "loss": 0.5813099145889282, |
| "mean_token_accuracy": 0.7660593539476395, |
| "num_tokens": 6733805.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.5846189707517624, |
| "epoch": 2.275, |
| "grad_norm": 0.02515556663274765, |
| "learning_rate": 0.0002, |
| "loss": 0.5851079225540161, |
| "mean_token_accuracy": 0.7659571021795273, |
| "num_tokens": 6808609.0, |
| "step": 91 |
| }, |
| { |
| "entropy": 0.6003973633050919, |
| "epoch": 2.3, |
| "grad_norm": 0.02406417950987816, |
| "learning_rate": 0.0002, |
| "loss": 0.5949417352676392, |
| "mean_token_accuracy": 0.7613963186740875, |
| "num_tokens": 6883966.0, |
| "step": 92 |
| }, |
| { |
| "entropy": 0.574845939874649, |
| "epoch": 2.325, |
| "grad_norm": 0.025337981060147285, |
| "learning_rate": 0.0002, |
| "loss": 0.5741162896156311, |
| "mean_token_accuracy": 0.7704001069068909, |
| "num_tokens": 6957938.0, |
| "step": 93 |
| }, |
| { |
| "entropy": 0.5916647762060165, |
| "epoch": 2.35, |
| "grad_norm": 0.021806908771395683, |
| "learning_rate": 0.0002, |
| "loss": 0.5912328958511353, |
| "mean_token_accuracy": 0.7635269463062286, |
| "num_tokens": 7033240.0, |
| "step": 94 |
| }, |
| { |
| "entropy": 0.5794739425182343, |
| "epoch": 2.375, |
| "grad_norm": 0.021972037851810455, |
| "learning_rate": 0.0002, |
| "loss": 0.5759366750717163, |
| "mean_token_accuracy": 0.7700382471084595, |
| "num_tokens": 7108387.0, |
| "step": 95 |
| }, |
| { |
| "entropy": 0.5825973749160767, |
| "epoch": 2.4, |
| "grad_norm": 0.02072254940867424, |
| "learning_rate": 0.0002, |
| "loss": 0.5852139592170715, |
| "mean_token_accuracy": 0.7643989473581314, |
| "num_tokens": 7182686.0, |
| "step": 96 |
| }, |
| { |
| "entropy": 0.5752668976783752, |
| "epoch": 2.425, |
| "grad_norm": 0.02361258678138256, |
| "learning_rate": 0.0002, |
| "loss": 0.5763595104217529, |
| "mean_token_accuracy": 0.768645167350769, |
| "num_tokens": 7258669.0, |
| "step": 97 |
| }, |
| { |
| "entropy": 0.5715779960155487, |
| "epoch": 2.45, |
| "grad_norm": 0.02046627178788185, |
| "learning_rate": 0.0002, |
| "loss": 0.5750976800918579, |
| "mean_token_accuracy": 0.7683692574501038, |
| "num_tokens": 7333818.0, |
| "step": 98 |
| }, |
| { |
| "entropy": 0.5790873467922211, |
| "epoch": 2.475, |
| "grad_norm": 0.02545187622308731, |
| "learning_rate": 0.0002, |
| "loss": 0.5775801539421082, |
| "mean_token_accuracy": 0.7690194249153137, |
| "num_tokens": 7408902.0, |
| "step": 99 |
| }, |
| { |
| "entropy": 0.5897374451160431, |
| "epoch": 2.5, |
| "grad_norm": 0.021124642342329025, |
| "learning_rate": 0.0002, |
| "loss": 0.5902318358421326, |
| "mean_token_accuracy": 0.7630759179592133, |
| "num_tokens": 7483783.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.580902174115181, |
| "epoch": 2.525, |
| "grad_norm": 0.019817229360342026, |
| "learning_rate": 0.0002, |
| "loss": 0.5784831643104553, |
| "mean_token_accuracy": 0.7668700814247131, |
| "num_tokens": 7557724.0, |
| "step": 101 |
| }, |
| { |
| "entropy": 0.5834500938653946, |
| "epoch": 2.55, |
| "grad_norm": 0.024572577327489853, |
| "learning_rate": 0.0002, |
| "loss": 0.5787115097045898, |
| "mean_token_accuracy": 0.7675525993108749, |
| "num_tokens": 7633373.0, |
| "step": 102 |
| }, |
| { |
| "entropy": 0.5778897404670715, |
| "epoch": 2.575, |
| "grad_norm": 0.022201891988515854, |
| "learning_rate": 0.0002, |
| "loss": 0.5758777260780334, |
| "mean_token_accuracy": 0.767838791012764, |
| "num_tokens": 7708329.0, |
| "step": 103 |
| }, |
| { |
| "entropy": 0.578838050365448, |
| "epoch": 2.6, |
| "grad_norm": 0.02364918775856495, |
| "learning_rate": 0.0002, |
| "loss": 0.581870436668396, |
| "mean_token_accuracy": 0.7661506086587906, |
| "num_tokens": 7782911.0, |
| "step": 104 |
| }, |
| { |
| "entropy": 0.5752829909324646, |
| "epoch": 2.625, |
| "grad_norm": 0.022952446714043617, |
| "learning_rate": 0.0002, |
| "loss": 0.5794333815574646, |
| "mean_token_accuracy": 0.7660450041294098, |
| "num_tokens": 7857948.0, |
| "step": 105 |
| }, |
| { |
| "entropy": 0.5836146026849747, |
| "epoch": 2.65, |
| "grad_norm": 0.02250981330871582, |
| "learning_rate": 0.0002, |
| "loss": 0.5844818949699402, |
| "mean_token_accuracy": 0.7643773108720779, |
| "num_tokens": 7932466.0, |
| "step": 106 |
| }, |
| { |
| "entropy": 0.5810949504375458, |
| "epoch": 2.675, |
| "grad_norm": 0.021099543198943138, |
| "learning_rate": 0.0002, |
| "loss": 0.5812161564826965, |
| "mean_token_accuracy": 0.7672342509031296, |
| "num_tokens": 8007372.0, |
| "step": 107 |
| }, |
| { |
| "entropy": 0.5868075489997864, |
| "epoch": 2.7, |
| "grad_norm": 0.024328874424099922, |
| "learning_rate": 0.0002, |
| "loss": 0.583724319934845, |
| "mean_token_accuracy": 0.765077531337738, |
| "num_tokens": 8081242.0, |
| "step": 108 |
| }, |
| { |
| "entropy": 0.5784394592046738, |
| "epoch": 2.725, |
| "grad_norm": 0.023478057235479355, |
| "learning_rate": 0.0002, |
| "loss": 0.5791985988616943, |
| "mean_token_accuracy": 0.7663314342498779, |
| "num_tokens": 8155716.0, |
| "step": 109 |
| }, |
| { |
| "entropy": 0.5747242122888565, |
| "epoch": 2.75, |
| "grad_norm": 0.02284744381904602, |
| "learning_rate": 0.0002, |
| "loss": 0.5755459070205688, |
| "mean_token_accuracy": 0.7680166959762573, |
| "num_tokens": 8229840.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.574517697095871, |
| "epoch": 2.775, |
| "grad_norm": 0.022360296919941902, |
| "learning_rate": 0.0002, |
| "loss": 0.5729012489318848, |
| "mean_token_accuracy": 0.7694876343011856, |
| "num_tokens": 8305151.0, |
| "step": 111 |
| }, |
| { |
| "entropy": 0.5775211006402969, |
| "epoch": 2.8, |
| "grad_norm": 0.025003811344504356, |
| "learning_rate": 0.0002, |
| "loss": 0.5751063227653503, |
| "mean_token_accuracy": 0.7694868594408035, |
| "num_tokens": 8379669.0, |
| "step": 112 |
| }, |
| { |
| "entropy": 0.5808530151844025, |
| "epoch": 2.825, |
| "grad_norm": 0.01840745098888874, |
| "learning_rate": 0.0002, |
| "loss": 0.58048415184021, |
| "mean_token_accuracy": 0.7667081654071808, |
| "num_tokens": 8454144.0, |
| "step": 113 |
| }, |
| { |
| "entropy": 0.5671190619468689, |
| "epoch": 2.85, |
| "grad_norm": 0.024347495287656784, |
| "learning_rate": 0.0002, |
| "loss": 0.568537712097168, |
| "mean_token_accuracy": 0.7717441022396088, |
| "num_tokens": 8529127.0, |
| "step": 114 |
| }, |
| { |
| "entropy": 0.5740341693162918, |
| "epoch": 2.875, |
| "grad_norm": 0.024653296917676926, |
| "learning_rate": 0.0002, |
| "loss": 0.5723626613616943, |
| "mean_token_accuracy": 0.7696013450622559, |
| "num_tokens": 8604291.0, |
| "step": 115 |
| }, |
| { |
| "entropy": 0.5796328634023666, |
| "epoch": 2.9, |
| "grad_norm": 0.020844636484980583, |
| "learning_rate": 0.0002, |
| "loss": 0.5782836079597473, |
| "mean_token_accuracy": 0.7682057768106461, |
| "num_tokens": 8679195.0, |
| "step": 116 |
| }, |
| { |
| "entropy": 0.576483279466629, |
| "epoch": 2.925, |
| "grad_norm": 0.021920515224337578, |
| "learning_rate": 0.0002, |
| "loss": 0.578456699848175, |
| "mean_token_accuracy": 0.7670477628707886, |
| "num_tokens": 8754009.0, |
| "step": 117 |
| }, |
| { |
| "entropy": 0.5752202421426773, |
| "epoch": 2.95, |
| "grad_norm": 0.020918108522892, |
| "learning_rate": 0.0002, |
| "loss": 0.5750131011009216, |
| "mean_token_accuracy": 0.7684811949729919, |
| "num_tokens": 8828191.0, |
| "step": 118 |
| }, |
| { |
| "entropy": 0.5851098299026489, |
| "epoch": 2.975, |
| "grad_norm": 0.02478696219623089, |
| "learning_rate": 0.0002, |
| "loss": 0.5817323327064514, |
| "mean_token_accuracy": 0.7672260999679565, |
| "num_tokens": 8903936.0, |
| "step": 119 |
| }, |
| { |
| "entropy": 0.5765727013349533, |
| "epoch": 3.0, |
| "grad_norm": 0.021200377494096756, |
| "learning_rate": 0.0002, |
| "loss": 0.575070858001709, |
| "mean_token_accuracy": 0.7691281586885452, |
| "num_tokens": 8979681.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.5856994092464447, |
| "epoch": 3.025, |
| "grad_norm": 0.01982778124511242, |
| "learning_rate": 0.0002, |
| "loss": 0.5849777460098267, |
| "mean_token_accuracy": 0.7634477466344833, |
| "num_tokens": 9053813.0, |
| "step": 121 |
| }, |
| { |
| "entropy": 0.5709525793790817, |
| "epoch": 3.05, |
| "grad_norm": 0.020404471084475517, |
| "learning_rate": 0.0002, |
| "loss": 0.5730876326560974, |
| "mean_token_accuracy": 0.7704098522663116, |
| "num_tokens": 9128173.0, |
| "step": 122 |
| }, |
| { |
| "entropy": 0.5704852193593979, |
| "epoch": 3.075, |
| "grad_norm": 0.016850776970386505, |
| "learning_rate": 0.0002, |
| "loss": 0.5663577318191528, |
| "mean_token_accuracy": 0.7722858935594559, |
| "num_tokens": 9203061.0, |
| "step": 123 |
| }, |
| { |
| "entropy": 0.567479208111763, |
| "epoch": 3.1, |
| "grad_norm": 0.025294054299592972, |
| "learning_rate": 0.0002, |
| "loss": 0.5650860071182251, |
| "mean_token_accuracy": 0.7725925892591476, |
| "num_tokens": 9278418.0, |
| "step": 124 |
| }, |
| { |
| "entropy": 0.5768236368894577, |
| "epoch": 3.125, |
| "grad_norm": 0.021733148023486137, |
| "learning_rate": 0.0002, |
| "loss": 0.5766515731811523, |
| "mean_token_accuracy": 0.7679264396429062, |
| "num_tokens": 9353227.0, |
| "step": 125 |
| }, |
| { |
| "entropy": 0.5702922940254211, |
| "epoch": 3.15, |
| "grad_norm": 0.023117227479815483, |
| "learning_rate": 0.0002, |
| "loss": 0.5716854929924011, |
| "mean_token_accuracy": 0.7702013403177261, |
| "num_tokens": 9428399.0, |
| "step": 126 |
| }, |
| { |
| "entropy": 0.5861406326293945, |
| "epoch": 3.175, |
| "grad_norm": 0.02236233651638031, |
| "learning_rate": 0.0002, |
| "loss": 0.5864638090133667, |
| "mean_token_accuracy": 0.7633958756923676, |
| "num_tokens": 9503325.0, |
| "step": 127 |
| }, |
| { |
| "entropy": 0.5789273381233215, |
| "epoch": 3.2, |
| "grad_norm": 0.02411346696317196, |
| "learning_rate": 0.0002, |
| "loss": 0.5779775977134705, |
| "mean_token_accuracy": 0.7668363004922867, |
| "num_tokens": 9578044.0, |
| "step": 128 |
| }, |
| { |
| "entropy": 0.5797711908817291, |
| "epoch": 3.225, |
| "grad_norm": 0.023102540522813797, |
| "learning_rate": 0.0002, |
| "loss": 0.5821047425270081, |
| "mean_token_accuracy": 0.7657962143421173, |
| "num_tokens": 9652622.0, |
| "step": 129 |
| }, |
| { |
| "entropy": 0.5753140151500702, |
| "epoch": 3.25, |
| "grad_norm": 0.02087407372891903, |
| "learning_rate": 0.0002, |
| "loss": 0.5716829895973206, |
| "mean_token_accuracy": 0.7688267230987549, |
| "num_tokens": 9727771.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.5765914916992188, |
| "epoch": 3.275, |
| "grad_norm": 0.022741632536053658, |
| "learning_rate": 0.0002, |
| "loss": 0.5777339935302734, |
| "mean_token_accuracy": 0.7680183500051498, |
| "num_tokens": 9802572.0, |
| "step": 131 |
| }, |
| { |
| "entropy": 0.5704336613416672, |
| "epoch": 3.3, |
| "grad_norm": 0.02135850489139557, |
| "learning_rate": 0.0002, |
| "loss": 0.5711397528648376, |
| "mean_token_accuracy": 0.7697059661149979, |
| "num_tokens": 9877509.0, |
| "step": 132 |
| }, |
| { |
| "entropy": 0.5819953978061676, |
| "epoch": 3.325, |
| "grad_norm": 0.028905468061566353, |
| "learning_rate": 0.0002, |
| "loss": 0.579884946346283, |
| "mean_token_accuracy": 0.7667191326618195, |
| "num_tokens": 9951617.0, |
| "step": 133 |
| }, |
| { |
| "entropy": 0.5826835632324219, |
| "epoch": 3.35, |
| "grad_norm": 0.021706923842430115, |
| "learning_rate": 0.0002, |
| "loss": 0.5823646783828735, |
| "mean_token_accuracy": 0.764801412820816, |
| "num_tokens": 10026079.0, |
| "step": 134 |
| }, |
| { |
| "entropy": 0.5760972201824188, |
| "epoch": 3.375, |
| "grad_norm": 0.02655896358191967, |
| "learning_rate": 0.0002, |
| "loss": 0.5790044665336609, |
| "mean_token_accuracy": 0.7664503753185272, |
| "num_tokens": 10101148.0, |
| "step": 135 |
| }, |
| { |
| "entropy": 0.5685720443725586, |
| "epoch": 3.4, |
| "grad_norm": 0.02456754446029663, |
| "learning_rate": 0.0002, |
| "loss": 0.5716453790664673, |
| "mean_token_accuracy": 0.7696442306041718, |
| "num_tokens": 10175811.0, |
| "step": 136 |
| }, |
| { |
| "entropy": 0.5759570449590683, |
| "epoch": 3.425, |
| "grad_norm": 0.02254396118223667, |
| "learning_rate": 0.0002, |
| "loss": 0.5739217400550842, |
| "mean_token_accuracy": 0.7697554975748062, |
| "num_tokens": 10250838.0, |
| "step": 137 |
| }, |
| { |
| "entropy": 0.5719419866800308, |
| "epoch": 3.45, |
| "grad_norm": 0.024404190480709076, |
| "learning_rate": 0.0002, |
| "loss": 0.5725557804107666, |
| "mean_token_accuracy": 0.769346296787262, |
| "num_tokens": 10326342.0, |
| "step": 138 |
| }, |
| { |
| "entropy": 0.5715157091617584, |
| "epoch": 3.475, |
| "grad_norm": 0.022105256095528603, |
| "learning_rate": 0.0002, |
| "loss": 0.5716947317123413, |
| "mean_token_accuracy": 0.7693071365356445, |
| "num_tokens": 10402338.0, |
| "step": 139 |
| }, |
| { |
| "entropy": 0.5735979527235031, |
| "epoch": 3.5, |
| "grad_norm": 0.023778000846505165, |
| "learning_rate": 0.0002, |
| "loss": 0.5760594606399536, |
| "mean_token_accuracy": 0.7667177468538284, |
| "num_tokens": 10476752.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.5770183205604553, |
| "epoch": 3.525, |
| "grad_norm": 0.021110933274030685, |
| "learning_rate": 0.0002, |
| "loss": 0.5751050710678101, |
| "mean_token_accuracy": 0.7673051208257675, |
| "num_tokens": 10551920.0, |
| "step": 141 |
| }, |
| { |
| "entropy": 0.5748606622219086, |
| "epoch": 3.55, |
| "grad_norm": 0.020023738965392113, |
| "learning_rate": 0.0002, |
| "loss": 0.5679011940956116, |
| "mean_token_accuracy": 0.7721457779407501, |
| "num_tokens": 10626809.0, |
| "step": 142 |
| }, |
| { |
| "entropy": 0.582221657037735, |
| "epoch": 3.575, |
| "grad_norm": 0.02178809978067875, |
| "learning_rate": 0.0002, |
| "loss": 0.579848051071167, |
| "mean_token_accuracy": 0.7657678723335266, |
| "num_tokens": 10701733.0, |
| "step": 143 |
| }, |
| { |
| "entropy": 0.571207270026207, |
| "epoch": 3.6, |
| "grad_norm": 0.021556353196501732, |
| "learning_rate": 0.0002, |
| "loss": 0.5747779607772827, |
| "mean_token_accuracy": 0.7685752362012863, |
| "num_tokens": 10776164.0, |
| "step": 144 |
| }, |
| { |
| "entropy": 0.5803283900022507, |
| "epoch": 3.625, |
| "grad_norm": 0.024940941482782364, |
| "learning_rate": 0.0002, |
| "loss": 0.5839154124259949, |
| "mean_token_accuracy": 0.7644830942153931, |
| "num_tokens": 10851502.0, |
| "step": 145 |
| }, |
| { |
| "entropy": 0.5679881721735001, |
| "epoch": 3.65, |
| "grad_norm": 0.02257210575044155, |
| "learning_rate": 0.0002, |
| "loss": 0.5671518445014954, |
| "mean_token_accuracy": 0.7707392424345016, |
| "num_tokens": 10926491.0, |
| "step": 146 |
| }, |
| { |
| "entropy": 0.5873086154460907, |
| "epoch": 3.675, |
| "grad_norm": 0.024546999484300613, |
| "learning_rate": 0.0002, |
| "loss": 0.5834171772003174, |
| "mean_token_accuracy": 0.764349952340126, |
| "num_tokens": 11000892.0, |
| "step": 147 |
| }, |
| { |
| "entropy": 0.573735237121582, |
| "epoch": 3.7, |
| "grad_norm": 0.02570403181016445, |
| "learning_rate": 0.0002, |
| "loss": 0.5722212195396423, |
| "mean_token_accuracy": 0.7698929309844971, |
| "num_tokens": 11075461.0, |
| "step": 148 |
| }, |
| { |
| "entropy": 0.5838541835546494, |
| "epoch": 3.725, |
| "grad_norm": 0.021784571930766106, |
| "learning_rate": 0.0002, |
| "loss": 0.5852305889129639, |
| "mean_token_accuracy": 0.7646767646074295, |
| "num_tokens": 11149906.0, |
| "step": 149 |
| }, |
| { |
| "entropy": 0.5819535553455353, |
| "epoch": 3.75, |
| "grad_norm": 0.023919865489006042, |
| "learning_rate": 0.0002, |
| "loss": 0.5841096639633179, |
| "mean_token_accuracy": 0.7660493403673172, |
| "num_tokens": 11224427.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.5729077309370041, |
| "epoch": 3.775, |
| "grad_norm": 0.019240032881498337, |
| "learning_rate": 0.0002, |
| "loss": 0.5747278332710266, |
| "mean_token_accuracy": 0.7695352137088776, |
| "num_tokens": 11299373.0, |
| "step": 151 |
| }, |
| { |
| "entropy": 0.5640138536691666, |
| "epoch": 3.8, |
| "grad_norm": 0.022750195115804672, |
| "learning_rate": 0.0002, |
| "loss": 0.5636758804321289, |
| "mean_token_accuracy": 0.7727017253637314, |
| "num_tokens": 11373929.0, |
| "step": 152 |
| }, |
| { |
| "entropy": 0.5785274505615234, |
| "epoch": 3.825, |
| "grad_norm": 0.024555128067731857, |
| "learning_rate": 0.0002, |
| "loss": 0.577983021736145, |
| "mean_token_accuracy": 0.7675963938236237, |
| "num_tokens": 11448658.0, |
| "step": 153 |
| }, |
| { |
| "entropy": 0.5797367095947266, |
| "epoch": 3.85, |
| "grad_norm": 0.02360512688755989, |
| "learning_rate": 0.0002, |
| "loss": 0.5788124799728394, |
| "mean_token_accuracy": 0.7675672024488449, |
| "num_tokens": 11522296.0, |
| "step": 154 |
| }, |
| { |
| "entropy": 0.5766919553279877, |
| "epoch": 3.875, |
| "grad_norm": 0.020860835909843445, |
| "learning_rate": 0.0002, |
| "loss": 0.5760090351104736, |
| "mean_token_accuracy": 0.7677243202924728, |
| "num_tokens": 11596689.0, |
| "step": 155 |
| }, |
| { |
| "entropy": 0.5780852437019348, |
| "epoch": 3.9, |
| "grad_norm": 0.021970726549625397, |
| "learning_rate": 0.0002, |
| "loss": 0.5821795463562012, |
| "mean_token_accuracy": 0.765151247382164, |
| "num_tokens": 11670420.0, |
| "step": 156 |
| }, |
| { |
| "entropy": 0.5896212756633759, |
| "epoch": 3.925, |
| "grad_norm": 0.025580603629350662, |
| "learning_rate": 0.0002, |
| "loss": 0.5879545211791992, |
| "mean_token_accuracy": 0.7629344761371613, |
| "num_tokens": 11746549.0, |
| "step": 157 |
| }, |
| { |
| "entropy": 0.5752788335084915, |
| "epoch": 3.95, |
| "grad_norm": 0.02031378634274006, |
| "learning_rate": 0.0002, |
| "loss": 0.5733282566070557, |
| "mean_token_accuracy": 0.768017366528511, |
| "num_tokens": 11822165.0, |
| "step": 158 |
| }, |
| { |
| "entropy": 0.5673830062150955, |
| "epoch": 3.975, |
| "grad_norm": 0.023106930777430534, |
| "learning_rate": 0.0002, |
| "loss": 0.5672657489776611, |
| "mean_token_accuracy": 0.7714889943599701, |
| "num_tokens": 11897985.0, |
| "step": 159 |
| }, |
| { |
| "entropy": 0.5763387382030487, |
| "epoch": 4.0, |
| "grad_norm": 0.02034103125333786, |
| "learning_rate": 0.0002, |
| "loss": 0.5772510766983032, |
| "mean_token_accuracy": 0.7671704441308975, |
| "num_tokens": 11972903.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.5695903152227402, |
| "epoch": 4.025, |
| "grad_norm": 0.021100850775837898, |
| "learning_rate": 0.0002, |
| "loss": 0.5693217515945435, |
| "mean_token_accuracy": 0.7699502855539322, |
| "num_tokens": 12047693.0, |
| "step": 161 |
| }, |
| { |
| "entropy": 0.573061928153038, |
| "epoch": 4.05, |
| "grad_norm": 0.021061765030026436, |
| "learning_rate": 0.0002, |
| "loss": 0.57159024477005, |
| "mean_token_accuracy": 0.7694694995880127, |
| "num_tokens": 12122067.0, |
| "step": 162 |
| }, |
| { |
| "entropy": 0.5698549449443817, |
| "epoch": 4.075, |
| "grad_norm": 0.025176256895065308, |
| "learning_rate": 0.0002, |
| "loss": 0.5731710195541382, |
| "mean_token_accuracy": 0.7683273702859879, |
| "num_tokens": 12196269.0, |
| "step": 163 |
| }, |
| { |
| "entropy": 0.5690735876560211, |
| "epoch": 4.1, |
| "grad_norm": 0.02089373581111431, |
| "learning_rate": 0.0002, |
| "loss": 0.5714013576507568, |
| "mean_token_accuracy": 0.7697489559650421, |
| "num_tokens": 12270853.0, |
| "step": 164 |
| }, |
| { |
| "entropy": 0.5816214233636856, |
| "epoch": 4.125, |
| "grad_norm": 0.02240598015487194, |
| "learning_rate": 0.0002, |
| "loss": 0.5786125063896179, |
| "mean_token_accuracy": 0.7670575529336929, |
| "num_tokens": 12345733.0, |
| "step": 165 |
| }, |
| { |
| "entropy": 0.5852687507867813, |
| "epoch": 4.15, |
| "grad_norm": 0.023174043744802475, |
| "learning_rate": 0.0002, |
| "loss": 0.58061683177948, |
| "mean_token_accuracy": 0.7636701017618179, |
| "num_tokens": 12419829.0, |
| "step": 166 |
| }, |
| { |
| "entropy": 0.571011945605278, |
| "epoch": 4.175, |
| "grad_norm": 0.022563502192497253, |
| "learning_rate": 0.0002, |
| "loss": 0.5689563751220703, |
| "mean_token_accuracy": 0.7716156244277954, |
| "num_tokens": 12494264.0, |
| "step": 167 |
| }, |
| { |
| "entropy": 0.5736175775527954, |
| "epoch": 4.2, |
| "grad_norm": 0.02212107926607132, |
| "learning_rate": 0.0002, |
| "loss": 0.5761622190475464, |
| "mean_token_accuracy": 0.7677205204963684, |
| "num_tokens": 12569157.0, |
| "step": 168 |
| }, |
| { |
| "entropy": 0.5656485557556152, |
| "epoch": 4.225, |
| "grad_norm": 0.02473953552544117, |
| "learning_rate": 0.0002, |
| "loss": 0.5689172744750977, |
| "mean_token_accuracy": 0.771388441324234, |
| "num_tokens": 12644204.0, |
| "step": 169 |
| }, |
| { |
| "entropy": 0.5819400250911713, |
| "epoch": 4.25, |
| "grad_norm": 0.024174660444259644, |
| "learning_rate": 0.0002, |
| "loss": 0.5831934213638306, |
| "mean_token_accuracy": 0.7642627954483032, |
| "num_tokens": 12719199.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.5756211876869202, |
| "epoch": 4.275, |
| "grad_norm": 0.019957805052399635, |
| "learning_rate": 0.0002, |
| "loss": 0.5719864964485168, |
| "mean_token_accuracy": 0.7682519555091858, |
| "num_tokens": 12793696.0, |
| "step": 171 |
| }, |
| { |
| "entropy": 0.5758868604898453, |
| "epoch": 4.3, |
| "grad_norm": 0.02505411207675934, |
| "learning_rate": 0.0002, |
| "loss": 0.5705811977386475, |
| "mean_token_accuracy": 0.7688630670309067, |
| "num_tokens": 12868020.0, |
| "step": 172 |
| }, |
| { |
| "entropy": 0.576561376452446, |
| "epoch": 4.325, |
| "grad_norm": 0.02111932635307312, |
| "learning_rate": 0.0002, |
| "loss": 0.5742412805557251, |
| "mean_token_accuracy": 0.7686543017625809, |
| "num_tokens": 12943124.0, |
| "step": 173 |
| }, |
| { |
| "entropy": 0.5648486465215683, |
| "epoch": 4.35, |
| "grad_norm": 0.024696264415979385, |
| "learning_rate": 0.0002, |
| "loss": 0.5707447528839111, |
| "mean_token_accuracy": 0.7711144238710403, |
| "num_tokens": 13018094.0, |
| "step": 174 |
| }, |
| { |
| "entropy": 0.550954133272171, |
| "epoch": 4.375, |
| "grad_norm": 0.021990923210978508, |
| "learning_rate": 0.0002, |
| "loss": 0.5539791584014893, |
| "mean_token_accuracy": 0.7755307257175446, |
| "num_tokens": 13093256.0, |
| "step": 175 |
| }, |
| { |
| "entropy": 0.571951225399971, |
| "epoch": 4.4, |
| "grad_norm": 0.021349789574742317, |
| "learning_rate": 0.0002, |
| "loss": 0.5731101036071777, |
| "mean_token_accuracy": 0.7676120102405548, |
| "num_tokens": 13167474.0, |
| "step": 176 |
| }, |
| { |
| "entropy": 0.5779466480016708, |
| "epoch": 4.425, |
| "grad_norm": 0.02244136668741703, |
| "learning_rate": 0.0002, |
| "loss": 0.5748851895332336, |
| "mean_token_accuracy": 0.7674881815910339, |
| "num_tokens": 13243109.0, |
| "step": 177 |
| }, |
| { |
| "entropy": 0.570502832531929, |
| "epoch": 4.45, |
| "grad_norm": 0.021098149940371513, |
| "learning_rate": 0.0002, |
| "loss": 0.5683890581130981, |
| "mean_token_accuracy": 0.7724465280771255, |
| "num_tokens": 13317893.0, |
| "step": 178 |
| }, |
| { |
| "entropy": 0.5695969015359879, |
| "epoch": 4.475, |
| "grad_norm": 0.02162528783082962, |
| "learning_rate": 0.0002, |
| "loss": 0.5705981254577637, |
| "mean_token_accuracy": 0.768608570098877, |
| "num_tokens": 13392379.0, |
| "step": 179 |
| }, |
| { |
| "entropy": 0.5717011392116547, |
| "epoch": 4.5, |
| "grad_norm": 0.0223979689180851, |
| "learning_rate": 0.0002, |
| "loss": 0.5761755108833313, |
| "mean_token_accuracy": 0.7661348432302475, |
| "num_tokens": 13467500.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.5750777423381805, |
| "epoch": 4.525, |
| "grad_norm": 0.0206700898706913, |
| "learning_rate": 0.0002, |
| "loss": 0.5748052597045898, |
| "mean_token_accuracy": 0.7673143148422241, |
| "num_tokens": 13542205.0, |
| "step": 181 |
| }, |
| { |
| "entropy": 0.5675535202026367, |
| "epoch": 4.55, |
| "grad_norm": 0.021973995491862297, |
| "learning_rate": 0.0002, |
| "loss": 0.563953697681427, |
| "mean_token_accuracy": 0.7722303122282028, |
| "num_tokens": 13616628.0, |
| "step": 182 |
| }, |
| { |
| "entropy": 0.5718803107738495, |
| "epoch": 4.575, |
| "grad_norm": 0.021145911887288094, |
| "learning_rate": 0.0002, |
| "loss": 0.5688813924789429, |
| "mean_token_accuracy": 0.7704852521419525, |
| "num_tokens": 13691940.0, |
| "step": 183 |
| }, |
| { |
| "entropy": 0.5774464905261993, |
| "epoch": 4.6, |
| "grad_norm": 0.021537618711590767, |
| "learning_rate": 0.0002, |
| "loss": 0.5771785974502563, |
| "mean_token_accuracy": 0.7655858248472214, |
| "num_tokens": 13767043.0, |
| "step": 184 |
| }, |
| { |
| "entropy": 0.570703387260437, |
| "epoch": 4.625, |
| "grad_norm": 0.02538282983005047, |
| "learning_rate": 0.0002, |
| "loss": 0.5734485387802124, |
| "mean_token_accuracy": 0.7683311551809311, |
| "num_tokens": 13842107.0, |
| "step": 185 |
| }, |
| { |
| "entropy": 0.5709679424762726, |
| "epoch": 4.65, |
| "grad_norm": 0.024409880861639977, |
| "learning_rate": 0.0002, |
| "loss": 0.5723483562469482, |
| "mean_token_accuracy": 0.7690693438053131, |
| "num_tokens": 13916757.0, |
| "step": 186 |
| }, |
| { |
| "entropy": 0.5667127072811127, |
| "epoch": 4.675, |
| "grad_norm": 0.02431379444897175, |
| "learning_rate": 0.0002, |
| "loss": 0.5675520896911621, |
| "mean_token_accuracy": 0.7711669653654099, |
| "num_tokens": 13992850.0, |
| "step": 187 |
| }, |
| { |
| "entropy": 0.5858957320451736, |
| "epoch": 4.7, |
| "grad_norm": 0.02329982817173004, |
| "learning_rate": 0.0002, |
| "loss": 0.5802958011627197, |
| "mean_token_accuracy": 0.7663194984197617, |
| "num_tokens": 14068227.0, |
| "step": 188 |
| }, |
| { |
| "entropy": 0.5811503231525421, |
| "epoch": 4.725, |
| "grad_norm": 0.025335390120744705, |
| "learning_rate": 0.0002, |
| "loss": 0.5782935619354248, |
| "mean_token_accuracy": 0.7669466435909271, |
| "num_tokens": 14142904.0, |
| "step": 189 |
| }, |
| { |
| "entropy": 0.5707950592041016, |
| "epoch": 4.75, |
| "grad_norm": 0.02279096655547619, |
| "learning_rate": 0.0002, |
| "loss": 0.5738247632980347, |
| "mean_token_accuracy": 0.7678002119064331, |
| "num_tokens": 14217940.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.5689068585634232, |
| "epoch": 4.775, |
| "grad_norm": 0.028375349938869476, |
| "learning_rate": 0.0002, |
| "loss": 0.5795305371284485, |
| "mean_token_accuracy": 0.7658824324607849, |
| "num_tokens": 14292463.0, |
| "step": 191 |
| }, |
| { |
| "entropy": 0.5779102444648743, |
| "epoch": 4.8, |
| "grad_norm": 0.019591832533478737, |
| "learning_rate": 0.0002, |
| "loss": 0.5775682330131531, |
| "mean_token_accuracy": 0.7678920924663544, |
| "num_tokens": 14367428.0, |
| "step": 192 |
| }, |
| { |
| "entropy": 0.5796025097370148, |
| "epoch": 4.825, |
| "grad_norm": 0.024824826046824455, |
| "learning_rate": 0.0002, |
| "loss": 0.5707208514213562, |
| "mean_token_accuracy": 0.7707486748695374, |
| "num_tokens": 14442526.0, |
| "step": 193 |
| }, |
| { |
| "entropy": 0.574284628033638, |
| "epoch": 4.85, |
| "grad_norm": 0.021157678216695786, |
| "learning_rate": 0.0002, |
| "loss": 0.5702036023139954, |
| "mean_token_accuracy": 0.7699166387319565, |
| "num_tokens": 14517364.0, |
| "step": 194 |
| }, |
| { |
| "entropy": 0.5729261040687561, |
| "epoch": 4.875, |
| "grad_norm": 0.025306105613708496, |
| "learning_rate": 0.0002, |
| "loss": 0.5750659704208374, |
| "mean_token_accuracy": 0.7673707902431488, |
| "num_tokens": 14592405.0, |
| "step": 195 |
| }, |
| { |
| "entropy": 0.5620183199644089, |
| "epoch": 4.9, |
| "grad_norm": 0.025408228859305382, |
| "learning_rate": 0.0002, |
| "loss": 0.5673909783363342, |
| "mean_token_accuracy": 0.7702877819538116, |
| "num_tokens": 14667053.0, |
| "step": 196 |
| }, |
| { |
| "entropy": 0.5667766779661179, |
| "epoch": 4.925, |
| "grad_norm": 0.024316171184182167, |
| "learning_rate": 0.0002, |
| "loss": 0.5681411027908325, |
| "mean_token_accuracy": 0.7717499136924744, |
| "num_tokens": 14742222.0, |
| "step": 197 |
| }, |
| { |
| "entropy": 0.5805934369564056, |
| "epoch": 4.95, |
| "grad_norm": 0.02220967784523964, |
| "learning_rate": 0.0002, |
| "loss": 0.5777135491371155, |
| "mean_token_accuracy": 0.7674207240343094, |
| "num_tokens": 14817012.0, |
| "step": 198 |
| }, |
| { |
| "entropy": 0.573724776506424, |
| "epoch": 4.975, |
| "grad_norm": 0.02526751719415188, |
| "learning_rate": 0.0002, |
| "loss": 0.5700376033782959, |
| "mean_token_accuracy": 0.7695773392915726, |
| "num_tokens": 14891805.0, |
| "step": 199 |
| }, |
| { |
| "entropy": 0.5778943598270416, |
| "epoch": 5.0, |
| "grad_norm": 0.021638575941324234, |
| "learning_rate": 0.0002, |
| "loss": 0.5796632766723633, |
| "mean_token_accuracy": 0.7665348052978516, |
| "num_tokens": 14966136.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.5613097548484802, |
| "epoch": 5.025, |
| "grad_norm": 0.021904166787862778, |
| "learning_rate": 0.0002, |
| "loss": 0.5612062215805054, |
| "mean_token_accuracy": 0.7737284600734711, |
| "num_tokens": 15040385.0, |
| "step": 201 |
| }, |
| { |
| "entropy": 0.5623792111873627, |
| "epoch": 5.05, |
| "grad_norm": 0.02356012538075447, |
| "learning_rate": 0.0002, |
| "loss": 0.5624409317970276, |
| "mean_token_accuracy": 0.7724986523389816, |
| "num_tokens": 15115257.0, |
| "step": 202 |
| }, |
| { |
| "entropy": 0.5548672676086426, |
| "epoch": 5.075, |
| "grad_norm": 0.02421456202864647, |
| "learning_rate": 0.0002, |
| "loss": 0.5566623210906982, |
| "mean_token_accuracy": 0.7740187644958496, |
| "num_tokens": 15190279.0, |
| "step": 203 |
| }, |
| { |
| "entropy": 0.5734377503395081, |
| "epoch": 5.1, |
| "grad_norm": 0.027081554755568504, |
| "learning_rate": 0.0002, |
| "loss": 0.5723409652709961, |
| "mean_token_accuracy": 0.7683208882808685, |
| "num_tokens": 15265534.0, |
| "step": 204 |
| }, |
| { |
| "entropy": 0.5770121663808823, |
| "epoch": 5.125, |
| "grad_norm": 0.025843461975455284, |
| "learning_rate": 0.0002, |
| "loss": 0.5737386345863342, |
| "mean_token_accuracy": 0.768302395939827, |
| "num_tokens": 15340204.0, |
| "step": 205 |
| }, |
| { |
| "entropy": 0.5695698410272598, |
| "epoch": 5.15, |
| "grad_norm": 0.024712897837162018, |
| "learning_rate": 0.0002, |
| "loss": 0.566956639289856, |
| "mean_token_accuracy": 0.7712061703205109, |
| "num_tokens": 15415225.0, |
| "step": 206 |
| }, |
| { |
| "entropy": 0.5699747204780579, |
| "epoch": 5.175, |
| "grad_norm": 0.02740584686398506, |
| "learning_rate": 0.0002, |
| "loss": 0.5699794888496399, |
| "mean_token_accuracy": 0.7703876197338104, |
| "num_tokens": 15490117.0, |
| "step": 207 |
| }, |
| { |
| "entropy": 0.5615235567092896, |
| "epoch": 5.2, |
| "grad_norm": 0.02705363929271698, |
| "learning_rate": 0.0002, |
| "loss": 0.5620254278182983, |
| "mean_token_accuracy": 0.7728376239538193, |
| "num_tokens": 15564960.0, |
| "step": 208 |
| }, |
| { |
| "entropy": 0.571983590722084, |
| "epoch": 5.225, |
| "grad_norm": 0.02741997316479683, |
| "learning_rate": 0.0002, |
| "loss": 0.5753256678581238, |
| "mean_token_accuracy": 0.7678238153457642, |
| "num_tokens": 15640346.0, |
| "step": 209 |
| }, |
| { |
| "entropy": 0.5734784454107285, |
| "epoch": 5.25, |
| "grad_norm": 0.026802683249115944, |
| "learning_rate": 0.0002, |
| "loss": 0.573603630065918, |
| "mean_token_accuracy": 0.767236202955246, |
| "num_tokens": 15715092.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.581208124756813, |
| "epoch": 5.275, |
| "grad_norm": 0.024372655898332596, |
| "learning_rate": 0.0002, |
| "loss": 0.5770745277404785, |
| "mean_token_accuracy": 0.766946941614151, |
| "num_tokens": 15790126.0, |
| "step": 211 |
| }, |
| { |
| "entropy": 0.5763002783060074, |
| "epoch": 5.3, |
| "grad_norm": 0.030634434893727303, |
| "learning_rate": 0.0002, |
| "loss": 0.5704483985900879, |
| "mean_token_accuracy": 0.7682492583990097, |
| "num_tokens": 15865049.0, |
| "step": 212 |
| }, |
| { |
| "entropy": 0.5761642754077911, |
| "epoch": 5.325, |
| "grad_norm": 0.02550283446907997, |
| "learning_rate": 0.0002, |
| "loss": 0.5767782926559448, |
| "mean_token_accuracy": 0.7672466337680817, |
| "num_tokens": 15939883.0, |
| "step": 213 |
| }, |
| { |
| "entropy": 0.5759230703115463, |
| "epoch": 5.35, |
| "grad_norm": 0.03148680552840233, |
| "learning_rate": 0.0002, |
| "loss": 0.5800034999847412, |
| "mean_token_accuracy": 0.7658649682998657, |
| "num_tokens": 16014297.0, |
| "step": 214 |
| }, |
| { |
| "entropy": 0.5732310563325882, |
| "epoch": 5.375, |
| "grad_norm": 0.03305201232433319, |
| "learning_rate": 0.0002, |
| "loss": 0.5733552575111389, |
| "mean_token_accuracy": 0.7683651447296143, |
| "num_tokens": 16089131.0, |
| "step": 215 |
| }, |
| { |
| "entropy": 0.5785533636808395, |
| "epoch": 5.4, |
| "grad_norm": 0.024717051535844803, |
| "learning_rate": 0.0002, |
| "loss": 0.5784015655517578, |
| "mean_token_accuracy": 0.7660104632377625, |
| "num_tokens": 16164324.0, |
| "step": 216 |
| }, |
| { |
| "entropy": 0.5687698572874069, |
| "epoch": 5.425, |
| "grad_norm": 0.029457444325089455, |
| "learning_rate": 0.0002, |
| "loss": 0.5750937461853027, |
| "mean_token_accuracy": 0.7675990760326385, |
| "num_tokens": 16239760.0, |
| "step": 217 |
| }, |
| { |
| "entropy": 0.5556502044200897, |
| "epoch": 5.45, |
| "grad_norm": 0.02100587822496891, |
| "learning_rate": 0.0002, |
| "loss": 0.555939793586731, |
| "mean_token_accuracy": 0.7752721607685089, |
| "num_tokens": 16313638.0, |
| "step": 218 |
| }, |
| { |
| "entropy": 0.5768693834543228, |
| "epoch": 5.475, |
| "grad_norm": 0.02610902115702629, |
| "learning_rate": 0.0002, |
| "loss": 0.5738801956176758, |
| "mean_token_accuracy": 0.7672755122184753, |
| "num_tokens": 16388910.0, |
| "step": 219 |
| }, |
| { |
| "entropy": 0.5702448487281799, |
| "epoch": 5.5, |
| "grad_norm": 0.023769576102495193, |
| "learning_rate": 0.0002, |
| "loss": 0.5661875009536743, |
| "mean_token_accuracy": 0.771107405424118, |
| "num_tokens": 16464093.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.564674437046051, |
| "epoch": 5.525, |
| "grad_norm": 0.031206723302602768, |
| "learning_rate": 0.0002, |
| "loss": 0.5673574805259705, |
| "mean_token_accuracy": 0.7696384638547897, |
| "num_tokens": 16538659.0, |
| "step": 221 |
| }, |
| { |
| "entropy": 0.5666674822568893, |
| "epoch": 5.55, |
| "grad_norm": 0.028113245964050293, |
| "learning_rate": 0.0002, |
| "loss": 0.5711595416069031, |
| "mean_token_accuracy": 0.770385280251503, |
| "num_tokens": 16612944.0, |
| "step": 222 |
| }, |
| { |
| "entropy": 0.5764763206243515, |
| "epoch": 5.575, |
| "grad_norm": 0.02818591520190239, |
| "learning_rate": 0.0002, |
| "loss": 0.5757144689559937, |
| "mean_token_accuracy": 0.7672824114561081, |
| "num_tokens": 16688054.0, |
| "step": 223 |
| }, |
| { |
| "entropy": 0.5728043168783188, |
| "epoch": 5.6, |
| "grad_norm": 0.026192322373390198, |
| "learning_rate": 0.0002, |
| "loss": 0.5717101693153381, |
| "mean_token_accuracy": 0.768505647778511, |
| "num_tokens": 16763085.0, |
| "step": 224 |
| }, |
| { |
| "entropy": 0.5653840452432632, |
| "epoch": 5.625, |
| "grad_norm": 0.02572912909090519, |
| "learning_rate": 0.0002, |
| "loss": 0.5642470717430115, |
| "mean_token_accuracy": 0.7718316316604614, |
| "num_tokens": 16837841.0, |
| "step": 225 |
| }, |
| { |
| "entropy": 0.5668259114027023, |
| "epoch": 5.65, |
| "grad_norm": 0.025471486151218414, |
| "learning_rate": 0.0002, |
| "loss": 0.564730167388916, |
| "mean_token_accuracy": 0.771491751074791, |
| "num_tokens": 16912424.0, |
| "step": 226 |
| }, |
| { |
| "entropy": 0.5578023791313171, |
| "epoch": 5.675, |
| "grad_norm": 0.029479067772626877, |
| "learning_rate": 0.0002, |
| "loss": 0.5636255741119385, |
| "mean_token_accuracy": 0.7711348384618759, |
| "num_tokens": 16986843.0, |
| "step": 227 |
| }, |
| { |
| "entropy": 0.5757892429828644, |
| "epoch": 5.7, |
| "grad_norm": 0.026731031015515327, |
| "learning_rate": 0.0002, |
| "loss": 0.5748844742774963, |
| "mean_token_accuracy": 0.7673381417989731, |
| "num_tokens": 17061592.0, |
| "step": 228 |
| }, |
| { |
| "entropy": 0.5692086964845657, |
| "epoch": 5.725, |
| "grad_norm": 0.02727457694709301, |
| "learning_rate": 0.0002, |
| "loss": 0.5636672973632812, |
| "mean_token_accuracy": 0.7722706943750381, |
| "num_tokens": 17136842.0, |
| "step": 229 |
| }, |
| { |
| "entropy": 0.570942759513855, |
| "epoch": 5.75, |
| "grad_norm": 0.02676619589328766, |
| "learning_rate": 0.0002, |
| "loss": 0.5672682523727417, |
| "mean_token_accuracy": 0.7702435255050659, |
| "num_tokens": 17211443.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.5606240034103394, |
| "epoch": 5.775, |
| "grad_norm": 0.028618959710001945, |
| "learning_rate": 0.0002, |
| "loss": 0.5684143304824829, |
| "mean_token_accuracy": 0.7702829986810684, |
| "num_tokens": 17286970.0, |
| "step": 231 |
| }, |
| { |
| "entropy": 0.569316640496254, |
| "epoch": 5.8, |
| "grad_norm": 0.027750151231884956, |
| "learning_rate": 0.0002, |
| "loss": 0.574034571647644, |
| "mean_token_accuracy": 0.7676869779825211, |
| "num_tokens": 17361772.0, |
| "step": 232 |
| }, |
| { |
| "entropy": 0.576723724603653, |
| "epoch": 5.825, |
| "grad_norm": 0.02459871955215931, |
| "learning_rate": 0.0002, |
| "loss": 0.5745028853416443, |
| "mean_token_accuracy": 0.7675033956766129, |
| "num_tokens": 17436257.0, |
| "step": 233 |
| }, |
| { |
| "entropy": 0.5697972923517227, |
| "epoch": 5.85, |
| "grad_norm": 0.02738168090581894, |
| "learning_rate": 0.0002, |
| "loss": 0.5621964931488037, |
| "mean_token_accuracy": 0.7732144594192505, |
| "num_tokens": 17510677.0, |
| "step": 234 |
| }, |
| { |
| "entropy": 0.5714251548051834, |
| "epoch": 5.875, |
| "grad_norm": 0.022376077249646187, |
| "learning_rate": 0.0002, |
| "loss": 0.5706051588058472, |
| "mean_token_accuracy": 0.7693936377763748, |
| "num_tokens": 17585616.0, |
| "step": 235 |
| }, |
| { |
| "entropy": 0.5623523741960526, |
| "epoch": 5.9, |
| "grad_norm": 0.029145779088139534, |
| "learning_rate": 0.0002, |
| "loss": 0.5659171342849731, |
| "mean_token_accuracy": 0.7712376862764359, |
| "num_tokens": 17660019.0, |
| "step": 236 |
| }, |
| { |
| "entropy": 0.5657843053340912, |
| "epoch": 5.925, |
| "grad_norm": 0.024399209767580032, |
| "learning_rate": 0.0002, |
| "loss": 0.5663224458694458, |
| "mean_token_accuracy": 0.7708786725997925, |
| "num_tokens": 17735131.0, |
| "step": 237 |
| }, |
| { |
| "entropy": 0.5680066645145416, |
| "epoch": 5.95, |
| "grad_norm": 0.027334652841091156, |
| "learning_rate": 0.0002, |
| "loss": 0.5661309957504272, |
| "mean_token_accuracy": 0.7711464017629623, |
| "num_tokens": 17809928.0, |
| "step": 238 |
| }, |
| { |
| "entropy": 0.5661123096942902, |
| "epoch": 5.975, |
| "grad_norm": 0.02591884881258011, |
| "learning_rate": 0.0002, |
| "loss": 0.5688766837120056, |
| "mean_token_accuracy": 0.7700952738523483, |
| "num_tokens": 17884292.0, |
| "step": 239 |
| }, |
| { |
| "entropy": 0.5738363265991211, |
| "epoch": 6.0, |
| "grad_norm": 0.023802831768989563, |
| "learning_rate": 0.0002, |
| "loss": 0.5767297744750977, |
| "mean_token_accuracy": 0.7682788968086243, |
| "num_tokens": 17959306.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.5705175548791885, |
| "epoch": 6.025, |
| "grad_norm": 0.026808038353919983, |
| "learning_rate": 0.0002, |
| "loss": 0.5646485090255737, |
| "mean_token_accuracy": 0.771740049123764, |
| "num_tokens": 18033987.0, |
| "step": 241 |
| }, |
| { |
| "entropy": 0.568175345659256, |
| "epoch": 6.05, |
| "grad_norm": 0.026018792763352394, |
| "learning_rate": 0.0002, |
| "loss": 0.5654234290122986, |
| "mean_token_accuracy": 0.7715927213430405, |
| "num_tokens": 18108533.0, |
| "step": 242 |
| }, |
| { |
| "entropy": 0.5620162785053253, |
| "epoch": 6.075, |
| "grad_norm": 0.03238891437649727, |
| "learning_rate": 0.0002, |
| "loss": 0.5632866024971008, |
| "mean_token_accuracy": 0.7722341269254684, |
| "num_tokens": 18183188.0, |
| "step": 243 |
| }, |
| { |
| "entropy": 0.5663654953241348, |
| "epoch": 6.1, |
| "grad_norm": 0.04267890378832817, |
| "learning_rate": 0.0002, |
| "loss": 0.5707510709762573, |
| "mean_token_accuracy": 0.7683595418930054, |
| "num_tokens": 18257193.0, |
| "step": 244 |
| }, |
| { |
| "entropy": 0.5661468356847763, |
| "epoch": 6.125, |
| "grad_norm": 0.023024071007966995, |
| "learning_rate": 0.0002, |
| "loss": 0.5620009899139404, |
| "mean_token_accuracy": 0.7728223353624344, |
| "num_tokens": 18331475.0, |
| "step": 245 |
| }, |
| { |
| "entropy": 0.5726824253797531, |
| "epoch": 6.15, |
| "grad_norm": 0.03274550661444664, |
| "learning_rate": 0.0002, |
| "loss": 0.5699936151504517, |
| "mean_token_accuracy": 0.7683205753564835, |
| "num_tokens": 18406469.0, |
| "step": 246 |
| }, |
| { |
| "entropy": 0.553859069943428, |
| "epoch": 6.175, |
| "grad_norm": 0.025100160390138626, |
| "learning_rate": 0.0002, |
| "loss": 0.5552276968955994, |
| "mean_token_accuracy": 0.774128720164299, |
| "num_tokens": 18481392.0, |
| "step": 247 |
| }, |
| { |
| "entropy": 0.5592118203639984, |
| "epoch": 6.2, |
| "grad_norm": 0.030672013759613037, |
| "learning_rate": 0.0002, |
| "loss": 0.5638296604156494, |
| "mean_token_accuracy": 0.7705448269844055, |
| "num_tokens": 18557082.0, |
| "step": 248 |
| }, |
| { |
| "entropy": 0.5587449073791504, |
| "epoch": 6.225, |
| "grad_norm": 0.02617192640900612, |
| "learning_rate": 0.0002, |
| "loss": 0.5589370727539062, |
| "mean_token_accuracy": 0.7735968083143234, |
| "num_tokens": 18632396.0, |
| "step": 249 |
| }, |
| { |
| "entropy": 0.570429340004921, |
| "epoch": 6.25, |
| "grad_norm": 0.026497265323996544, |
| "learning_rate": 0.0002, |
| "loss": 0.5678025484085083, |
| "mean_token_accuracy": 0.7696103155612946, |
| "num_tokens": 18706761.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.5758352428674698, |
| "epoch": 6.275, |
| "grad_norm": 0.03510003909468651, |
| "learning_rate": 0.0002, |
| "loss": 0.5702039003372192, |
| "mean_token_accuracy": 0.7697926163673401, |
| "num_tokens": 18781835.0, |
| "step": 251 |
| }, |
| { |
| "entropy": 0.5599015653133392, |
| "epoch": 6.3, |
| "grad_norm": 0.026413707062602043, |
| "learning_rate": 0.0002, |
| "loss": 0.5648400783538818, |
| "mean_token_accuracy": 0.7707231491804123, |
| "num_tokens": 18856780.0, |
| "step": 252 |
| }, |
| { |
| "entropy": 0.5588082820177078, |
| "epoch": 6.325, |
| "grad_norm": 0.03752964362502098, |
| "learning_rate": 0.0002, |
| "loss": 0.5637919902801514, |
| "mean_token_accuracy": 0.7720127999782562, |
| "num_tokens": 18931882.0, |
| "step": 253 |
| }, |
| { |
| "entropy": 0.5763219594955444, |
| "epoch": 6.35, |
| "grad_norm": 0.027257010340690613, |
| "learning_rate": 0.0002, |
| "loss": 0.5744665861129761, |
| "mean_token_accuracy": 0.7678072452545166, |
| "num_tokens": 19006654.0, |
| "step": 254 |
| }, |
| { |
| "entropy": 0.5638702213764191, |
| "epoch": 6.375, |
| "grad_norm": 0.03087831847369671, |
| "learning_rate": 0.0002, |
| "loss": 0.5590026378631592, |
| "mean_token_accuracy": 0.7745161801576614, |
| "num_tokens": 19081712.0, |
| "step": 255 |
| }, |
| { |
| "entropy": 0.5763344466686249, |
| "epoch": 6.4, |
| "grad_norm": 0.026007242500782013, |
| "learning_rate": 0.0002, |
| "loss": 0.5743853449821472, |
| "mean_token_accuracy": 0.767315685749054, |
| "num_tokens": 19156683.0, |
| "step": 256 |
| }, |
| { |
| "entropy": 0.5625879168510437, |
| "epoch": 6.425, |
| "grad_norm": 0.02871275693178177, |
| "learning_rate": 0.0002, |
| "loss": 0.5631023645401001, |
| "mean_token_accuracy": 0.7718234807252884, |
| "num_tokens": 19231669.0, |
| "step": 257 |
| }, |
| { |
| "entropy": 0.5742108523845673, |
| "epoch": 6.45, |
| "grad_norm": 0.029883647337555885, |
| "learning_rate": 0.0002, |
| "loss": 0.576926052570343, |
| "mean_token_accuracy": 0.7668623924255371, |
| "num_tokens": 19306142.0, |
| "step": 258 |
| }, |
| { |
| "entropy": 0.557953953742981, |
| "epoch": 6.475, |
| "grad_norm": 0.03357018902897835, |
| "learning_rate": 0.0002, |
| "loss": 0.5605831146240234, |
| "mean_token_accuracy": 0.7726317644119263, |
| "num_tokens": 19380351.0, |
| "step": 259 |
| }, |
| { |
| "entropy": 0.5633054375648499, |
| "epoch": 6.5, |
| "grad_norm": 0.028555380180478096, |
| "learning_rate": 0.0002, |
| "loss": 0.5600845813751221, |
| "mean_token_accuracy": 0.7734545171260834, |
| "num_tokens": 19454607.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.5629763156175613, |
| "epoch": 6.525, |
| "grad_norm": 0.027474038302898407, |
| "learning_rate": 0.0002, |
| "loss": 0.5615619421005249, |
| "mean_token_accuracy": 0.7738883346319199, |
| "num_tokens": 19529836.0, |
| "step": 261 |
| }, |
| { |
| "entropy": 0.5723123848438263, |
| "epoch": 6.55, |
| "grad_norm": 0.030043484643101692, |
| "learning_rate": 0.0002, |
| "loss": 0.5767689943313599, |
| "mean_token_accuracy": 0.7650009542703629, |
| "num_tokens": 19604672.0, |
| "step": 262 |
| }, |
| { |
| "entropy": 0.5594469308853149, |
| "epoch": 6.575, |
| "grad_norm": 0.027517110109329224, |
| "learning_rate": 0.0002, |
| "loss": 0.5633723735809326, |
| "mean_token_accuracy": 0.7720958739519119, |
| "num_tokens": 19678879.0, |
| "step": 263 |
| }, |
| { |
| "entropy": 0.5676506012678146, |
| "epoch": 6.6, |
| "grad_norm": 0.03375779092311859, |
| "learning_rate": 0.0002, |
| "loss": 0.5649895668029785, |
| "mean_token_accuracy": 0.7705852091312408, |
| "num_tokens": 19753964.0, |
| "step": 264 |
| }, |
| { |
| "entropy": 0.5738198310136795, |
| "epoch": 6.625, |
| "grad_norm": 0.026767941191792488, |
| "learning_rate": 0.0002, |
| "loss": 0.5693171620368958, |
| "mean_token_accuracy": 0.7693505436182022, |
| "num_tokens": 19829009.0, |
| "step": 265 |
| }, |
| { |
| "entropy": 0.5671893358230591, |
| "epoch": 6.65, |
| "grad_norm": 0.033948201686143875, |
| "learning_rate": 0.0002, |
| "loss": 0.5679397583007812, |
| "mean_token_accuracy": 0.7691246271133423, |
| "num_tokens": 19904354.0, |
| "step": 266 |
| }, |
| { |
| "entropy": 0.5724634379148483, |
| "epoch": 6.675, |
| "grad_norm": 0.027929022908210754, |
| "learning_rate": 0.0002, |
| "loss": 0.5724775791168213, |
| "mean_token_accuracy": 0.7685766965150833, |
| "num_tokens": 19979485.0, |
| "step": 267 |
| }, |
| { |
| "entropy": 0.5530816316604614, |
| "epoch": 6.7, |
| "grad_norm": 0.02936733327805996, |
| "learning_rate": 0.0002, |
| "loss": 0.5522775053977966, |
| "mean_token_accuracy": 0.7771619409322739, |
| "num_tokens": 20055306.0, |
| "step": 268 |
| }, |
| { |
| "entropy": 0.5592961460351944, |
| "epoch": 6.725, |
| "grad_norm": 0.033846575766801834, |
| "learning_rate": 0.0002, |
| "loss": 0.5621505975723267, |
| "mean_token_accuracy": 0.7730831801891327, |
| "num_tokens": 20129856.0, |
| "step": 269 |
| }, |
| { |
| "entropy": 0.5651666820049286, |
| "epoch": 6.75, |
| "grad_norm": 0.025500988587737083, |
| "learning_rate": 0.0002, |
| "loss": 0.5668225288391113, |
| "mean_token_accuracy": 0.7706255167722702, |
| "num_tokens": 20204683.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.5640353113412857, |
| "epoch": 6.775, |
| "grad_norm": 0.033350858837366104, |
| "learning_rate": 0.0002, |
| "loss": 0.5606875419616699, |
| "mean_token_accuracy": 0.7722483277320862, |
| "num_tokens": 20280039.0, |
| "step": 271 |
| }, |
| { |
| "entropy": 0.5725362002849579, |
| "epoch": 6.8, |
| "grad_norm": 0.03152982145547867, |
| "learning_rate": 0.0002, |
| "loss": 0.5739132165908813, |
| "mean_token_accuracy": 0.767605796456337, |
| "num_tokens": 20354788.0, |
| "step": 272 |
| }, |
| { |
| "entropy": 0.5656009018421173, |
| "epoch": 6.825, |
| "grad_norm": 0.03156192600727081, |
| "learning_rate": 0.0002, |
| "loss": 0.5675251483917236, |
| "mean_token_accuracy": 0.7705205380916595, |
| "num_tokens": 20429122.0, |
| "step": 273 |
| }, |
| { |
| "entropy": 0.5605581253767014, |
| "epoch": 6.85, |
| "grad_norm": 0.03891259804368019, |
| "learning_rate": 0.0002, |
| "loss": 0.5642046928405762, |
| "mean_token_accuracy": 0.7717309892177582, |
| "num_tokens": 20503457.0, |
| "step": 274 |
| }, |
| { |
| "entropy": 0.5743284076452255, |
| "epoch": 6.875, |
| "grad_norm": 0.026666074991226196, |
| "learning_rate": 0.0002, |
| "loss": 0.5709526538848877, |
| "mean_token_accuracy": 0.7691267430782318, |
| "num_tokens": 20578654.0, |
| "step": 275 |
| }, |
| { |
| "entropy": 0.5633113235235214, |
| "epoch": 6.9, |
| "grad_norm": 0.03862672671675682, |
| "learning_rate": 0.0002, |
| "loss": 0.5625099539756775, |
| "mean_token_accuracy": 0.7733415812253952, |
| "num_tokens": 20653724.0, |
| "step": 276 |
| }, |
| { |
| "entropy": 0.5552873611450195, |
| "epoch": 6.925, |
| "grad_norm": 0.02755405753850937, |
| "learning_rate": 0.0002, |
| "loss": 0.555654764175415, |
| "mean_token_accuracy": 0.7761149406433105, |
| "num_tokens": 20728186.0, |
| "step": 277 |
| }, |
| { |
| "entropy": 0.5622376352548599, |
| "epoch": 6.95, |
| "grad_norm": 0.038842860609292984, |
| "learning_rate": 0.0002, |
| "loss": 0.5644208192825317, |
| "mean_token_accuracy": 0.7724904865026474, |
| "num_tokens": 20803838.0, |
| "step": 278 |
| }, |
| { |
| "entropy": 0.5590371191501617, |
| "epoch": 6.975, |
| "grad_norm": 0.03130970522761345, |
| "learning_rate": 0.0002, |
| "loss": 0.5607836246490479, |
| "mean_token_accuracy": 0.7744860798120499, |
| "num_tokens": 20878758.0, |
| "step": 279 |
| }, |
| { |
| "entropy": 0.5657824128866196, |
| "epoch": 7.0, |
| "grad_norm": 0.03451741114258766, |
| "learning_rate": 0.0002, |
| "loss": 0.5687781572341919, |
| "mean_token_accuracy": 0.7700928151607513, |
| "num_tokens": 20952402.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.5669686943292618, |
| "epoch": 7.025, |
| "grad_norm": 0.033809054642915726, |
| "learning_rate": 0.0002, |
| "loss": 0.555560827255249, |
| "mean_token_accuracy": 0.7754446268081665, |
| "num_tokens": 21027343.0, |
| "step": 281 |
| }, |
| { |
| "entropy": 0.5571761578321457, |
| "epoch": 7.05, |
| "grad_norm": 0.02909841760993004, |
| "learning_rate": 0.0002, |
| "loss": 0.5523079037666321, |
| "mean_token_accuracy": 0.7767569869756699, |
| "num_tokens": 21101355.0, |
| "step": 282 |
| }, |
| { |
| "entropy": 0.5467852652072906, |
| "epoch": 7.075, |
| "grad_norm": 0.03742019459605217, |
| "learning_rate": 0.0002, |
| "loss": 0.5528299808502197, |
| "mean_token_accuracy": 0.7753051668405533, |
| "num_tokens": 21175670.0, |
| "step": 283 |
| }, |
| { |
| "entropy": 0.5646944791078568, |
| "epoch": 7.1, |
| "grad_norm": 0.029561564326286316, |
| "learning_rate": 0.0002, |
| "loss": 0.5654648542404175, |
| "mean_token_accuracy": 0.7715721130371094, |
| "num_tokens": 21250590.0, |
| "step": 284 |
| }, |
| { |
| "entropy": 0.5694975554943085, |
| "epoch": 7.125, |
| "grad_norm": 0.043832119554281235, |
| "learning_rate": 0.0002, |
| "loss": 0.563732385635376, |
| "mean_token_accuracy": 0.771723747253418, |
| "num_tokens": 21325721.0, |
| "step": 285 |
| }, |
| { |
| "entropy": 0.5660099983215332, |
| "epoch": 7.15, |
| "grad_norm": 0.03258618339896202, |
| "learning_rate": 0.0002, |
| "loss": 0.5601391792297363, |
| "mean_token_accuracy": 0.7732871919870377, |
| "num_tokens": 21400578.0, |
| "step": 286 |
| }, |
| { |
| "entropy": 0.5517364293336868, |
| "epoch": 7.175, |
| "grad_norm": 0.04530012607574463, |
| "learning_rate": 0.0002, |
| "loss": 0.558700680732727, |
| "mean_token_accuracy": 0.7721482962369919, |
| "num_tokens": 21475582.0, |
| "step": 287 |
| }, |
| { |
| "entropy": 0.5505019277334213, |
| "epoch": 7.2, |
| "grad_norm": 0.035087864845991135, |
| "learning_rate": 0.0002, |
| "loss": 0.5531398057937622, |
| "mean_token_accuracy": 0.7760081589221954, |
| "num_tokens": 21550821.0, |
| "step": 288 |
| }, |
| { |
| "entropy": 0.559594452381134, |
| "epoch": 7.225, |
| "grad_norm": 0.034394703805446625, |
| "learning_rate": 0.0002, |
| "loss": 0.5592218637466431, |
| "mean_token_accuracy": 0.773568719625473, |
| "num_tokens": 21625873.0, |
| "step": 289 |
| }, |
| { |
| "entropy": 0.5724920481443405, |
| "epoch": 7.25, |
| "grad_norm": 0.033760059624910355, |
| "learning_rate": 0.0002, |
| "loss": 0.5668227672576904, |
| "mean_token_accuracy": 0.7702212035655975, |
| "num_tokens": 21700602.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.56439508497715, |
| "epoch": 7.275, |
| "grad_norm": 0.03572908788919449, |
| "learning_rate": 0.0002, |
| "loss": 0.5633417367935181, |
| "mean_token_accuracy": 0.7723740786314011, |
| "num_tokens": 21774875.0, |
| "step": 291 |
| }, |
| { |
| "entropy": 0.548421323299408, |
| "epoch": 7.3, |
| "grad_norm": 0.04545460268855095, |
| "learning_rate": 0.0002, |
| "loss": 0.5489292144775391, |
| "mean_token_accuracy": 0.7780417054891586, |
| "num_tokens": 21849501.0, |
| "step": 292 |
| }, |
| { |
| "entropy": 0.5536051839590073, |
| "epoch": 7.325, |
| "grad_norm": 0.03099142387509346, |
| "learning_rate": 0.0002, |
| "loss": 0.5557237863540649, |
| "mean_token_accuracy": 0.7745026648044586, |
| "num_tokens": 21925199.0, |
| "step": 293 |
| }, |
| { |
| "entropy": 0.5548270344734192, |
| "epoch": 7.35, |
| "grad_norm": 0.04060740023851395, |
| "learning_rate": 0.0002, |
| "loss": 0.5544718503952026, |
| "mean_token_accuracy": 0.7760574668645859, |
| "num_tokens": 22000154.0, |
| "step": 294 |
| }, |
| { |
| "entropy": 0.5629658997058868, |
| "epoch": 7.375, |
| "grad_norm": 0.03493206575512886, |
| "learning_rate": 0.0002, |
| "loss": 0.557624101638794, |
| "mean_token_accuracy": 0.7730877846479416, |
| "num_tokens": 22074584.0, |
| "step": 295 |
| }, |
| { |
| "entropy": 0.5578918755054474, |
| "epoch": 7.4, |
| "grad_norm": 0.037077102810144424, |
| "learning_rate": 0.0002, |
| "loss": 0.5596894025802612, |
| "mean_token_accuracy": 0.7733047008514404, |
| "num_tokens": 22149636.0, |
| "step": 296 |
| }, |
| { |
| "entropy": 0.5517973154783249, |
| "epoch": 7.425, |
| "grad_norm": 0.03832925483584404, |
| "learning_rate": 0.0002, |
| "loss": 0.5568417906761169, |
| "mean_token_accuracy": 0.7738584578037262, |
| "num_tokens": 22224466.0, |
| "step": 297 |
| }, |
| { |
| "entropy": 0.5560635775327682, |
| "epoch": 7.45, |
| "grad_norm": 0.02942826971411705, |
| "learning_rate": 0.0002, |
| "loss": 0.5580307841300964, |
| "mean_token_accuracy": 0.7735044956207275, |
| "num_tokens": 22299483.0, |
| "step": 298 |
| }, |
| { |
| "entropy": 0.5802666395902634, |
| "epoch": 7.475, |
| "grad_norm": 0.038540106266736984, |
| "learning_rate": 0.0002, |
| "loss": 0.5760456323623657, |
| "mean_token_accuracy": 0.7662649601697922, |
| "num_tokens": 22374870.0, |
| "step": 299 |
| }, |
| { |
| "entropy": 0.5679387599229813, |
| "epoch": 7.5, |
| "grad_norm": 0.029141677543520927, |
| "learning_rate": 0.0002, |
| "loss": 0.564399778842926, |
| "mean_token_accuracy": 0.7710927873849869, |
| "num_tokens": 22449693.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.5547898411750793, |
| "epoch": 7.525, |
| "grad_norm": 0.02980385534465313, |
| "learning_rate": 0.0002, |
| "loss": 0.5558938980102539, |
| "mean_token_accuracy": 0.7764756679534912, |
| "num_tokens": 22524828.0, |
| "step": 301 |
| }, |
| { |
| "entropy": 0.5566362589597702, |
| "epoch": 7.55, |
| "grad_norm": 0.036666952073574066, |
| "learning_rate": 0.0002, |
| "loss": 0.5618187785148621, |
| "mean_token_accuracy": 0.7723874747753143, |
| "num_tokens": 22599762.0, |
| "step": 302 |
| }, |
| { |
| "entropy": 0.5653972774744034, |
| "epoch": 7.575, |
| "grad_norm": 0.035354770720005035, |
| "learning_rate": 0.0002, |
| "loss": 0.5642956495285034, |
| "mean_token_accuracy": 0.7708256095647812, |
| "num_tokens": 22674686.0, |
| "step": 303 |
| }, |
| { |
| "entropy": 0.5595411062240601, |
| "epoch": 7.6, |
| "grad_norm": 0.03489721938967705, |
| "learning_rate": 0.0002, |
| "loss": 0.5577263832092285, |
| "mean_token_accuracy": 0.7742740511894226, |
| "num_tokens": 22749009.0, |
| "step": 304 |
| }, |
| { |
| "entropy": 0.5515602380037308, |
| "epoch": 7.625, |
| "grad_norm": 0.032466236501932144, |
| "learning_rate": 0.0002, |
| "loss": 0.5543727874755859, |
| "mean_token_accuracy": 0.7750429511070251, |
| "num_tokens": 22822984.0, |
| "step": 305 |
| }, |
| { |
| "entropy": 0.567086473107338, |
| "epoch": 7.65, |
| "grad_norm": 0.037166330963373184, |
| "learning_rate": 0.0002, |
| "loss": 0.5712989568710327, |
| "mean_token_accuracy": 0.768398255109787, |
| "num_tokens": 22897628.0, |
| "step": 306 |
| }, |
| { |
| "entropy": 0.570941150188446, |
| "epoch": 7.675, |
| "grad_norm": 0.03177010267972946, |
| "learning_rate": 0.0002, |
| "loss": 0.5668294429779053, |
| "mean_token_accuracy": 0.7703831493854523, |
| "num_tokens": 22972657.0, |
| "step": 307 |
| }, |
| { |
| "entropy": 0.5639981329441071, |
| "epoch": 7.7, |
| "grad_norm": 0.034557901322841644, |
| "learning_rate": 0.0002, |
| "loss": 0.5623838901519775, |
| "mean_token_accuracy": 0.7735652476549149, |
| "num_tokens": 23047874.0, |
| "step": 308 |
| }, |
| { |
| "entropy": 0.5679261088371277, |
| "epoch": 7.725, |
| "grad_norm": 0.028234630823135376, |
| "learning_rate": 0.0002, |
| "loss": 0.5686887502670288, |
| "mean_token_accuracy": 0.7698494493961334, |
| "num_tokens": 23122211.0, |
| "step": 309 |
| }, |
| { |
| "entropy": 0.5655084848403931, |
| "epoch": 7.75, |
| "grad_norm": 0.03173128515481949, |
| "learning_rate": 0.0002, |
| "loss": 0.5650713443756104, |
| "mean_token_accuracy": 0.7710306495428085, |
| "num_tokens": 23197631.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.5513352751731873, |
| "epoch": 7.775, |
| "grad_norm": 0.030083199962973595, |
| "learning_rate": 0.0002, |
| "loss": 0.5536549687385559, |
| "mean_token_accuracy": 0.7755338102579117, |
| "num_tokens": 23272548.0, |
| "step": 311 |
| }, |
| { |
| "entropy": 0.5555614978075027, |
| "epoch": 7.8, |
| "grad_norm": 0.034044049680233, |
| "learning_rate": 0.0002, |
| "loss": 0.5587220788002014, |
| "mean_token_accuracy": 0.7734567075967789, |
| "num_tokens": 23347302.0, |
| "step": 312 |
| }, |
| { |
| "entropy": 0.5663987696170807, |
| "epoch": 7.825, |
| "grad_norm": 0.03760316222906113, |
| "learning_rate": 0.0002, |
| "loss": 0.5659680366516113, |
| "mean_token_accuracy": 0.7704599052667618, |
| "num_tokens": 23422273.0, |
| "step": 313 |
| }, |
| { |
| "entropy": 0.5548186749219894, |
| "epoch": 7.85, |
| "grad_norm": 0.03154882416129112, |
| "learning_rate": 0.0002, |
| "loss": 0.5523256659507751, |
| "mean_token_accuracy": 0.7759846299886703, |
| "num_tokens": 23497091.0, |
| "step": 314 |
| }, |
| { |
| "entropy": 0.5607025325298309, |
| "epoch": 7.875, |
| "grad_norm": 0.041530657559633255, |
| "learning_rate": 0.0002, |
| "loss": 0.5609087347984314, |
| "mean_token_accuracy": 0.772527739405632, |
| "num_tokens": 23571404.0, |
| "step": 315 |
| }, |
| { |
| "entropy": 0.549463763833046, |
| "epoch": 7.9, |
| "grad_norm": 0.030178798362612724, |
| "learning_rate": 0.0002, |
| "loss": 0.5497753620147705, |
| "mean_token_accuracy": 0.7761662304401398, |
| "num_tokens": 23646640.0, |
| "step": 316 |
| }, |
| { |
| "entropy": 0.5618870705366135, |
| "epoch": 7.925, |
| "grad_norm": 0.04211151972413063, |
| "learning_rate": 0.0002, |
| "loss": 0.5642685294151306, |
| "mean_token_accuracy": 0.7714849263429642, |
| "num_tokens": 23720939.0, |
| "step": 317 |
| }, |
| { |
| "entropy": 0.570905327796936, |
| "epoch": 7.95, |
| "grad_norm": 0.026979681104421616, |
| "learning_rate": 0.0002, |
| "loss": 0.569770336151123, |
| "mean_token_accuracy": 0.7690195441246033, |
| "num_tokens": 23795221.0, |
| "step": 318 |
| }, |
| { |
| "entropy": 0.5613491535186768, |
| "epoch": 7.975, |
| "grad_norm": 0.04255770891904831, |
| "learning_rate": 0.0002, |
| "loss": 0.559675931930542, |
| "mean_token_accuracy": 0.7733322232961655, |
| "num_tokens": 23871279.0, |
| "step": 319 |
| }, |
| { |
| "entropy": 0.5522212386131287, |
| "epoch": 8.0, |
| "grad_norm": 0.032483723014593124, |
| "learning_rate": 0.0002, |
| "loss": 0.5494213104248047, |
| "mean_token_accuracy": 0.777331531047821, |
| "num_tokens": 23945592.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.5492678731679916, |
| "epoch": 8.025, |
| "grad_norm": 0.04212978109717369, |
| "learning_rate": 0.0002, |
| "loss": 0.5434910655021667, |
| "mean_token_accuracy": 0.7794834822416306, |
| "num_tokens": 24020279.0, |
| "step": 321 |
| }, |
| { |
| "entropy": 0.5525006651878357, |
| "epoch": 8.05, |
| "grad_norm": 0.04567183181643486, |
| "learning_rate": 0.0002, |
| "loss": 0.5581562519073486, |
| "mean_token_accuracy": 0.7742817401885986, |
| "num_tokens": 24095378.0, |
| "step": 322 |
| }, |
| { |
| "entropy": 0.5501823425292969, |
| "epoch": 8.075, |
| "grad_norm": 0.04195858910679817, |
| "learning_rate": 0.0002, |
| "loss": 0.5543116331100464, |
| "mean_token_accuracy": 0.7745549827814102, |
| "num_tokens": 24169761.0, |
| "step": 323 |
| }, |
| { |
| "entropy": 0.5598976612091064, |
| "epoch": 8.1, |
| "grad_norm": 0.041518036276102066, |
| "learning_rate": 0.0002, |
| "loss": 0.5529639720916748, |
| "mean_token_accuracy": 0.7760216742753983, |
| "num_tokens": 24244551.0, |
| "step": 324 |
| }, |
| { |
| "entropy": 0.5517577975988388, |
| "epoch": 8.125, |
| "grad_norm": 0.04003611207008362, |
| "learning_rate": 0.0002, |
| "loss": 0.547474205493927, |
| "mean_token_accuracy": 0.7773387134075165, |
| "num_tokens": 24319348.0, |
| "step": 325 |
| }, |
| { |
| "entropy": 0.5453294217586517, |
| "epoch": 8.15, |
| "grad_norm": 0.04722796007990837, |
| "learning_rate": 0.0002, |
| "loss": 0.5527043342590332, |
| "mean_token_accuracy": 0.7764124572277069, |
| "num_tokens": 24394923.0, |
| "step": 326 |
| }, |
| { |
| "entropy": 0.5439276248216629, |
| "epoch": 8.175, |
| "grad_norm": 0.050235629081726074, |
| "learning_rate": 0.0002, |
| "loss": 0.5492441654205322, |
| "mean_token_accuracy": 0.7766183167695999, |
| "num_tokens": 24469884.0, |
| "step": 327 |
| }, |
| { |
| "entropy": 0.5524363815784454, |
| "epoch": 8.2, |
| "grad_norm": 0.05216272920370102, |
| "learning_rate": 0.0002, |
| "loss": 0.5485007762908936, |
| "mean_token_accuracy": 0.7773555964231491, |
| "num_tokens": 24544406.0, |
| "step": 328 |
| }, |
| { |
| "entropy": 0.5342830568552017, |
| "epoch": 8.225, |
| "grad_norm": 0.03883667290210724, |
| "learning_rate": 0.0002, |
| "loss": 0.5314654111862183, |
| "mean_token_accuracy": 0.7834321856498718, |
| "num_tokens": 24618651.0, |
| "step": 329 |
| }, |
| { |
| "entropy": 0.5648667514324188, |
| "epoch": 8.25, |
| "grad_norm": 0.05192190781235695, |
| "learning_rate": 0.0002, |
| "loss": 0.5654504299163818, |
| "mean_token_accuracy": 0.7713068872690201, |
| "num_tokens": 24692797.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.5633829087018967, |
| "epoch": 8.275, |
| "grad_norm": 0.061627499759197235, |
| "learning_rate": 0.0002, |
| "loss": 0.5625565052032471, |
| "mean_token_accuracy": 0.7720554023981094, |
| "num_tokens": 24767683.0, |
| "step": 331 |
| }, |
| { |
| "entropy": 0.5499114990234375, |
| "epoch": 8.3, |
| "grad_norm": 0.04107741639018059, |
| "learning_rate": 0.0002, |
| "loss": 0.548437237739563, |
| "mean_token_accuracy": 0.7773527503013611, |
| "num_tokens": 24843880.0, |
| "step": 332 |
| }, |
| { |
| "entropy": 0.5482420921325684, |
| "epoch": 8.325, |
| "grad_norm": 0.05437928065657616, |
| "learning_rate": 0.0002, |
| "loss": 0.5508846044540405, |
| "mean_token_accuracy": 0.776692345738411, |
| "num_tokens": 24918492.0, |
| "step": 333 |
| }, |
| { |
| "entropy": 0.5507365763187408, |
| "epoch": 8.35, |
| "grad_norm": 0.038063954561948776, |
| "learning_rate": 0.0002, |
| "loss": 0.5517404675483704, |
| "mean_token_accuracy": 0.7754537016153336, |
| "num_tokens": 24993390.0, |
| "step": 334 |
| }, |
| { |
| "entropy": 0.5636460483074188, |
| "epoch": 8.375, |
| "grad_norm": 0.044943079352378845, |
| "learning_rate": 0.0002, |
| "loss": 0.5621282458305359, |
| "mean_token_accuracy": 0.7718300223350525, |
| "num_tokens": 25067643.0, |
| "step": 335 |
| }, |
| { |
| "entropy": 0.5502006560564041, |
| "epoch": 8.4, |
| "grad_norm": 0.038005705922842026, |
| "learning_rate": 0.0002, |
| "loss": 0.5524753332138062, |
| "mean_token_accuracy": 0.7749505192041397, |
| "num_tokens": 25142656.0, |
| "step": 336 |
| }, |
| { |
| "entropy": 0.5396933555603027, |
| "epoch": 8.425, |
| "grad_norm": 0.04691820219159126, |
| "learning_rate": 0.0002, |
| "loss": 0.5444018840789795, |
| "mean_token_accuracy": 0.779260128736496, |
| "num_tokens": 25217599.0, |
| "step": 337 |
| }, |
| { |
| "entropy": 0.5635709166526794, |
| "epoch": 8.45, |
| "grad_norm": 0.046322260051965714, |
| "learning_rate": 0.0002, |
| "loss": 0.5574597120285034, |
| "mean_token_accuracy": 0.774439737200737, |
| "num_tokens": 25293321.0, |
| "step": 338 |
| }, |
| { |
| "entropy": 0.546722874045372, |
| "epoch": 8.475, |
| "grad_norm": 0.043173372745513916, |
| "learning_rate": 0.0002, |
| "loss": 0.5474981665611267, |
| "mean_token_accuracy": 0.7775770723819733, |
| "num_tokens": 25367884.0, |
| "step": 339 |
| }, |
| { |
| "entropy": 0.5400292277336121, |
| "epoch": 8.5, |
| "grad_norm": 0.05466064065694809, |
| "learning_rate": 0.0002, |
| "loss": 0.546923041343689, |
| "mean_token_accuracy": 0.7782380133867264, |
| "num_tokens": 25443693.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.5547144860029221, |
| "epoch": 8.525, |
| "grad_norm": 0.03708970546722412, |
| "learning_rate": 0.0002, |
| "loss": 0.5485285520553589, |
| "mean_token_accuracy": 0.777634859085083, |
| "num_tokens": 25519449.0, |
| "step": 341 |
| }, |
| { |
| "entropy": 0.5555647015571594, |
| "epoch": 8.55, |
| "grad_norm": 0.044979583472013474, |
| "learning_rate": 0.0002, |
| "loss": 0.5532581806182861, |
| "mean_token_accuracy": 0.7749562114477158, |
| "num_tokens": 25594565.0, |
| "step": 342 |
| }, |
| { |
| "entropy": 0.5622061938047409, |
| "epoch": 8.575, |
| "grad_norm": 0.037068452686071396, |
| "learning_rate": 0.0002, |
| "loss": 0.5619022846221924, |
| "mean_token_accuracy": 0.7722371071577072, |
| "num_tokens": 25669210.0, |
| "step": 343 |
| }, |
| { |
| "entropy": 0.5621164441108704, |
| "epoch": 8.6, |
| "grad_norm": 0.04099290445446968, |
| "learning_rate": 0.0002, |
| "loss": 0.5597378015518188, |
| "mean_token_accuracy": 0.7729932218790054, |
| "num_tokens": 25743928.0, |
| "step": 344 |
| }, |
| { |
| "entropy": 0.5559934675693512, |
| "epoch": 8.625, |
| "grad_norm": 0.034955333918333054, |
| "learning_rate": 0.0002, |
| "loss": 0.5548402070999146, |
| "mean_token_accuracy": 0.774565801024437, |
| "num_tokens": 25819235.0, |
| "step": 345 |
| }, |
| { |
| "entropy": 0.5608330816030502, |
| "epoch": 8.65, |
| "grad_norm": 0.032942138612270355, |
| "learning_rate": 0.0002, |
| "loss": 0.5593307614326477, |
| "mean_token_accuracy": 0.7738174945116043, |
| "num_tokens": 25894146.0, |
| "step": 346 |
| }, |
| { |
| "entropy": 0.5570491552352905, |
| "epoch": 8.675, |
| "grad_norm": 0.037585385143756866, |
| "learning_rate": 0.0002, |
| "loss": 0.5622788071632385, |
| "mean_token_accuracy": 0.7726104408502579, |
| "num_tokens": 25968950.0, |
| "step": 347 |
| }, |
| { |
| "entropy": 0.5602799952030182, |
| "epoch": 8.7, |
| "grad_norm": 0.036275461316108704, |
| "learning_rate": 0.0002, |
| "loss": 0.5584805011749268, |
| "mean_token_accuracy": 0.7735689133405685, |
| "num_tokens": 26044155.0, |
| "step": 348 |
| }, |
| { |
| "entropy": 0.5549326539039612, |
| "epoch": 8.725, |
| "grad_norm": 0.03921646997332573, |
| "learning_rate": 0.0002, |
| "loss": 0.5547788739204407, |
| "mean_token_accuracy": 0.774893268942833, |
| "num_tokens": 26118350.0, |
| "step": 349 |
| }, |
| { |
| "entropy": 0.562700167298317, |
| "epoch": 8.75, |
| "grad_norm": 0.037997711449861526, |
| "learning_rate": 0.0002, |
| "loss": 0.5609459280967712, |
| "mean_token_accuracy": 0.7732319533824921, |
| "num_tokens": 26193288.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5581104457378387, |
| "epoch": 8.775, |
| "grad_norm": 0.03644339367747307, |
| "learning_rate": 0.0002, |
| "loss": 0.5592728853225708, |
| "mean_token_accuracy": 0.772967129945755, |
| "num_tokens": 26267183.0, |
| "step": 351 |
| }, |
| { |
| "entropy": 0.5485663414001465, |
| "epoch": 8.8, |
| "grad_norm": 0.03490961715579033, |
| "learning_rate": 0.0002, |
| "loss": 0.5506186485290527, |
| "mean_token_accuracy": 0.7772383987903595, |
| "num_tokens": 26342404.0, |
| "step": 352 |
| }, |
| { |
| "entropy": 0.5463830828666687, |
| "epoch": 8.825, |
| "grad_norm": 0.03406834974884987, |
| "learning_rate": 0.0002, |
| "loss": 0.543938398361206, |
| "mean_token_accuracy": 0.7797689437866211, |
| "num_tokens": 26416380.0, |
| "step": 353 |
| }, |
| { |
| "entropy": 0.5513378083705902, |
| "epoch": 8.85, |
| "grad_norm": 0.03450295329093933, |
| "learning_rate": 0.0002, |
| "loss": 0.5490238070487976, |
| "mean_token_accuracy": 0.7770363390445709, |
| "num_tokens": 26490636.0, |
| "step": 354 |
| }, |
| { |
| "entropy": 0.5578331649303436, |
| "epoch": 8.875, |
| "grad_norm": 0.03415544703602791, |
| "learning_rate": 0.0002, |
| "loss": 0.5596639513969421, |
| "mean_token_accuracy": 0.7740722298622131, |
| "num_tokens": 26564954.0, |
| "step": 355 |
| }, |
| { |
| "entropy": 0.5540540665388107, |
| "epoch": 8.9, |
| "grad_norm": 0.03938233479857445, |
| "learning_rate": 0.0002, |
| "loss": 0.5562814474105835, |
| "mean_token_accuracy": 0.7746336907148361, |
| "num_tokens": 26639707.0, |
| "step": 356 |
| }, |
| { |
| "entropy": 0.5420689284801483, |
| "epoch": 8.925, |
| "grad_norm": 0.04445737600326538, |
| "learning_rate": 0.0002, |
| "loss": 0.5478014945983887, |
| "mean_token_accuracy": 0.7780271470546722, |
| "num_tokens": 26713693.0, |
| "step": 357 |
| }, |
| { |
| "entropy": 0.55097496509552, |
| "epoch": 8.95, |
| "grad_norm": 0.03611644357442856, |
| "learning_rate": 0.0002, |
| "loss": 0.5501728057861328, |
| "mean_token_accuracy": 0.7780314683914185, |
| "num_tokens": 26788869.0, |
| "step": 358 |
| }, |
| { |
| "entropy": 0.5535710901021957, |
| "epoch": 8.975, |
| "grad_norm": 0.03289943188428879, |
| "learning_rate": 0.0002, |
| "loss": 0.5510231852531433, |
| "mean_token_accuracy": 0.7763938903808594, |
| "num_tokens": 26863133.0, |
| "step": 359 |
| }, |
| { |
| "entropy": 0.5588638633489609, |
| "epoch": 9.0, |
| "grad_norm": 0.03923680633306503, |
| "learning_rate": 0.0002, |
| "loss": 0.5572277307510376, |
| "mean_token_accuracy": 0.7751377373933792, |
| "num_tokens": 26938828.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.5573904514312744, |
| "epoch": 9.025, |
| "grad_norm": 0.042408641427755356, |
| "learning_rate": 0.0002, |
| "loss": 0.546970009803772, |
| "mean_token_accuracy": 0.7769688963890076, |
| "num_tokens": 27013695.0, |
| "step": 361 |
| }, |
| { |
| "entropy": 0.5463464558124542, |
| "epoch": 9.05, |
| "grad_norm": 0.04984664544463158, |
| "learning_rate": 0.0002, |
| "loss": 0.5488175749778748, |
| "mean_token_accuracy": 0.7771357446908951, |
| "num_tokens": 27088413.0, |
| "step": 362 |
| }, |
| { |
| "entropy": 0.5289105176925659, |
| "epoch": 9.075, |
| "grad_norm": 0.04879127815365791, |
| "learning_rate": 0.0002, |
| "loss": 0.5313310623168945, |
| "mean_token_accuracy": 0.7846623361110687, |
| "num_tokens": 27162877.0, |
| "step": 363 |
| }, |
| { |
| "entropy": 0.5486065149307251, |
| "epoch": 9.1, |
| "grad_norm": 0.05812316760420799, |
| "learning_rate": 0.0002, |
| "loss": 0.5521052479743958, |
| "mean_token_accuracy": 0.7759816944599152, |
| "num_tokens": 27236796.0, |
| "step": 364 |
| }, |
| { |
| "entropy": 0.5488688349723816, |
| "epoch": 9.125, |
| "grad_norm": 0.048603836447000504, |
| "learning_rate": 0.0002, |
| "loss": 0.5465872287750244, |
| "mean_token_accuracy": 0.7791540026664734, |
| "num_tokens": 27311937.0, |
| "step": 365 |
| }, |
| { |
| "entropy": 0.5352853387594223, |
| "epoch": 9.15, |
| "grad_norm": 0.0544096976518631, |
| "learning_rate": 0.0002, |
| "loss": 0.5347145795822144, |
| "mean_token_accuracy": 0.7831054776906967, |
| "num_tokens": 27387311.0, |
| "step": 366 |
| }, |
| { |
| "entropy": 0.5446972846984863, |
| "epoch": 9.175, |
| "grad_norm": 0.07536739856004715, |
| "learning_rate": 0.0002, |
| "loss": 0.5502406358718872, |
| "mean_token_accuracy": 0.7761110365390778, |
| "num_tokens": 27462732.0, |
| "step": 367 |
| }, |
| { |
| "entropy": 0.5384257137775421, |
| "epoch": 9.2, |
| "grad_norm": 0.07809668034315109, |
| "learning_rate": 0.0002, |
| "loss": 0.5337420701980591, |
| "mean_token_accuracy": 0.7827627509832382, |
| "num_tokens": 27537612.0, |
| "step": 368 |
| }, |
| { |
| "entropy": 0.5450446158647537, |
| "epoch": 9.225, |
| "grad_norm": 0.05390315130352974, |
| "learning_rate": 0.0002, |
| "loss": 0.541022777557373, |
| "mean_token_accuracy": 0.7800815850496292, |
| "num_tokens": 27612425.0, |
| "step": 369 |
| }, |
| { |
| "entropy": 0.5376718789339066, |
| "epoch": 9.25, |
| "grad_norm": 0.050644826143980026, |
| "learning_rate": 0.0002, |
| "loss": 0.5411415100097656, |
| "mean_token_accuracy": 0.7793814241886139, |
| "num_tokens": 27686664.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.5415088385343552, |
| "epoch": 9.275, |
| "grad_norm": 0.07354080677032471, |
| "learning_rate": 0.0002, |
| "loss": 0.5452766418457031, |
| "mean_token_accuracy": 0.7790811359882355, |
| "num_tokens": 27762025.0, |
| "step": 371 |
| }, |
| { |
| "entropy": 0.5448242127895355, |
| "epoch": 9.3, |
| "grad_norm": 0.0593232586979866, |
| "learning_rate": 0.0002, |
| "loss": 0.5438794493675232, |
| "mean_token_accuracy": 0.7780045717954636, |
| "num_tokens": 27835553.0, |
| "step": 372 |
| }, |
| { |
| "entropy": 0.5405709743499756, |
| "epoch": 9.325, |
| "grad_norm": 0.05473851040005684, |
| "learning_rate": 0.0002, |
| "loss": 0.5379279851913452, |
| "mean_token_accuracy": 0.782084509730339, |
| "num_tokens": 27910916.0, |
| "step": 373 |
| }, |
| { |
| "entropy": 0.5353046655654907, |
| "epoch": 9.35, |
| "grad_norm": 0.07823872566223145, |
| "learning_rate": 0.0002, |
| "loss": 0.5367236733436584, |
| "mean_token_accuracy": 0.7828928083181381, |
| "num_tokens": 27986761.0, |
| "step": 374 |
| }, |
| { |
| "entropy": 0.5484424829483032, |
| "epoch": 9.375, |
| "grad_norm": 0.09651726484298706, |
| "learning_rate": 0.0002, |
| "loss": 0.5527921915054321, |
| "mean_token_accuracy": 0.7761628329753876, |
| "num_tokens": 28061459.0, |
| "step": 375 |
| }, |
| { |
| "entropy": 0.551795557141304, |
| "epoch": 9.4, |
| "grad_norm": 0.04663221165537834, |
| "learning_rate": 0.0002, |
| "loss": 0.5416221022605896, |
| "mean_token_accuracy": 0.7804577797651291, |
| "num_tokens": 28136180.0, |
| "step": 376 |
| }, |
| { |
| "entropy": 0.5437692701816559, |
| "epoch": 9.425, |
| "grad_norm": 0.060796961188316345, |
| "learning_rate": 0.0002, |
| "loss": 0.5428634881973267, |
| "mean_token_accuracy": 0.7803646326065063, |
| "num_tokens": 28210743.0, |
| "step": 377 |
| }, |
| { |
| "entropy": 0.5389476418495178, |
| "epoch": 9.45, |
| "grad_norm": 0.06818708777427673, |
| "learning_rate": 0.0002, |
| "loss": 0.542535662651062, |
| "mean_token_accuracy": 0.779965728521347, |
| "num_tokens": 28285511.0, |
| "step": 378 |
| }, |
| { |
| "entropy": 0.5542739927768707, |
| "epoch": 9.475, |
| "grad_norm": 0.040479619055986404, |
| "learning_rate": 0.0002, |
| "loss": 0.5535589456558228, |
| "mean_token_accuracy": 0.776050016283989, |
| "num_tokens": 28361419.0, |
| "step": 379 |
| }, |
| { |
| "entropy": 0.544225737452507, |
| "epoch": 9.5, |
| "grad_norm": 0.061609551310539246, |
| "learning_rate": 0.0002, |
| "loss": 0.5452514290809631, |
| "mean_token_accuracy": 0.7790030539035797, |
| "num_tokens": 28436492.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.546901598572731, |
| "epoch": 9.525, |
| "grad_norm": 0.0580863393843174, |
| "learning_rate": 0.0002, |
| "loss": 0.545783519744873, |
| "mean_token_accuracy": 0.7786774635314941, |
| "num_tokens": 28511395.0, |
| "step": 381 |
| }, |
| { |
| "entropy": 0.5370142608880997, |
| "epoch": 9.55, |
| "grad_norm": 0.052466463297605515, |
| "learning_rate": 0.0002, |
| "loss": 0.5396707057952881, |
| "mean_token_accuracy": 0.7794803082942963, |
| "num_tokens": 28585898.0, |
| "step": 382 |
| }, |
| { |
| "entropy": 0.5520068109035492, |
| "epoch": 9.575, |
| "grad_norm": 0.06656571477651596, |
| "learning_rate": 0.0002, |
| "loss": 0.555323600769043, |
| "mean_token_accuracy": 0.7742787003517151, |
| "num_tokens": 28660846.0, |
| "step": 383 |
| }, |
| { |
| "entropy": 0.5486234575510025, |
| "epoch": 9.6, |
| "grad_norm": 0.0534614734351635, |
| "learning_rate": 0.0002, |
| "loss": 0.5439633131027222, |
| "mean_token_accuracy": 0.7797495126724243, |
| "num_tokens": 28735594.0, |
| "step": 384 |
| }, |
| { |
| "entropy": 0.5434623211622238, |
| "epoch": 9.625, |
| "grad_norm": 0.045583903789520264, |
| "learning_rate": 0.0002, |
| "loss": 0.5419756174087524, |
| "mean_token_accuracy": 0.7793311029672623, |
| "num_tokens": 28809189.0, |
| "step": 385 |
| }, |
| { |
| "entropy": 0.5472258776426315, |
| "epoch": 9.65, |
| "grad_norm": 0.05377979576587677, |
| "learning_rate": 0.0002, |
| "loss": 0.5480138063430786, |
| "mean_token_accuracy": 0.7763115465641022, |
| "num_tokens": 28883637.0, |
| "step": 386 |
| }, |
| { |
| "entropy": 0.544014573097229, |
| "epoch": 9.675, |
| "grad_norm": 0.04192574322223663, |
| "learning_rate": 0.0002, |
| "loss": 0.544299304485321, |
| "mean_token_accuracy": 0.7778673022985458, |
| "num_tokens": 28957576.0, |
| "step": 387 |
| }, |
| { |
| "entropy": 0.5428405702114105, |
| "epoch": 9.7, |
| "grad_norm": 0.06100517511367798, |
| "learning_rate": 0.0002, |
| "loss": 0.5435603857040405, |
| "mean_token_accuracy": 0.7799983322620392, |
| "num_tokens": 29032062.0, |
| "step": 388 |
| }, |
| { |
| "entropy": 0.5476491451263428, |
| "epoch": 9.725, |
| "grad_norm": 0.048970699310302734, |
| "learning_rate": 0.0002, |
| "loss": 0.5444561839103699, |
| "mean_token_accuracy": 0.7781545221805573, |
| "num_tokens": 29106620.0, |
| "step": 389 |
| }, |
| { |
| "entropy": 0.5525311529636383, |
| "epoch": 9.75, |
| "grad_norm": 0.04579257220029831, |
| "learning_rate": 0.0002, |
| "loss": 0.5549685955047607, |
| "mean_token_accuracy": 0.7740297764539719, |
| "num_tokens": 29181675.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.5356399416923523, |
| "epoch": 9.775, |
| "grad_norm": 0.05338006839156151, |
| "learning_rate": 0.0002, |
| "loss": 0.5426127314567566, |
| "mean_token_accuracy": 0.7810541093349457, |
| "num_tokens": 29256940.0, |
| "step": 391 |
| }, |
| { |
| "entropy": 0.5530804395675659, |
| "epoch": 9.8, |
| "grad_norm": 0.04246848449110985, |
| "learning_rate": 0.0002, |
| "loss": 0.5501898527145386, |
| "mean_token_accuracy": 0.7768332362174988, |
| "num_tokens": 29331834.0, |
| "step": 392 |
| }, |
| { |
| "entropy": 0.5455987602472305, |
| "epoch": 9.825, |
| "grad_norm": 0.05141966789960861, |
| "learning_rate": 0.0002, |
| "loss": 0.5416363477706909, |
| "mean_token_accuracy": 0.779405802488327, |
| "num_tokens": 29406597.0, |
| "step": 393 |
| }, |
| { |
| "entropy": 0.5400111973285675, |
| "epoch": 9.85, |
| "grad_norm": 0.04637204110622406, |
| "learning_rate": 0.0002, |
| "loss": 0.5422732830047607, |
| "mean_token_accuracy": 0.7795074433088303, |
| "num_tokens": 29481010.0, |
| "step": 394 |
| }, |
| { |
| "entropy": 0.5512913167476654, |
| "epoch": 9.875, |
| "grad_norm": 0.047507502138614655, |
| "learning_rate": 0.0002, |
| "loss": 0.5522125363349915, |
| "mean_token_accuracy": 0.7759748101234436, |
| "num_tokens": 29557413.0, |
| "step": 395 |
| }, |
| { |
| "entropy": 0.5526851117610931, |
| "epoch": 9.9, |
| "grad_norm": 0.05560845509171486, |
| "learning_rate": 0.0002, |
| "loss": 0.5494586229324341, |
| "mean_token_accuracy": 0.7773067951202393, |
| "num_tokens": 29632250.0, |
| "step": 396 |
| }, |
| { |
| "entropy": 0.5464700162410736, |
| "epoch": 9.925, |
| "grad_norm": 0.03767940029501915, |
| "learning_rate": 0.0002, |
| "loss": 0.5446071624755859, |
| "mean_token_accuracy": 0.779336228966713, |
| "num_tokens": 29707194.0, |
| "step": 397 |
| }, |
| { |
| "entropy": 0.5447115898132324, |
| "epoch": 9.95, |
| "grad_norm": 0.06558585911989212, |
| "learning_rate": 0.0002, |
| "loss": 0.5500915050506592, |
| "mean_token_accuracy": 0.7773743122816086, |
| "num_tokens": 29782815.0, |
| "step": 398 |
| }, |
| { |
| "entropy": 0.5433839708566666, |
| "epoch": 9.975, |
| "grad_norm": 0.04332485795021057, |
| "learning_rate": 0.0002, |
| "loss": 0.5444520711898804, |
| "mean_token_accuracy": 0.7788331806659698, |
| "num_tokens": 29857625.0, |
| "step": 399 |
| }, |
| { |
| "entropy": 0.5447465628385544, |
| "epoch": 10.0, |
| "grad_norm": 0.049522414803504944, |
| "learning_rate": 0.0002, |
| "loss": 0.5472792387008667, |
| "mean_token_accuracy": 0.7776744365692139, |
| "num_tokens": 29931942.0, |
| "step": 400 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6443458485716255e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|