eac123's picture
Upload final checkpoint (checkpoint-400)
b985d4f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2453354597091675,
"epoch": 0.025,
"grad_norm": 0.7225268483161926,
"learning_rate": 0.0002,
"loss": 2.687872886657715,
"mean_token_accuracy": 0.514649897813797,
"num_tokens": 74890.0,
"step": 1
},
{
"entropy": 1.276440054178238,
"epoch": 0.05,
"grad_norm": 0.4285995066165924,
"learning_rate": 0.0002,
"loss": 2.2105631828308105,
"mean_token_accuracy": 0.5571927130222321,
"num_tokens": 149478.0,
"step": 2
},
{
"entropy": 1.3720857799053192,
"epoch": 0.075,
"grad_norm": 0.4019286632537842,
"learning_rate": 0.0002,
"loss": 1.8746938705444336,
"mean_token_accuracy": 0.5803174823522568,
"num_tokens": 224127.0,
"step": 3
},
{
"entropy": 1.4278347790241241,
"epoch": 0.1,
"grad_norm": 0.3917113244533539,
"learning_rate": 0.0002,
"loss": 1.6238957643508911,
"mean_token_accuracy": 0.5989710986614227,
"num_tokens": 299184.0,
"step": 4
},
{
"entropy": 1.408233255147934,
"epoch": 0.125,
"grad_norm": 0.2560015320777893,
"learning_rate": 0.0002,
"loss": 1.4765487909317017,
"mean_token_accuracy": 0.620598778128624,
"num_tokens": 373384.0,
"step": 5
},
{
"entropy": 1.4224989414215088,
"epoch": 0.15,
"grad_norm": 0.17689421772956848,
"learning_rate": 0.0002,
"loss": 1.358055830001831,
"mean_token_accuracy": 0.6330070495605469,
"num_tokens": 447785.0,
"step": 6
},
{
"entropy": 1.3971222341060638,
"epoch": 0.175,
"grad_norm": 0.13785144686698914,
"learning_rate": 0.0002,
"loss": 1.2616052627563477,
"mean_token_accuracy": 0.6447271108627319,
"num_tokens": 522258.0,
"step": 7
},
{
"entropy": 1.340851902961731,
"epoch": 0.2,
"grad_norm": 0.13835509121418,
"learning_rate": 0.0002,
"loss": 1.1717238426208496,
"mean_token_accuracy": 0.656842052936554,
"num_tokens": 597302.0,
"step": 8
},
{
"entropy": 1.290471225976944,
"epoch": 0.225,
"grad_norm": 0.13574543595314026,
"learning_rate": 0.0002,
"loss": 1.1026949882507324,
"mean_token_accuracy": 0.6640458852052689,
"num_tokens": 671987.0,
"step": 9
},
{
"entropy": 1.2009779810905457,
"epoch": 0.25,
"grad_norm": 0.1603001058101654,
"learning_rate": 0.0002,
"loss": 1.021569848060608,
"mean_token_accuracy": 0.6854399591684341,
"num_tokens": 746652.0,
"step": 10
},
{
"entropy": 1.10042205452919,
"epoch": 0.275,
"grad_norm": 0.1099264919757843,
"learning_rate": 0.0002,
"loss": 0.9563246369361877,
"mean_token_accuracy": 0.692952573299408,
"num_tokens": 821844.0,
"step": 11
},
{
"entropy": 1.0122379958629608,
"epoch": 0.3,
"grad_norm": 0.10267413407564163,
"learning_rate": 0.0002,
"loss": 0.9180705547332764,
"mean_token_accuracy": 0.697370782494545,
"num_tokens": 896293.0,
"step": 12
},
{
"entropy": 0.9369151145219803,
"epoch": 0.325,
"grad_norm": 0.09305275231599808,
"learning_rate": 0.0002,
"loss": 0.8677215576171875,
"mean_token_accuracy": 0.7073450535535812,
"num_tokens": 970585.0,
"step": 13
},
{
"entropy": 0.8721449226140976,
"epoch": 0.35,
"grad_norm": 0.09551584720611572,
"learning_rate": 0.0002,
"loss": 0.8373533487319946,
"mean_token_accuracy": 0.7065530866384506,
"num_tokens": 1045873.0,
"step": 14
},
{
"entropy": 0.8138987272977829,
"epoch": 0.375,
"grad_norm": 0.09821188449859619,
"learning_rate": 0.0002,
"loss": 0.8080641031265259,
"mean_token_accuracy": 0.7135835438966751,
"num_tokens": 1121090.0,
"step": 15
},
{
"entropy": 0.782063752412796,
"epoch": 0.4,
"grad_norm": 0.09285010397434235,
"learning_rate": 0.0002,
"loss": 0.7853477001190186,
"mean_token_accuracy": 0.7157466858625412,
"num_tokens": 1196168.0,
"step": 16
},
{
"entropy": 0.7581740468740463,
"epoch": 0.425,
"grad_norm": 0.0761309340596199,
"learning_rate": 0.0002,
"loss": 0.7540827393531799,
"mean_token_accuracy": 0.7216732352972031,
"num_tokens": 1271469.0,
"step": 17
},
{
"entropy": 0.7492464035749435,
"epoch": 0.45,
"grad_norm": 0.08480065315961838,
"learning_rate": 0.0002,
"loss": 0.7439934015274048,
"mean_token_accuracy": 0.7229771614074707,
"num_tokens": 1345721.0,
"step": 18
},
{
"entropy": 0.7309027314186096,
"epoch": 0.475,
"grad_norm": 0.09197527915239334,
"learning_rate": 0.0002,
"loss": 0.7264150381088257,
"mean_token_accuracy": 0.7261519432067871,
"num_tokens": 1421058.0,
"step": 19
},
{
"entropy": 0.7090645134449005,
"epoch": 0.5,
"grad_norm": 0.08530627936124802,
"learning_rate": 0.0002,
"loss": 0.7123439311981201,
"mean_token_accuracy": 0.729744628071785,
"num_tokens": 1496487.0,
"step": 20
},
{
"entropy": 0.6987305730581284,
"epoch": 0.525,
"grad_norm": 0.0691930428147316,
"learning_rate": 0.0002,
"loss": 0.694115400314331,
"mean_token_accuracy": 0.7338976860046387,
"num_tokens": 1571896.0,
"step": 21
},
{
"entropy": 0.6976824253797531,
"epoch": 0.55,
"grad_norm": 0.06659026443958282,
"learning_rate": 0.0002,
"loss": 0.6916466951370239,
"mean_token_accuracy": 0.733501672744751,
"num_tokens": 1645975.0,
"step": 22
},
{
"entropy": 0.6778732389211655,
"epoch": 0.575,
"grad_norm": 0.06560930609703064,
"learning_rate": 0.0002,
"loss": 0.6672404408454895,
"mean_token_accuracy": 0.7411384731531143,
"num_tokens": 1720725.0,
"step": 23
},
{
"entropy": 0.6772548854351044,
"epoch": 0.6,
"grad_norm": 0.06431178748607635,
"learning_rate": 0.0002,
"loss": 0.6628670692443848,
"mean_token_accuracy": 0.7417115569114685,
"num_tokens": 1795630.0,
"step": 24
},
{
"entropy": 0.6669440716505051,
"epoch": 0.625,
"grad_norm": 0.06317117810249329,
"learning_rate": 0.0002,
"loss": 0.6542543172836304,
"mean_token_accuracy": 0.7428397238254547,
"num_tokens": 1870446.0,
"step": 25
},
{
"entropy": 0.6646556407213211,
"epoch": 0.65,
"grad_norm": 0.05317490175366402,
"learning_rate": 0.0002,
"loss": 0.6533851623535156,
"mean_token_accuracy": 0.7443498522043228,
"num_tokens": 1945732.0,
"step": 26
},
{
"entropy": 0.6496605724096298,
"epoch": 0.675,
"grad_norm": 0.059705302119255066,
"learning_rate": 0.0002,
"loss": 0.6433347463607788,
"mean_token_accuracy": 0.7493544220924377,
"num_tokens": 2020426.0,
"step": 27
},
{
"entropy": 0.6413406282663345,
"epoch": 0.7,
"grad_norm": 0.05553779378533363,
"learning_rate": 0.0002,
"loss": 0.6379414200782776,
"mean_token_accuracy": 0.7490545064210892,
"num_tokens": 2094994.0,
"step": 28
},
{
"entropy": 0.6248471438884735,
"epoch": 0.725,
"grad_norm": 0.04922964423894882,
"learning_rate": 0.0002,
"loss": 0.619972825050354,
"mean_token_accuracy": 0.757112592458725,
"num_tokens": 2169826.0,
"step": 29
},
{
"entropy": 0.619708925485611,
"epoch": 0.75,
"grad_norm": 0.05293005332350731,
"learning_rate": 0.0002,
"loss": 0.6181076765060425,
"mean_token_accuracy": 0.7557289451360703,
"num_tokens": 2244717.0,
"step": 30
},
{
"entropy": 0.6241249591112137,
"epoch": 0.775,
"grad_norm": 0.05528721585869789,
"learning_rate": 0.0002,
"loss": 0.6231218576431274,
"mean_token_accuracy": 0.7546616345643997,
"num_tokens": 2319262.0,
"step": 31
},
{
"entropy": 0.6172136664390564,
"epoch": 0.8,
"grad_norm": 0.05522594600915909,
"learning_rate": 0.0002,
"loss": 0.6161586046218872,
"mean_token_accuracy": 0.756885826587677,
"num_tokens": 2394260.0,
"step": 32
},
{
"entropy": 0.6185073405504227,
"epoch": 0.825,
"grad_norm": 0.05178181454539299,
"learning_rate": 0.0002,
"loss": 0.615160346031189,
"mean_token_accuracy": 0.757054477930069,
"num_tokens": 2469143.0,
"step": 33
},
{
"entropy": 0.6113822758197784,
"epoch": 0.85,
"grad_norm": 0.047203969210386276,
"learning_rate": 0.0002,
"loss": 0.6075619459152222,
"mean_token_accuracy": 0.7611373513936996,
"num_tokens": 2544884.0,
"step": 34
},
{
"entropy": 0.6107644140720367,
"epoch": 0.875,
"grad_norm": 0.04446641355752945,
"learning_rate": 0.0002,
"loss": 0.6088961958885193,
"mean_token_accuracy": 0.7600347548723221,
"num_tokens": 2619389.0,
"step": 35
},
{
"entropy": 0.6040280014276505,
"epoch": 0.9,
"grad_norm": 0.04550258815288544,
"learning_rate": 0.0002,
"loss": 0.6014739871025085,
"mean_token_accuracy": 0.7626253515481949,
"num_tokens": 2694103.0,
"step": 36
},
{
"entropy": 0.6048439294099808,
"epoch": 0.925,
"grad_norm": 0.047757431864738464,
"learning_rate": 0.0002,
"loss": 0.6057171821594238,
"mean_token_accuracy": 0.760799378156662,
"num_tokens": 2769246.0,
"step": 37
},
{
"entropy": 0.6022361516952515,
"epoch": 0.95,
"grad_norm": 0.046528495848178864,
"learning_rate": 0.0002,
"loss": 0.6049805283546448,
"mean_token_accuracy": 0.7588685750961304,
"num_tokens": 2843271.0,
"step": 38
},
{
"entropy": 0.5945823639631271,
"epoch": 0.975,
"grad_norm": 0.04531135782599449,
"learning_rate": 0.0002,
"loss": 0.5926676988601685,
"mean_token_accuracy": 0.7653662264347076,
"num_tokens": 2918891.0,
"step": 39
},
{
"entropy": 0.5929759591817856,
"epoch": 1.0,
"grad_norm": 0.04308256506919861,
"learning_rate": 0.0002,
"loss": 0.5950115323066711,
"mean_token_accuracy": 0.7647853493690491,
"num_tokens": 2993209.0,
"step": 40
},
{
"entropy": 0.5922251790761948,
"epoch": 1.025,
"grad_norm": 0.03466418385505676,
"learning_rate": 0.0002,
"loss": 0.5925185084342957,
"mean_token_accuracy": 0.7662032544612885,
"num_tokens": 3067858.0,
"step": 41
},
{
"entropy": 0.5973207205533981,
"epoch": 1.05,
"grad_norm": 0.04712899401783943,
"learning_rate": 0.0002,
"loss": 0.5978673696517944,
"mean_token_accuracy": 0.7634124308824539,
"num_tokens": 3142219.0,
"step": 42
},
{
"entropy": 0.6021459102630615,
"epoch": 1.075,
"grad_norm": 0.038375336676836014,
"learning_rate": 0.0002,
"loss": 0.598951518535614,
"mean_token_accuracy": 0.7614264935255051,
"num_tokens": 3217036.0,
"step": 43
},
{
"entropy": 0.6001265943050385,
"epoch": 1.1,
"grad_norm": 0.03852194547653198,
"learning_rate": 0.0002,
"loss": 0.596366286277771,
"mean_token_accuracy": 0.761825367808342,
"num_tokens": 3292192.0,
"step": 44
},
{
"entropy": 0.5948344469070435,
"epoch": 1.125,
"grad_norm": 0.03576741740107536,
"learning_rate": 0.0002,
"loss": 0.5903418660163879,
"mean_token_accuracy": 0.7651200741529465,
"num_tokens": 3367236.0,
"step": 45
},
{
"entropy": 0.5950423181056976,
"epoch": 1.15,
"grad_norm": 0.04135625809431076,
"learning_rate": 0.0002,
"loss": 0.5973416566848755,
"mean_token_accuracy": 0.7616013288497925,
"num_tokens": 3442037.0,
"step": 46
},
{
"entropy": 0.5873961746692657,
"epoch": 1.175,
"grad_norm": 0.035329703241586685,
"learning_rate": 0.0002,
"loss": 0.5871415138244629,
"mean_token_accuracy": 0.7657803446054459,
"num_tokens": 3516585.0,
"step": 47
},
{
"entropy": 0.5887223035097122,
"epoch": 1.2,
"grad_norm": 0.029021920636296272,
"learning_rate": 0.0002,
"loss": 0.5878927707672119,
"mean_token_accuracy": 0.764517068862915,
"num_tokens": 3590613.0,
"step": 48
},
{
"entropy": 0.5823997408151627,
"epoch": 1.225,
"grad_norm": 0.036641936749219894,
"learning_rate": 0.0002,
"loss": 0.5807881355285645,
"mean_token_accuracy": 0.7683323621749878,
"num_tokens": 3665437.0,
"step": 49
},
{
"entropy": 0.5899570137262344,
"epoch": 1.25,
"grad_norm": 0.030183902010321617,
"learning_rate": 0.0002,
"loss": 0.587297797203064,
"mean_token_accuracy": 0.7668341845273972,
"num_tokens": 3739929.0,
"step": 50
},
{
"entropy": 0.5848294198513031,
"epoch": 1.275,
"grad_norm": 0.031763091683387756,
"learning_rate": 0.0002,
"loss": 0.5853850245475769,
"mean_token_accuracy": 0.7676772177219391,
"num_tokens": 3814610.0,
"step": 51
},
{
"entropy": 0.5903092175722122,
"epoch": 1.3,
"grad_norm": 0.030446121469140053,
"learning_rate": 0.0002,
"loss": 0.5921140313148499,
"mean_token_accuracy": 0.7648924738168716,
"num_tokens": 3889259.0,
"step": 52
},
{
"entropy": 0.5961786210536957,
"epoch": 1.325,
"grad_norm": 0.02807600237429142,
"learning_rate": 0.0002,
"loss": 0.5954646468162537,
"mean_token_accuracy": 0.7630998939275742,
"num_tokens": 3963853.0,
"step": 53
},
{
"entropy": 0.584626317024231,
"epoch": 1.35,
"grad_norm": 0.023483913391828537,
"learning_rate": 0.0002,
"loss": 0.5831642150878906,
"mean_token_accuracy": 0.7670921683311462,
"num_tokens": 4038842.0,
"step": 54
},
{
"entropy": 0.5851259380578995,
"epoch": 1.375,
"grad_norm": 0.029443850740790367,
"learning_rate": 0.0002,
"loss": 0.5853797197341919,
"mean_token_accuracy": 0.7661719769239426,
"num_tokens": 4113390.0,
"step": 55
},
{
"entropy": 0.586825504899025,
"epoch": 1.4,
"grad_norm": 0.027121173217892647,
"learning_rate": 0.0002,
"loss": 0.587054967880249,
"mean_token_accuracy": 0.7664920389652252,
"num_tokens": 4188776.0,
"step": 56
},
{
"entropy": 0.5881156474351883,
"epoch": 1.425,
"grad_norm": 0.02241705358028412,
"learning_rate": 0.0002,
"loss": 0.5837085247039795,
"mean_token_accuracy": 0.7671953588724136,
"num_tokens": 4263443.0,
"step": 57
},
{
"entropy": 0.5906041860580444,
"epoch": 1.45,
"grad_norm": 0.024774691089987755,
"learning_rate": 0.0002,
"loss": 0.58786940574646,
"mean_token_accuracy": 0.7648984342813492,
"num_tokens": 4337825.0,
"step": 58
},
{
"entropy": 0.5956196784973145,
"epoch": 1.475,
"grad_norm": 0.02898634597659111,
"learning_rate": 0.0002,
"loss": 0.5959815979003906,
"mean_token_accuracy": 0.7617602795362473,
"num_tokens": 4412117.0,
"step": 59
},
{
"entropy": 0.5868811905384064,
"epoch": 1.5,
"grad_norm": 0.024418242275714874,
"learning_rate": 0.0002,
"loss": 0.5837817192077637,
"mean_token_accuracy": 0.7679703235626221,
"num_tokens": 4487325.0,
"step": 60
},
{
"entropy": 0.5820131897926331,
"epoch": 1.525,
"grad_norm": 0.018590735271573067,
"learning_rate": 0.0002,
"loss": 0.5806015729904175,
"mean_token_accuracy": 0.7693226784467697,
"num_tokens": 4562700.0,
"step": 61
},
{
"entropy": 0.585755005478859,
"epoch": 1.55,
"grad_norm": 0.022883402183651924,
"learning_rate": 0.0002,
"loss": 0.5838991403579712,
"mean_token_accuracy": 0.7651054114103317,
"num_tokens": 4637209.0,
"step": 62
},
{
"entropy": 0.584441751241684,
"epoch": 1.575,
"grad_norm": 0.027678513899445534,
"learning_rate": 0.0002,
"loss": 0.585010826587677,
"mean_token_accuracy": 0.7656246721744537,
"num_tokens": 4712394.0,
"step": 63
},
{
"entropy": 0.600657045841217,
"epoch": 1.6,
"grad_norm": 0.021883023902773857,
"learning_rate": 0.0002,
"loss": 0.6001325845718384,
"mean_token_accuracy": 0.7614835500717163,
"num_tokens": 4787928.0,
"step": 64
},
{
"entropy": 0.584406390786171,
"epoch": 1.625,
"grad_norm": 0.02202012576162815,
"learning_rate": 0.0002,
"loss": 0.5836704969406128,
"mean_token_accuracy": 0.7675553858280182,
"num_tokens": 4862997.0,
"step": 65
},
{
"entropy": 0.5805485397577286,
"epoch": 1.65,
"grad_norm": 0.02676200121641159,
"learning_rate": 0.0002,
"loss": 0.5817915201187134,
"mean_token_accuracy": 0.7683773785829544,
"num_tokens": 4937295.0,
"step": 66
},
{
"entropy": 0.5826524794101715,
"epoch": 1.675,
"grad_norm": 0.0226582121104002,
"learning_rate": 0.0002,
"loss": 0.5793530941009521,
"mean_token_accuracy": 0.7675565928220749,
"num_tokens": 5012169.0,
"step": 67
},
{
"entropy": 0.5945204049348831,
"epoch": 1.7,
"grad_norm": 0.023526955395936966,
"learning_rate": 0.0002,
"loss": 0.591584324836731,
"mean_token_accuracy": 0.7634985893964767,
"num_tokens": 5087480.0,
"step": 68
},
{
"entropy": 0.5845358669757843,
"epoch": 1.725,
"grad_norm": 0.026141872629523277,
"learning_rate": 0.0002,
"loss": 0.5840914249420166,
"mean_token_accuracy": 0.7668609470129013,
"num_tokens": 5161892.0,
"step": 69
},
{
"entropy": 0.5799373537302017,
"epoch": 1.75,
"grad_norm": 0.023512404412031174,
"learning_rate": 0.0002,
"loss": 0.5786521434783936,
"mean_token_accuracy": 0.7684901505708694,
"num_tokens": 5237346.0,
"step": 70
},
{
"entropy": 0.5913780480623245,
"epoch": 1.775,
"grad_norm": 0.021628571674227715,
"learning_rate": 0.0002,
"loss": 0.5903242826461792,
"mean_token_accuracy": 0.7634606808423996,
"num_tokens": 5312258.0,
"step": 71
},
{
"entropy": 0.5781446248292923,
"epoch": 1.8,
"grad_norm": 0.025359593331813812,
"learning_rate": 0.0002,
"loss": 0.5779482126235962,
"mean_token_accuracy": 0.7685064077377319,
"num_tokens": 5386770.0,
"step": 72
},
{
"entropy": 0.5885084420442581,
"epoch": 1.825,
"grad_norm": 0.02480519749224186,
"learning_rate": 0.0002,
"loss": 0.5897427797317505,
"mean_token_accuracy": 0.7633197009563446,
"num_tokens": 5461637.0,
"step": 73
},
{
"entropy": 0.5789479911327362,
"epoch": 1.85,
"grad_norm": 0.021689681336283684,
"learning_rate": 0.0002,
"loss": 0.5763558149337769,
"mean_token_accuracy": 0.7686384618282318,
"num_tokens": 5537141.0,
"step": 74
},
{
"entropy": 0.5846573114395142,
"epoch": 1.875,
"grad_norm": 0.023601949214935303,
"learning_rate": 0.0002,
"loss": 0.585774302482605,
"mean_token_accuracy": 0.7660125941038132,
"num_tokens": 5611711.0,
"step": 75
},
{
"entropy": 0.5856555849313736,
"epoch": 1.9,
"grad_norm": 0.02879919670522213,
"learning_rate": 0.0002,
"loss": 0.5847885608673096,
"mean_token_accuracy": 0.7661805897951126,
"num_tokens": 5687175.0,
"step": 76
},
{
"entropy": 0.591449961066246,
"epoch": 1.925,
"grad_norm": 0.023963551968336105,
"learning_rate": 0.0002,
"loss": 0.5909919738769531,
"mean_token_accuracy": 0.7625879198312759,
"num_tokens": 5762051.0,
"step": 77
},
{
"entropy": 0.5732830464839935,
"epoch": 1.95,
"grad_norm": 0.02373599074780941,
"learning_rate": 0.0002,
"loss": 0.5746445059776306,
"mean_token_accuracy": 0.7692500203847885,
"num_tokens": 5836504.0,
"step": 78
},
{
"entropy": 0.5739967525005341,
"epoch": 1.975,
"grad_norm": 0.024121304973959923,
"learning_rate": 0.0002,
"loss": 0.5751599073410034,
"mean_token_accuracy": 0.7690701484680176,
"num_tokens": 5911400.0,
"step": 79
},
{
"entropy": 0.5794458240270615,
"epoch": 2.0,
"grad_norm": 0.023465219885110855,
"learning_rate": 0.0002,
"loss": 0.5783512592315674,
"mean_token_accuracy": 0.7684450447559357,
"num_tokens": 5986482.0,
"step": 80
},
{
"entropy": 0.5798445045948029,
"epoch": 2.025,
"grad_norm": 0.0209247674793005,
"learning_rate": 0.0002,
"loss": 0.5802971124649048,
"mean_token_accuracy": 0.7673298269510269,
"num_tokens": 6061119.0,
"step": 81
},
{
"entropy": 0.5746374428272247,
"epoch": 2.05,
"grad_norm": 0.022763773798942566,
"learning_rate": 0.0002,
"loss": 0.572521448135376,
"mean_token_accuracy": 0.7688614279031754,
"num_tokens": 6135886.0,
"step": 82
},
{
"entropy": 0.5834762305021286,
"epoch": 2.075,
"grad_norm": 0.024529799818992615,
"learning_rate": 0.0002,
"loss": 0.5825608968734741,
"mean_token_accuracy": 0.7661754339933395,
"num_tokens": 6211037.0,
"step": 83
},
{
"entropy": 0.5820914059877396,
"epoch": 2.1,
"grad_norm": 0.02345711924135685,
"learning_rate": 0.0002,
"loss": 0.5759867429733276,
"mean_token_accuracy": 0.7681425362825394,
"num_tokens": 6285250.0,
"step": 84
},
{
"entropy": 0.5817355811595917,
"epoch": 2.125,
"grad_norm": 0.025857318192720413,
"learning_rate": 0.0002,
"loss": 0.5796504020690918,
"mean_token_accuracy": 0.7672218382358551,
"num_tokens": 6360531.0,
"step": 85
},
{
"entropy": 0.5870300382375717,
"epoch": 2.15,
"grad_norm": 0.01944359764456749,
"learning_rate": 0.0002,
"loss": 0.5889190435409546,
"mean_token_accuracy": 0.7646535336971283,
"num_tokens": 6434545.0,
"step": 86
},
{
"entropy": 0.5707338750362396,
"epoch": 2.175,
"grad_norm": 0.022768637165427208,
"learning_rate": 0.0002,
"loss": 0.5740299224853516,
"mean_token_accuracy": 0.7691160142421722,
"num_tokens": 6509711.0,
"step": 87
},
{
"entropy": 0.5874478965997696,
"epoch": 2.2,
"grad_norm": 0.02508588135242462,
"learning_rate": 0.0002,
"loss": 0.5900440216064453,
"mean_token_accuracy": 0.7628317475318909,
"num_tokens": 6584583.0,
"step": 88
},
{
"entropy": 0.5802857577800751,
"epoch": 2.225,
"grad_norm": 0.02080141380429268,
"learning_rate": 0.0002,
"loss": 0.5806664824485779,
"mean_token_accuracy": 0.7670494765043259,
"num_tokens": 6658937.0,
"step": 89
},
{
"entropy": 0.5850326269865036,
"epoch": 2.25,
"grad_norm": 0.020431680604815483,
"learning_rate": 0.0002,
"loss": 0.5813099145889282,
"mean_token_accuracy": 0.7660593539476395,
"num_tokens": 6733805.0,
"step": 90
},
{
"entropy": 0.5846189707517624,
"epoch": 2.275,
"grad_norm": 0.02515556663274765,
"learning_rate": 0.0002,
"loss": 0.5851079225540161,
"mean_token_accuracy": 0.7659571021795273,
"num_tokens": 6808609.0,
"step": 91
},
{
"entropy": 0.6003973633050919,
"epoch": 2.3,
"grad_norm": 0.02406417950987816,
"learning_rate": 0.0002,
"loss": 0.5949417352676392,
"mean_token_accuracy": 0.7613963186740875,
"num_tokens": 6883966.0,
"step": 92
},
{
"entropy": 0.574845939874649,
"epoch": 2.325,
"grad_norm": 0.025337981060147285,
"learning_rate": 0.0002,
"loss": 0.5741162896156311,
"mean_token_accuracy": 0.7704001069068909,
"num_tokens": 6957938.0,
"step": 93
},
{
"entropy": 0.5916647762060165,
"epoch": 2.35,
"grad_norm": 0.021806908771395683,
"learning_rate": 0.0002,
"loss": 0.5912328958511353,
"mean_token_accuracy": 0.7635269463062286,
"num_tokens": 7033240.0,
"step": 94
},
{
"entropy": 0.5794739425182343,
"epoch": 2.375,
"grad_norm": 0.021972037851810455,
"learning_rate": 0.0002,
"loss": 0.5759366750717163,
"mean_token_accuracy": 0.7700382471084595,
"num_tokens": 7108387.0,
"step": 95
},
{
"entropy": 0.5825973749160767,
"epoch": 2.4,
"grad_norm": 0.02072254940867424,
"learning_rate": 0.0002,
"loss": 0.5852139592170715,
"mean_token_accuracy": 0.7643989473581314,
"num_tokens": 7182686.0,
"step": 96
},
{
"entropy": 0.5752668976783752,
"epoch": 2.425,
"grad_norm": 0.02361258678138256,
"learning_rate": 0.0002,
"loss": 0.5763595104217529,
"mean_token_accuracy": 0.768645167350769,
"num_tokens": 7258669.0,
"step": 97
},
{
"entropy": 0.5715779960155487,
"epoch": 2.45,
"grad_norm": 0.02046627178788185,
"learning_rate": 0.0002,
"loss": 0.5750976800918579,
"mean_token_accuracy": 0.7683692574501038,
"num_tokens": 7333818.0,
"step": 98
},
{
"entropy": 0.5790873467922211,
"epoch": 2.475,
"grad_norm": 0.02545187622308731,
"learning_rate": 0.0002,
"loss": 0.5775801539421082,
"mean_token_accuracy": 0.7690194249153137,
"num_tokens": 7408902.0,
"step": 99
},
{
"entropy": 0.5897374451160431,
"epoch": 2.5,
"grad_norm": 0.021124642342329025,
"learning_rate": 0.0002,
"loss": 0.5902318358421326,
"mean_token_accuracy": 0.7630759179592133,
"num_tokens": 7483783.0,
"step": 100
},
{
"entropy": 0.580902174115181,
"epoch": 2.525,
"grad_norm": 0.019817229360342026,
"learning_rate": 0.0002,
"loss": 0.5784831643104553,
"mean_token_accuracy": 0.7668700814247131,
"num_tokens": 7557724.0,
"step": 101
},
{
"entropy": 0.5834500938653946,
"epoch": 2.55,
"grad_norm": 0.024572577327489853,
"learning_rate": 0.0002,
"loss": 0.5787115097045898,
"mean_token_accuracy": 0.7675525993108749,
"num_tokens": 7633373.0,
"step": 102
},
{
"entropy": 0.5778897404670715,
"epoch": 2.575,
"grad_norm": 0.022201891988515854,
"learning_rate": 0.0002,
"loss": 0.5758777260780334,
"mean_token_accuracy": 0.767838791012764,
"num_tokens": 7708329.0,
"step": 103
},
{
"entropy": 0.578838050365448,
"epoch": 2.6,
"grad_norm": 0.02364918775856495,
"learning_rate": 0.0002,
"loss": 0.581870436668396,
"mean_token_accuracy": 0.7661506086587906,
"num_tokens": 7782911.0,
"step": 104
},
{
"entropy": 0.5752829909324646,
"epoch": 2.625,
"grad_norm": 0.022952446714043617,
"learning_rate": 0.0002,
"loss": 0.5794333815574646,
"mean_token_accuracy": 0.7660450041294098,
"num_tokens": 7857948.0,
"step": 105
},
{
"entropy": 0.5836146026849747,
"epoch": 2.65,
"grad_norm": 0.02250981330871582,
"learning_rate": 0.0002,
"loss": 0.5844818949699402,
"mean_token_accuracy": 0.7643773108720779,
"num_tokens": 7932466.0,
"step": 106
},
{
"entropy": 0.5810949504375458,
"epoch": 2.675,
"grad_norm": 0.021099543198943138,
"learning_rate": 0.0002,
"loss": 0.5812161564826965,
"mean_token_accuracy": 0.7672342509031296,
"num_tokens": 8007372.0,
"step": 107
},
{
"entropy": 0.5868075489997864,
"epoch": 2.7,
"grad_norm": 0.024328874424099922,
"learning_rate": 0.0002,
"loss": 0.583724319934845,
"mean_token_accuracy": 0.765077531337738,
"num_tokens": 8081242.0,
"step": 108
},
{
"entropy": 0.5784394592046738,
"epoch": 2.725,
"grad_norm": 0.023478057235479355,
"learning_rate": 0.0002,
"loss": 0.5791985988616943,
"mean_token_accuracy": 0.7663314342498779,
"num_tokens": 8155716.0,
"step": 109
},
{
"entropy": 0.5747242122888565,
"epoch": 2.75,
"grad_norm": 0.02284744381904602,
"learning_rate": 0.0002,
"loss": 0.5755459070205688,
"mean_token_accuracy": 0.7680166959762573,
"num_tokens": 8229840.0,
"step": 110
},
{
"entropy": 0.574517697095871,
"epoch": 2.775,
"grad_norm": 0.022360296919941902,
"learning_rate": 0.0002,
"loss": 0.5729012489318848,
"mean_token_accuracy": 0.7694876343011856,
"num_tokens": 8305151.0,
"step": 111
},
{
"entropy": 0.5775211006402969,
"epoch": 2.8,
"grad_norm": 0.025003811344504356,
"learning_rate": 0.0002,
"loss": 0.5751063227653503,
"mean_token_accuracy": 0.7694868594408035,
"num_tokens": 8379669.0,
"step": 112
},
{
"entropy": 0.5808530151844025,
"epoch": 2.825,
"grad_norm": 0.01840745098888874,
"learning_rate": 0.0002,
"loss": 0.58048415184021,
"mean_token_accuracy": 0.7667081654071808,
"num_tokens": 8454144.0,
"step": 113
},
{
"entropy": 0.5671190619468689,
"epoch": 2.85,
"grad_norm": 0.024347495287656784,
"learning_rate": 0.0002,
"loss": 0.568537712097168,
"mean_token_accuracy": 0.7717441022396088,
"num_tokens": 8529127.0,
"step": 114
},
{
"entropy": 0.5740341693162918,
"epoch": 2.875,
"grad_norm": 0.024653296917676926,
"learning_rate": 0.0002,
"loss": 0.5723626613616943,
"mean_token_accuracy": 0.7696013450622559,
"num_tokens": 8604291.0,
"step": 115
},
{
"entropy": 0.5796328634023666,
"epoch": 2.9,
"grad_norm": 0.020844636484980583,
"learning_rate": 0.0002,
"loss": 0.5782836079597473,
"mean_token_accuracy": 0.7682057768106461,
"num_tokens": 8679195.0,
"step": 116
},
{
"entropy": 0.576483279466629,
"epoch": 2.925,
"grad_norm": 0.021920515224337578,
"learning_rate": 0.0002,
"loss": 0.578456699848175,
"mean_token_accuracy": 0.7670477628707886,
"num_tokens": 8754009.0,
"step": 117
},
{
"entropy": 0.5752202421426773,
"epoch": 2.95,
"grad_norm": 0.020918108522892,
"learning_rate": 0.0002,
"loss": 0.5750131011009216,
"mean_token_accuracy": 0.7684811949729919,
"num_tokens": 8828191.0,
"step": 118
},
{
"entropy": 0.5851098299026489,
"epoch": 2.975,
"grad_norm": 0.02478696219623089,
"learning_rate": 0.0002,
"loss": 0.5817323327064514,
"mean_token_accuracy": 0.7672260999679565,
"num_tokens": 8903936.0,
"step": 119
},
{
"entropy": 0.5765727013349533,
"epoch": 3.0,
"grad_norm": 0.021200377494096756,
"learning_rate": 0.0002,
"loss": 0.575070858001709,
"mean_token_accuracy": 0.7691281586885452,
"num_tokens": 8979681.0,
"step": 120
},
{
"entropy": 0.5856994092464447,
"epoch": 3.025,
"grad_norm": 0.01982778124511242,
"learning_rate": 0.0002,
"loss": 0.5849777460098267,
"mean_token_accuracy": 0.7634477466344833,
"num_tokens": 9053813.0,
"step": 121
},
{
"entropy": 0.5709525793790817,
"epoch": 3.05,
"grad_norm": 0.020404471084475517,
"learning_rate": 0.0002,
"loss": 0.5730876326560974,
"mean_token_accuracy": 0.7704098522663116,
"num_tokens": 9128173.0,
"step": 122
},
{
"entropy": 0.5704852193593979,
"epoch": 3.075,
"grad_norm": 0.016850776970386505,
"learning_rate": 0.0002,
"loss": 0.5663577318191528,
"mean_token_accuracy": 0.7722858935594559,
"num_tokens": 9203061.0,
"step": 123
},
{
"entropy": 0.567479208111763,
"epoch": 3.1,
"grad_norm": 0.025294054299592972,
"learning_rate": 0.0002,
"loss": 0.5650860071182251,
"mean_token_accuracy": 0.7725925892591476,
"num_tokens": 9278418.0,
"step": 124
},
{
"entropy": 0.5768236368894577,
"epoch": 3.125,
"grad_norm": 0.021733148023486137,
"learning_rate": 0.0002,
"loss": 0.5766515731811523,
"mean_token_accuracy": 0.7679264396429062,
"num_tokens": 9353227.0,
"step": 125
},
{
"entropy": 0.5702922940254211,
"epoch": 3.15,
"grad_norm": 0.023117227479815483,
"learning_rate": 0.0002,
"loss": 0.5716854929924011,
"mean_token_accuracy": 0.7702013403177261,
"num_tokens": 9428399.0,
"step": 126
},
{
"entropy": 0.5861406326293945,
"epoch": 3.175,
"grad_norm": 0.02236233651638031,
"learning_rate": 0.0002,
"loss": 0.5864638090133667,
"mean_token_accuracy": 0.7633958756923676,
"num_tokens": 9503325.0,
"step": 127
},
{
"entropy": 0.5789273381233215,
"epoch": 3.2,
"grad_norm": 0.02411346696317196,
"learning_rate": 0.0002,
"loss": 0.5779775977134705,
"mean_token_accuracy": 0.7668363004922867,
"num_tokens": 9578044.0,
"step": 128
},
{
"entropy": 0.5797711908817291,
"epoch": 3.225,
"grad_norm": 0.023102540522813797,
"learning_rate": 0.0002,
"loss": 0.5821047425270081,
"mean_token_accuracy": 0.7657962143421173,
"num_tokens": 9652622.0,
"step": 129
},
{
"entropy": 0.5753140151500702,
"epoch": 3.25,
"grad_norm": 0.02087407372891903,
"learning_rate": 0.0002,
"loss": 0.5716829895973206,
"mean_token_accuracy": 0.7688267230987549,
"num_tokens": 9727771.0,
"step": 130
},
{
"entropy": 0.5765914916992188,
"epoch": 3.275,
"grad_norm": 0.022741632536053658,
"learning_rate": 0.0002,
"loss": 0.5777339935302734,
"mean_token_accuracy": 0.7680183500051498,
"num_tokens": 9802572.0,
"step": 131
},
{
"entropy": 0.5704336613416672,
"epoch": 3.3,
"grad_norm": 0.02135850489139557,
"learning_rate": 0.0002,
"loss": 0.5711397528648376,
"mean_token_accuracy": 0.7697059661149979,
"num_tokens": 9877509.0,
"step": 132
},
{
"entropy": 0.5819953978061676,
"epoch": 3.325,
"grad_norm": 0.028905468061566353,
"learning_rate": 0.0002,
"loss": 0.579884946346283,
"mean_token_accuracy": 0.7667191326618195,
"num_tokens": 9951617.0,
"step": 133
},
{
"entropy": 0.5826835632324219,
"epoch": 3.35,
"grad_norm": 0.021706923842430115,
"learning_rate": 0.0002,
"loss": 0.5823646783828735,
"mean_token_accuracy": 0.764801412820816,
"num_tokens": 10026079.0,
"step": 134
},
{
"entropy": 0.5760972201824188,
"epoch": 3.375,
"grad_norm": 0.02655896358191967,
"learning_rate": 0.0002,
"loss": 0.5790044665336609,
"mean_token_accuracy": 0.7664503753185272,
"num_tokens": 10101148.0,
"step": 135
},
{
"entropy": 0.5685720443725586,
"epoch": 3.4,
"grad_norm": 0.02456754446029663,
"learning_rate": 0.0002,
"loss": 0.5716453790664673,
"mean_token_accuracy": 0.7696442306041718,
"num_tokens": 10175811.0,
"step": 136
},
{
"entropy": 0.5759570449590683,
"epoch": 3.425,
"grad_norm": 0.02254396118223667,
"learning_rate": 0.0002,
"loss": 0.5739217400550842,
"mean_token_accuracy": 0.7697554975748062,
"num_tokens": 10250838.0,
"step": 137
},
{
"entropy": 0.5719419866800308,
"epoch": 3.45,
"grad_norm": 0.024404190480709076,
"learning_rate": 0.0002,
"loss": 0.5725557804107666,
"mean_token_accuracy": 0.769346296787262,
"num_tokens": 10326342.0,
"step": 138
},
{
"entropy": 0.5715157091617584,
"epoch": 3.475,
"grad_norm": 0.022105256095528603,
"learning_rate": 0.0002,
"loss": 0.5716947317123413,
"mean_token_accuracy": 0.7693071365356445,
"num_tokens": 10402338.0,
"step": 139
},
{
"entropy": 0.5735979527235031,
"epoch": 3.5,
"grad_norm": 0.023778000846505165,
"learning_rate": 0.0002,
"loss": 0.5760594606399536,
"mean_token_accuracy": 0.7667177468538284,
"num_tokens": 10476752.0,
"step": 140
},
{
"entropy": 0.5770183205604553,
"epoch": 3.525,
"grad_norm": 0.021110933274030685,
"learning_rate": 0.0002,
"loss": 0.5751050710678101,
"mean_token_accuracy": 0.7673051208257675,
"num_tokens": 10551920.0,
"step": 141
},
{
"entropy": 0.5748606622219086,
"epoch": 3.55,
"grad_norm": 0.020023738965392113,
"learning_rate": 0.0002,
"loss": 0.5679011940956116,
"mean_token_accuracy": 0.7721457779407501,
"num_tokens": 10626809.0,
"step": 142
},
{
"entropy": 0.582221657037735,
"epoch": 3.575,
"grad_norm": 0.02178809978067875,
"learning_rate": 0.0002,
"loss": 0.579848051071167,
"mean_token_accuracy": 0.7657678723335266,
"num_tokens": 10701733.0,
"step": 143
},
{
"entropy": 0.571207270026207,
"epoch": 3.6,
"grad_norm": 0.021556353196501732,
"learning_rate": 0.0002,
"loss": 0.5747779607772827,
"mean_token_accuracy": 0.7685752362012863,
"num_tokens": 10776164.0,
"step": 144
},
{
"entropy": 0.5803283900022507,
"epoch": 3.625,
"grad_norm": 0.024940941482782364,
"learning_rate": 0.0002,
"loss": 0.5839154124259949,
"mean_token_accuracy": 0.7644830942153931,
"num_tokens": 10851502.0,
"step": 145
},
{
"entropy": 0.5679881721735001,
"epoch": 3.65,
"grad_norm": 0.02257210575044155,
"learning_rate": 0.0002,
"loss": 0.5671518445014954,
"mean_token_accuracy": 0.7707392424345016,
"num_tokens": 10926491.0,
"step": 146
},
{
"entropy": 0.5873086154460907,
"epoch": 3.675,
"grad_norm": 0.024546999484300613,
"learning_rate": 0.0002,
"loss": 0.5834171772003174,
"mean_token_accuracy": 0.764349952340126,
"num_tokens": 11000892.0,
"step": 147
},
{
"entropy": 0.573735237121582,
"epoch": 3.7,
"grad_norm": 0.02570403181016445,
"learning_rate": 0.0002,
"loss": 0.5722212195396423,
"mean_token_accuracy": 0.7698929309844971,
"num_tokens": 11075461.0,
"step": 148
},
{
"entropy": 0.5838541835546494,
"epoch": 3.725,
"grad_norm": 0.021784571930766106,
"learning_rate": 0.0002,
"loss": 0.5852305889129639,
"mean_token_accuracy": 0.7646767646074295,
"num_tokens": 11149906.0,
"step": 149
},
{
"entropy": 0.5819535553455353,
"epoch": 3.75,
"grad_norm": 0.023919865489006042,
"learning_rate": 0.0002,
"loss": 0.5841096639633179,
"mean_token_accuracy": 0.7660493403673172,
"num_tokens": 11224427.0,
"step": 150
},
{
"entropy": 0.5729077309370041,
"epoch": 3.775,
"grad_norm": 0.019240032881498337,
"learning_rate": 0.0002,
"loss": 0.5747278332710266,
"mean_token_accuracy": 0.7695352137088776,
"num_tokens": 11299373.0,
"step": 151
},
{
"entropy": 0.5640138536691666,
"epoch": 3.8,
"grad_norm": 0.022750195115804672,
"learning_rate": 0.0002,
"loss": 0.5636758804321289,
"mean_token_accuracy": 0.7727017253637314,
"num_tokens": 11373929.0,
"step": 152
},
{
"entropy": 0.5785274505615234,
"epoch": 3.825,
"grad_norm": 0.024555128067731857,
"learning_rate": 0.0002,
"loss": 0.577983021736145,
"mean_token_accuracy": 0.7675963938236237,
"num_tokens": 11448658.0,
"step": 153
},
{
"entropy": 0.5797367095947266,
"epoch": 3.85,
"grad_norm": 0.02360512688755989,
"learning_rate": 0.0002,
"loss": 0.5788124799728394,
"mean_token_accuracy": 0.7675672024488449,
"num_tokens": 11522296.0,
"step": 154
},
{
"entropy": 0.5766919553279877,
"epoch": 3.875,
"grad_norm": 0.020860835909843445,
"learning_rate": 0.0002,
"loss": 0.5760090351104736,
"mean_token_accuracy": 0.7677243202924728,
"num_tokens": 11596689.0,
"step": 155
},
{
"entropy": 0.5780852437019348,
"epoch": 3.9,
"grad_norm": 0.021970726549625397,
"learning_rate": 0.0002,
"loss": 0.5821795463562012,
"mean_token_accuracy": 0.765151247382164,
"num_tokens": 11670420.0,
"step": 156
},
{
"entropy": 0.5896212756633759,
"epoch": 3.925,
"grad_norm": 0.025580603629350662,
"learning_rate": 0.0002,
"loss": 0.5879545211791992,
"mean_token_accuracy": 0.7629344761371613,
"num_tokens": 11746549.0,
"step": 157
},
{
"entropy": 0.5752788335084915,
"epoch": 3.95,
"grad_norm": 0.02031378634274006,
"learning_rate": 0.0002,
"loss": 0.5733282566070557,
"mean_token_accuracy": 0.768017366528511,
"num_tokens": 11822165.0,
"step": 158
},
{
"entropy": 0.5673830062150955,
"epoch": 3.975,
"grad_norm": 0.023106930777430534,
"learning_rate": 0.0002,
"loss": 0.5672657489776611,
"mean_token_accuracy": 0.7714889943599701,
"num_tokens": 11897985.0,
"step": 159
},
{
"entropy": 0.5763387382030487,
"epoch": 4.0,
"grad_norm": 0.02034103125333786,
"learning_rate": 0.0002,
"loss": 0.5772510766983032,
"mean_token_accuracy": 0.7671704441308975,
"num_tokens": 11972903.0,
"step": 160
},
{
"entropy": 0.5695903152227402,
"epoch": 4.025,
"grad_norm": 0.021100850775837898,
"learning_rate": 0.0002,
"loss": 0.5693217515945435,
"mean_token_accuracy": 0.7699502855539322,
"num_tokens": 12047693.0,
"step": 161
},
{
"entropy": 0.573061928153038,
"epoch": 4.05,
"grad_norm": 0.021061765030026436,
"learning_rate": 0.0002,
"loss": 0.57159024477005,
"mean_token_accuracy": 0.7694694995880127,
"num_tokens": 12122067.0,
"step": 162
},
{
"entropy": 0.5698549449443817,
"epoch": 4.075,
"grad_norm": 0.025176256895065308,
"learning_rate": 0.0002,
"loss": 0.5731710195541382,
"mean_token_accuracy": 0.7683273702859879,
"num_tokens": 12196269.0,
"step": 163
},
{
"entropy": 0.5690735876560211,
"epoch": 4.1,
"grad_norm": 0.02089373581111431,
"learning_rate": 0.0002,
"loss": 0.5714013576507568,
"mean_token_accuracy": 0.7697489559650421,
"num_tokens": 12270853.0,
"step": 164
},
{
"entropy": 0.5816214233636856,
"epoch": 4.125,
"grad_norm": 0.02240598015487194,
"learning_rate": 0.0002,
"loss": 0.5786125063896179,
"mean_token_accuracy": 0.7670575529336929,
"num_tokens": 12345733.0,
"step": 165
},
{
"entropy": 0.5852687507867813,
"epoch": 4.15,
"grad_norm": 0.023174043744802475,
"learning_rate": 0.0002,
"loss": 0.58061683177948,
"mean_token_accuracy": 0.7636701017618179,
"num_tokens": 12419829.0,
"step": 166
},
{
"entropy": 0.571011945605278,
"epoch": 4.175,
"grad_norm": 0.022563502192497253,
"learning_rate": 0.0002,
"loss": 0.5689563751220703,
"mean_token_accuracy": 0.7716156244277954,
"num_tokens": 12494264.0,
"step": 167
},
{
"entropy": 0.5736175775527954,
"epoch": 4.2,
"grad_norm": 0.02212107926607132,
"learning_rate": 0.0002,
"loss": 0.5761622190475464,
"mean_token_accuracy": 0.7677205204963684,
"num_tokens": 12569157.0,
"step": 168
},
{
"entropy": 0.5656485557556152,
"epoch": 4.225,
"grad_norm": 0.02473953552544117,
"learning_rate": 0.0002,
"loss": 0.5689172744750977,
"mean_token_accuracy": 0.771388441324234,
"num_tokens": 12644204.0,
"step": 169
},
{
"entropy": 0.5819400250911713,
"epoch": 4.25,
"grad_norm": 0.024174660444259644,
"learning_rate": 0.0002,
"loss": 0.5831934213638306,
"mean_token_accuracy": 0.7642627954483032,
"num_tokens": 12719199.0,
"step": 170
},
{
"entropy": 0.5756211876869202,
"epoch": 4.275,
"grad_norm": 0.019957805052399635,
"learning_rate": 0.0002,
"loss": 0.5719864964485168,
"mean_token_accuracy": 0.7682519555091858,
"num_tokens": 12793696.0,
"step": 171
},
{
"entropy": 0.5758868604898453,
"epoch": 4.3,
"grad_norm": 0.02505411207675934,
"learning_rate": 0.0002,
"loss": 0.5705811977386475,
"mean_token_accuracy": 0.7688630670309067,
"num_tokens": 12868020.0,
"step": 172
},
{
"entropy": 0.576561376452446,
"epoch": 4.325,
"grad_norm": 0.02111932635307312,
"learning_rate": 0.0002,
"loss": 0.5742412805557251,
"mean_token_accuracy": 0.7686543017625809,
"num_tokens": 12943124.0,
"step": 173
},
{
"entropy": 0.5648486465215683,
"epoch": 4.35,
"grad_norm": 0.024696264415979385,
"learning_rate": 0.0002,
"loss": 0.5707447528839111,
"mean_token_accuracy": 0.7711144238710403,
"num_tokens": 13018094.0,
"step": 174
},
{
"entropy": 0.550954133272171,
"epoch": 4.375,
"grad_norm": 0.021990923210978508,
"learning_rate": 0.0002,
"loss": 0.5539791584014893,
"mean_token_accuracy": 0.7755307257175446,
"num_tokens": 13093256.0,
"step": 175
},
{
"entropy": 0.571951225399971,
"epoch": 4.4,
"grad_norm": 0.021349789574742317,
"learning_rate": 0.0002,
"loss": 0.5731101036071777,
"mean_token_accuracy": 0.7676120102405548,
"num_tokens": 13167474.0,
"step": 176
},
{
"entropy": 0.5779466480016708,
"epoch": 4.425,
"grad_norm": 0.02244136668741703,
"learning_rate": 0.0002,
"loss": 0.5748851895332336,
"mean_token_accuracy": 0.7674881815910339,
"num_tokens": 13243109.0,
"step": 177
},
{
"entropy": 0.570502832531929,
"epoch": 4.45,
"grad_norm": 0.021098149940371513,
"learning_rate": 0.0002,
"loss": 0.5683890581130981,
"mean_token_accuracy": 0.7724465280771255,
"num_tokens": 13317893.0,
"step": 178
},
{
"entropy": 0.5695969015359879,
"epoch": 4.475,
"grad_norm": 0.02162528783082962,
"learning_rate": 0.0002,
"loss": 0.5705981254577637,
"mean_token_accuracy": 0.768608570098877,
"num_tokens": 13392379.0,
"step": 179
},
{
"entropy": 0.5717011392116547,
"epoch": 4.5,
"grad_norm": 0.0223979689180851,
"learning_rate": 0.0002,
"loss": 0.5761755108833313,
"mean_token_accuracy": 0.7661348432302475,
"num_tokens": 13467500.0,
"step": 180
},
{
"entropy": 0.5750777423381805,
"epoch": 4.525,
"grad_norm": 0.0206700898706913,
"learning_rate": 0.0002,
"loss": 0.5748052597045898,
"mean_token_accuracy": 0.7673143148422241,
"num_tokens": 13542205.0,
"step": 181
},
{
"entropy": 0.5675535202026367,
"epoch": 4.55,
"grad_norm": 0.021973995491862297,
"learning_rate": 0.0002,
"loss": 0.563953697681427,
"mean_token_accuracy": 0.7722303122282028,
"num_tokens": 13616628.0,
"step": 182
},
{
"entropy": 0.5718803107738495,
"epoch": 4.575,
"grad_norm": 0.021145911887288094,
"learning_rate": 0.0002,
"loss": 0.5688813924789429,
"mean_token_accuracy": 0.7704852521419525,
"num_tokens": 13691940.0,
"step": 183
},
{
"entropy": 0.5774464905261993,
"epoch": 4.6,
"grad_norm": 0.021537618711590767,
"learning_rate": 0.0002,
"loss": 0.5771785974502563,
"mean_token_accuracy": 0.7655858248472214,
"num_tokens": 13767043.0,
"step": 184
},
{
"entropy": 0.570703387260437,
"epoch": 4.625,
"grad_norm": 0.02538282983005047,
"learning_rate": 0.0002,
"loss": 0.5734485387802124,
"mean_token_accuracy": 0.7683311551809311,
"num_tokens": 13842107.0,
"step": 185
},
{
"entropy": 0.5709679424762726,
"epoch": 4.65,
"grad_norm": 0.024409880861639977,
"learning_rate": 0.0002,
"loss": 0.5723483562469482,
"mean_token_accuracy": 0.7690693438053131,
"num_tokens": 13916757.0,
"step": 186
},
{
"entropy": 0.5667127072811127,
"epoch": 4.675,
"grad_norm": 0.02431379444897175,
"learning_rate": 0.0002,
"loss": 0.5675520896911621,
"mean_token_accuracy": 0.7711669653654099,
"num_tokens": 13992850.0,
"step": 187
},
{
"entropy": 0.5858957320451736,
"epoch": 4.7,
"grad_norm": 0.02329982817173004,
"learning_rate": 0.0002,
"loss": 0.5802958011627197,
"mean_token_accuracy": 0.7663194984197617,
"num_tokens": 14068227.0,
"step": 188
},
{
"entropy": 0.5811503231525421,
"epoch": 4.725,
"grad_norm": 0.025335390120744705,
"learning_rate": 0.0002,
"loss": 0.5782935619354248,
"mean_token_accuracy": 0.7669466435909271,
"num_tokens": 14142904.0,
"step": 189
},
{
"entropy": 0.5707950592041016,
"epoch": 4.75,
"grad_norm": 0.02279096655547619,
"learning_rate": 0.0002,
"loss": 0.5738247632980347,
"mean_token_accuracy": 0.7678002119064331,
"num_tokens": 14217940.0,
"step": 190
},
{
"entropy": 0.5689068585634232,
"epoch": 4.775,
"grad_norm": 0.028375349938869476,
"learning_rate": 0.0002,
"loss": 0.5795305371284485,
"mean_token_accuracy": 0.7658824324607849,
"num_tokens": 14292463.0,
"step": 191
},
{
"entropy": 0.5779102444648743,
"epoch": 4.8,
"grad_norm": 0.019591832533478737,
"learning_rate": 0.0002,
"loss": 0.5775682330131531,
"mean_token_accuracy": 0.7678920924663544,
"num_tokens": 14367428.0,
"step": 192
},
{
"entropy": 0.5796025097370148,
"epoch": 4.825,
"grad_norm": 0.024824826046824455,
"learning_rate": 0.0002,
"loss": 0.5707208514213562,
"mean_token_accuracy": 0.7707486748695374,
"num_tokens": 14442526.0,
"step": 193
},
{
"entropy": 0.574284628033638,
"epoch": 4.85,
"grad_norm": 0.021157678216695786,
"learning_rate": 0.0002,
"loss": 0.5702036023139954,
"mean_token_accuracy": 0.7699166387319565,
"num_tokens": 14517364.0,
"step": 194
},
{
"entropy": 0.5729261040687561,
"epoch": 4.875,
"grad_norm": 0.025306105613708496,
"learning_rate": 0.0002,
"loss": 0.5750659704208374,
"mean_token_accuracy": 0.7673707902431488,
"num_tokens": 14592405.0,
"step": 195
},
{
"entropy": 0.5620183199644089,
"epoch": 4.9,
"grad_norm": 0.025408228859305382,
"learning_rate": 0.0002,
"loss": 0.5673909783363342,
"mean_token_accuracy": 0.7702877819538116,
"num_tokens": 14667053.0,
"step": 196
},
{
"entropy": 0.5667766779661179,
"epoch": 4.925,
"grad_norm": 0.024316171184182167,
"learning_rate": 0.0002,
"loss": 0.5681411027908325,
"mean_token_accuracy": 0.7717499136924744,
"num_tokens": 14742222.0,
"step": 197
},
{
"entropy": 0.5805934369564056,
"epoch": 4.95,
"grad_norm": 0.02220967784523964,
"learning_rate": 0.0002,
"loss": 0.5777135491371155,
"mean_token_accuracy": 0.7674207240343094,
"num_tokens": 14817012.0,
"step": 198
},
{
"entropy": 0.573724776506424,
"epoch": 4.975,
"grad_norm": 0.02526751719415188,
"learning_rate": 0.0002,
"loss": 0.5700376033782959,
"mean_token_accuracy": 0.7695773392915726,
"num_tokens": 14891805.0,
"step": 199
},
{
"entropy": 0.5778943598270416,
"epoch": 5.0,
"grad_norm": 0.021638575941324234,
"learning_rate": 0.0002,
"loss": 0.5796632766723633,
"mean_token_accuracy": 0.7665348052978516,
"num_tokens": 14966136.0,
"step": 200
},
{
"entropy": 0.5613097548484802,
"epoch": 5.025,
"grad_norm": 0.021904166787862778,
"learning_rate": 0.0002,
"loss": 0.5612062215805054,
"mean_token_accuracy": 0.7737284600734711,
"num_tokens": 15040385.0,
"step": 201
},
{
"entropy": 0.5623792111873627,
"epoch": 5.05,
"grad_norm": 0.02356012538075447,
"learning_rate": 0.0002,
"loss": 0.5624409317970276,
"mean_token_accuracy": 0.7724986523389816,
"num_tokens": 15115257.0,
"step": 202
},
{
"entropy": 0.5548672676086426,
"epoch": 5.075,
"grad_norm": 0.02421456202864647,
"learning_rate": 0.0002,
"loss": 0.5566623210906982,
"mean_token_accuracy": 0.7740187644958496,
"num_tokens": 15190279.0,
"step": 203
},
{
"entropy": 0.5734377503395081,
"epoch": 5.1,
"grad_norm": 0.027081554755568504,
"learning_rate": 0.0002,
"loss": 0.5723409652709961,
"mean_token_accuracy": 0.7683208882808685,
"num_tokens": 15265534.0,
"step": 204
},
{
"entropy": 0.5770121663808823,
"epoch": 5.125,
"grad_norm": 0.025843461975455284,
"learning_rate": 0.0002,
"loss": 0.5737386345863342,
"mean_token_accuracy": 0.768302395939827,
"num_tokens": 15340204.0,
"step": 205
},
{
"entropy": 0.5695698410272598,
"epoch": 5.15,
"grad_norm": 0.024712897837162018,
"learning_rate": 0.0002,
"loss": 0.566956639289856,
"mean_token_accuracy": 0.7712061703205109,
"num_tokens": 15415225.0,
"step": 206
},
{
"entropy": 0.5699747204780579,
"epoch": 5.175,
"grad_norm": 0.02740584686398506,
"learning_rate": 0.0002,
"loss": 0.5699794888496399,
"mean_token_accuracy": 0.7703876197338104,
"num_tokens": 15490117.0,
"step": 207
},
{
"entropy": 0.5615235567092896,
"epoch": 5.2,
"grad_norm": 0.02705363929271698,
"learning_rate": 0.0002,
"loss": 0.5620254278182983,
"mean_token_accuracy": 0.7728376239538193,
"num_tokens": 15564960.0,
"step": 208
},
{
"entropy": 0.571983590722084,
"epoch": 5.225,
"grad_norm": 0.02741997316479683,
"learning_rate": 0.0002,
"loss": 0.5753256678581238,
"mean_token_accuracy": 0.7678238153457642,
"num_tokens": 15640346.0,
"step": 209
},
{
"entropy": 0.5734784454107285,
"epoch": 5.25,
"grad_norm": 0.026802683249115944,
"learning_rate": 0.0002,
"loss": 0.573603630065918,
"mean_token_accuracy": 0.767236202955246,
"num_tokens": 15715092.0,
"step": 210
},
{
"entropy": 0.581208124756813,
"epoch": 5.275,
"grad_norm": 0.024372655898332596,
"learning_rate": 0.0002,
"loss": 0.5770745277404785,
"mean_token_accuracy": 0.766946941614151,
"num_tokens": 15790126.0,
"step": 211
},
{
"entropy": 0.5763002783060074,
"epoch": 5.3,
"grad_norm": 0.030634434893727303,
"learning_rate": 0.0002,
"loss": 0.5704483985900879,
"mean_token_accuracy": 0.7682492583990097,
"num_tokens": 15865049.0,
"step": 212
},
{
"entropy": 0.5761642754077911,
"epoch": 5.325,
"grad_norm": 0.02550283446907997,
"learning_rate": 0.0002,
"loss": 0.5767782926559448,
"mean_token_accuracy": 0.7672466337680817,
"num_tokens": 15939883.0,
"step": 213
},
{
"entropy": 0.5759230703115463,
"epoch": 5.35,
"grad_norm": 0.03148680552840233,
"learning_rate": 0.0002,
"loss": 0.5800034999847412,
"mean_token_accuracy": 0.7658649682998657,
"num_tokens": 16014297.0,
"step": 214
},
{
"entropy": 0.5732310563325882,
"epoch": 5.375,
"grad_norm": 0.03305201232433319,
"learning_rate": 0.0002,
"loss": 0.5733552575111389,
"mean_token_accuracy": 0.7683651447296143,
"num_tokens": 16089131.0,
"step": 215
},
{
"entropy": 0.5785533636808395,
"epoch": 5.4,
"grad_norm": 0.024717051535844803,
"learning_rate": 0.0002,
"loss": 0.5784015655517578,
"mean_token_accuracy": 0.7660104632377625,
"num_tokens": 16164324.0,
"step": 216
},
{
"entropy": 0.5687698572874069,
"epoch": 5.425,
"grad_norm": 0.029457444325089455,
"learning_rate": 0.0002,
"loss": 0.5750937461853027,
"mean_token_accuracy": 0.7675990760326385,
"num_tokens": 16239760.0,
"step": 217
},
{
"entropy": 0.5556502044200897,
"epoch": 5.45,
"grad_norm": 0.02100587822496891,
"learning_rate": 0.0002,
"loss": 0.555939793586731,
"mean_token_accuracy": 0.7752721607685089,
"num_tokens": 16313638.0,
"step": 218
},
{
"entropy": 0.5768693834543228,
"epoch": 5.475,
"grad_norm": 0.02610902115702629,
"learning_rate": 0.0002,
"loss": 0.5738801956176758,
"mean_token_accuracy": 0.7672755122184753,
"num_tokens": 16388910.0,
"step": 219
},
{
"entropy": 0.5702448487281799,
"epoch": 5.5,
"grad_norm": 0.023769576102495193,
"learning_rate": 0.0002,
"loss": 0.5661875009536743,
"mean_token_accuracy": 0.771107405424118,
"num_tokens": 16464093.0,
"step": 220
},
{
"entropy": 0.564674437046051,
"epoch": 5.525,
"grad_norm": 0.031206723302602768,
"learning_rate": 0.0002,
"loss": 0.5673574805259705,
"mean_token_accuracy": 0.7696384638547897,
"num_tokens": 16538659.0,
"step": 221
},
{
"entropy": 0.5666674822568893,
"epoch": 5.55,
"grad_norm": 0.028113245964050293,
"learning_rate": 0.0002,
"loss": 0.5711595416069031,
"mean_token_accuracy": 0.770385280251503,
"num_tokens": 16612944.0,
"step": 222
},
{
"entropy": 0.5764763206243515,
"epoch": 5.575,
"grad_norm": 0.02818591520190239,
"learning_rate": 0.0002,
"loss": 0.5757144689559937,
"mean_token_accuracy": 0.7672824114561081,
"num_tokens": 16688054.0,
"step": 223
},
{
"entropy": 0.5728043168783188,
"epoch": 5.6,
"grad_norm": 0.026192322373390198,
"learning_rate": 0.0002,
"loss": 0.5717101693153381,
"mean_token_accuracy": 0.768505647778511,
"num_tokens": 16763085.0,
"step": 224
},
{
"entropy": 0.5653840452432632,
"epoch": 5.625,
"grad_norm": 0.02572912909090519,
"learning_rate": 0.0002,
"loss": 0.5642470717430115,
"mean_token_accuracy": 0.7718316316604614,
"num_tokens": 16837841.0,
"step": 225
},
{
"entropy": 0.5668259114027023,
"epoch": 5.65,
"grad_norm": 0.025471486151218414,
"learning_rate": 0.0002,
"loss": 0.564730167388916,
"mean_token_accuracy": 0.771491751074791,
"num_tokens": 16912424.0,
"step": 226
},
{
"entropy": 0.5578023791313171,
"epoch": 5.675,
"grad_norm": 0.029479067772626877,
"learning_rate": 0.0002,
"loss": 0.5636255741119385,
"mean_token_accuracy": 0.7711348384618759,
"num_tokens": 16986843.0,
"step": 227
},
{
"entropy": 0.5757892429828644,
"epoch": 5.7,
"grad_norm": 0.026731031015515327,
"learning_rate": 0.0002,
"loss": 0.5748844742774963,
"mean_token_accuracy": 0.7673381417989731,
"num_tokens": 17061592.0,
"step": 228
},
{
"entropy": 0.5692086964845657,
"epoch": 5.725,
"grad_norm": 0.02727457694709301,
"learning_rate": 0.0002,
"loss": 0.5636672973632812,
"mean_token_accuracy": 0.7722706943750381,
"num_tokens": 17136842.0,
"step": 229
},
{
"entropy": 0.570942759513855,
"epoch": 5.75,
"grad_norm": 0.02676619589328766,
"learning_rate": 0.0002,
"loss": 0.5672682523727417,
"mean_token_accuracy": 0.7702435255050659,
"num_tokens": 17211443.0,
"step": 230
},
{
"entropy": 0.5606240034103394,
"epoch": 5.775,
"grad_norm": 0.028618959710001945,
"learning_rate": 0.0002,
"loss": 0.5684143304824829,
"mean_token_accuracy": 0.7702829986810684,
"num_tokens": 17286970.0,
"step": 231
},
{
"entropy": 0.569316640496254,
"epoch": 5.8,
"grad_norm": 0.027750151231884956,
"learning_rate": 0.0002,
"loss": 0.574034571647644,
"mean_token_accuracy": 0.7676869779825211,
"num_tokens": 17361772.0,
"step": 232
},
{
"entropy": 0.576723724603653,
"epoch": 5.825,
"grad_norm": 0.02459871955215931,
"learning_rate": 0.0002,
"loss": 0.5745028853416443,
"mean_token_accuracy": 0.7675033956766129,
"num_tokens": 17436257.0,
"step": 233
},
{
"entropy": 0.5697972923517227,
"epoch": 5.85,
"grad_norm": 0.02738168090581894,
"learning_rate": 0.0002,
"loss": 0.5621964931488037,
"mean_token_accuracy": 0.7732144594192505,
"num_tokens": 17510677.0,
"step": 234
},
{
"entropy": 0.5714251548051834,
"epoch": 5.875,
"grad_norm": 0.022376077249646187,
"learning_rate": 0.0002,
"loss": 0.5706051588058472,
"mean_token_accuracy": 0.7693936377763748,
"num_tokens": 17585616.0,
"step": 235
},
{
"entropy": 0.5623523741960526,
"epoch": 5.9,
"grad_norm": 0.029145779088139534,
"learning_rate": 0.0002,
"loss": 0.5659171342849731,
"mean_token_accuracy": 0.7712376862764359,
"num_tokens": 17660019.0,
"step": 236
},
{
"entropy": 0.5657843053340912,
"epoch": 5.925,
"grad_norm": 0.024399209767580032,
"learning_rate": 0.0002,
"loss": 0.5663224458694458,
"mean_token_accuracy": 0.7708786725997925,
"num_tokens": 17735131.0,
"step": 237
},
{
"entropy": 0.5680066645145416,
"epoch": 5.95,
"grad_norm": 0.027334652841091156,
"learning_rate": 0.0002,
"loss": 0.5661309957504272,
"mean_token_accuracy": 0.7711464017629623,
"num_tokens": 17809928.0,
"step": 238
},
{
"entropy": 0.5661123096942902,
"epoch": 5.975,
"grad_norm": 0.02591884881258011,
"learning_rate": 0.0002,
"loss": 0.5688766837120056,
"mean_token_accuracy": 0.7700952738523483,
"num_tokens": 17884292.0,
"step": 239
},
{
"entropy": 0.5738363265991211,
"epoch": 6.0,
"grad_norm": 0.023802831768989563,
"learning_rate": 0.0002,
"loss": 0.5767297744750977,
"mean_token_accuracy": 0.7682788968086243,
"num_tokens": 17959306.0,
"step": 240
},
{
"entropy": 0.5705175548791885,
"epoch": 6.025,
"grad_norm": 0.026808038353919983,
"learning_rate": 0.0002,
"loss": 0.5646485090255737,
"mean_token_accuracy": 0.771740049123764,
"num_tokens": 18033987.0,
"step": 241
},
{
"entropy": 0.568175345659256,
"epoch": 6.05,
"grad_norm": 0.026018792763352394,
"learning_rate": 0.0002,
"loss": 0.5654234290122986,
"mean_token_accuracy": 0.7715927213430405,
"num_tokens": 18108533.0,
"step": 242
},
{
"entropy": 0.5620162785053253,
"epoch": 6.075,
"grad_norm": 0.03238891437649727,
"learning_rate": 0.0002,
"loss": 0.5632866024971008,
"mean_token_accuracy": 0.7722341269254684,
"num_tokens": 18183188.0,
"step": 243
},
{
"entropy": 0.5663654953241348,
"epoch": 6.1,
"grad_norm": 0.04267890378832817,
"learning_rate": 0.0002,
"loss": 0.5707510709762573,
"mean_token_accuracy": 0.7683595418930054,
"num_tokens": 18257193.0,
"step": 244
},
{
"entropy": 0.5661468356847763,
"epoch": 6.125,
"grad_norm": 0.023024071007966995,
"learning_rate": 0.0002,
"loss": 0.5620009899139404,
"mean_token_accuracy": 0.7728223353624344,
"num_tokens": 18331475.0,
"step": 245
},
{
"entropy": 0.5726824253797531,
"epoch": 6.15,
"grad_norm": 0.03274550661444664,
"learning_rate": 0.0002,
"loss": 0.5699936151504517,
"mean_token_accuracy": 0.7683205753564835,
"num_tokens": 18406469.0,
"step": 246
},
{
"entropy": 0.553859069943428,
"epoch": 6.175,
"grad_norm": 0.025100160390138626,
"learning_rate": 0.0002,
"loss": 0.5552276968955994,
"mean_token_accuracy": 0.774128720164299,
"num_tokens": 18481392.0,
"step": 247
},
{
"entropy": 0.5592118203639984,
"epoch": 6.2,
"grad_norm": 0.030672013759613037,
"learning_rate": 0.0002,
"loss": 0.5638296604156494,
"mean_token_accuracy": 0.7705448269844055,
"num_tokens": 18557082.0,
"step": 248
},
{
"entropy": 0.5587449073791504,
"epoch": 6.225,
"grad_norm": 0.02617192640900612,
"learning_rate": 0.0002,
"loss": 0.5589370727539062,
"mean_token_accuracy": 0.7735968083143234,
"num_tokens": 18632396.0,
"step": 249
},
{
"entropy": 0.570429340004921,
"epoch": 6.25,
"grad_norm": 0.026497265323996544,
"learning_rate": 0.0002,
"loss": 0.5678025484085083,
"mean_token_accuracy": 0.7696103155612946,
"num_tokens": 18706761.0,
"step": 250
},
{
"entropy": 0.5758352428674698,
"epoch": 6.275,
"grad_norm": 0.03510003909468651,
"learning_rate": 0.0002,
"loss": 0.5702039003372192,
"mean_token_accuracy": 0.7697926163673401,
"num_tokens": 18781835.0,
"step": 251
},
{
"entropy": 0.5599015653133392,
"epoch": 6.3,
"grad_norm": 0.026413707062602043,
"learning_rate": 0.0002,
"loss": 0.5648400783538818,
"mean_token_accuracy": 0.7707231491804123,
"num_tokens": 18856780.0,
"step": 252
},
{
"entropy": 0.5588082820177078,
"epoch": 6.325,
"grad_norm": 0.03752964362502098,
"learning_rate": 0.0002,
"loss": 0.5637919902801514,
"mean_token_accuracy": 0.7720127999782562,
"num_tokens": 18931882.0,
"step": 253
},
{
"entropy": 0.5763219594955444,
"epoch": 6.35,
"grad_norm": 0.027257010340690613,
"learning_rate": 0.0002,
"loss": 0.5744665861129761,
"mean_token_accuracy": 0.7678072452545166,
"num_tokens": 19006654.0,
"step": 254
},
{
"entropy": 0.5638702213764191,
"epoch": 6.375,
"grad_norm": 0.03087831847369671,
"learning_rate": 0.0002,
"loss": 0.5590026378631592,
"mean_token_accuracy": 0.7745161801576614,
"num_tokens": 19081712.0,
"step": 255
},
{
"entropy": 0.5763344466686249,
"epoch": 6.4,
"grad_norm": 0.026007242500782013,
"learning_rate": 0.0002,
"loss": 0.5743853449821472,
"mean_token_accuracy": 0.767315685749054,
"num_tokens": 19156683.0,
"step": 256
},
{
"entropy": 0.5625879168510437,
"epoch": 6.425,
"grad_norm": 0.02871275693178177,
"learning_rate": 0.0002,
"loss": 0.5631023645401001,
"mean_token_accuracy": 0.7718234807252884,
"num_tokens": 19231669.0,
"step": 257
},
{
"entropy": 0.5742108523845673,
"epoch": 6.45,
"grad_norm": 0.029883647337555885,
"learning_rate": 0.0002,
"loss": 0.576926052570343,
"mean_token_accuracy": 0.7668623924255371,
"num_tokens": 19306142.0,
"step": 258
},
{
"entropy": 0.557953953742981,
"epoch": 6.475,
"grad_norm": 0.03357018902897835,
"learning_rate": 0.0002,
"loss": 0.5605831146240234,
"mean_token_accuracy": 0.7726317644119263,
"num_tokens": 19380351.0,
"step": 259
},
{
"entropy": 0.5633054375648499,
"epoch": 6.5,
"grad_norm": 0.028555380180478096,
"learning_rate": 0.0002,
"loss": 0.5600845813751221,
"mean_token_accuracy": 0.7734545171260834,
"num_tokens": 19454607.0,
"step": 260
},
{
"entropy": 0.5629763156175613,
"epoch": 6.525,
"grad_norm": 0.027474038302898407,
"learning_rate": 0.0002,
"loss": 0.5615619421005249,
"mean_token_accuracy": 0.7738883346319199,
"num_tokens": 19529836.0,
"step": 261
},
{
"entropy": 0.5723123848438263,
"epoch": 6.55,
"grad_norm": 0.030043484643101692,
"learning_rate": 0.0002,
"loss": 0.5767689943313599,
"mean_token_accuracy": 0.7650009542703629,
"num_tokens": 19604672.0,
"step": 262
},
{
"entropy": 0.5594469308853149,
"epoch": 6.575,
"grad_norm": 0.027517110109329224,
"learning_rate": 0.0002,
"loss": 0.5633723735809326,
"mean_token_accuracy": 0.7720958739519119,
"num_tokens": 19678879.0,
"step": 263
},
{
"entropy": 0.5676506012678146,
"epoch": 6.6,
"grad_norm": 0.03375779092311859,
"learning_rate": 0.0002,
"loss": 0.5649895668029785,
"mean_token_accuracy": 0.7705852091312408,
"num_tokens": 19753964.0,
"step": 264
},
{
"entropy": 0.5738198310136795,
"epoch": 6.625,
"grad_norm": 0.026767941191792488,
"learning_rate": 0.0002,
"loss": 0.5693171620368958,
"mean_token_accuracy": 0.7693505436182022,
"num_tokens": 19829009.0,
"step": 265
},
{
"entropy": 0.5671893358230591,
"epoch": 6.65,
"grad_norm": 0.033948201686143875,
"learning_rate": 0.0002,
"loss": 0.5679397583007812,
"mean_token_accuracy": 0.7691246271133423,
"num_tokens": 19904354.0,
"step": 266
},
{
"entropy": 0.5724634379148483,
"epoch": 6.675,
"grad_norm": 0.027929022908210754,
"learning_rate": 0.0002,
"loss": 0.5724775791168213,
"mean_token_accuracy": 0.7685766965150833,
"num_tokens": 19979485.0,
"step": 267
},
{
"entropy": 0.5530816316604614,
"epoch": 6.7,
"grad_norm": 0.02936733327805996,
"learning_rate": 0.0002,
"loss": 0.5522775053977966,
"mean_token_accuracy": 0.7771619409322739,
"num_tokens": 20055306.0,
"step": 268
},
{
"entropy": 0.5592961460351944,
"epoch": 6.725,
"grad_norm": 0.033846575766801834,
"learning_rate": 0.0002,
"loss": 0.5621505975723267,
"mean_token_accuracy": 0.7730831801891327,
"num_tokens": 20129856.0,
"step": 269
},
{
"entropy": 0.5651666820049286,
"epoch": 6.75,
"grad_norm": 0.025500988587737083,
"learning_rate": 0.0002,
"loss": 0.5668225288391113,
"mean_token_accuracy": 0.7706255167722702,
"num_tokens": 20204683.0,
"step": 270
},
{
"entropy": 0.5640353113412857,
"epoch": 6.775,
"grad_norm": 0.033350858837366104,
"learning_rate": 0.0002,
"loss": 0.5606875419616699,
"mean_token_accuracy": 0.7722483277320862,
"num_tokens": 20280039.0,
"step": 271
},
{
"entropy": 0.5725362002849579,
"epoch": 6.8,
"grad_norm": 0.03152982145547867,
"learning_rate": 0.0002,
"loss": 0.5739132165908813,
"mean_token_accuracy": 0.767605796456337,
"num_tokens": 20354788.0,
"step": 272
},
{
"entropy": 0.5656009018421173,
"epoch": 6.825,
"grad_norm": 0.03156192600727081,
"learning_rate": 0.0002,
"loss": 0.5675251483917236,
"mean_token_accuracy": 0.7705205380916595,
"num_tokens": 20429122.0,
"step": 273
},
{
"entropy": 0.5605581253767014,
"epoch": 6.85,
"grad_norm": 0.03891259804368019,
"learning_rate": 0.0002,
"loss": 0.5642046928405762,
"mean_token_accuracy": 0.7717309892177582,
"num_tokens": 20503457.0,
"step": 274
},
{
"entropy": 0.5743284076452255,
"epoch": 6.875,
"grad_norm": 0.026666074991226196,
"learning_rate": 0.0002,
"loss": 0.5709526538848877,
"mean_token_accuracy": 0.7691267430782318,
"num_tokens": 20578654.0,
"step": 275
},
{
"entropy": 0.5633113235235214,
"epoch": 6.9,
"grad_norm": 0.03862672671675682,
"learning_rate": 0.0002,
"loss": 0.5625099539756775,
"mean_token_accuracy": 0.7733415812253952,
"num_tokens": 20653724.0,
"step": 276
},
{
"entropy": 0.5552873611450195,
"epoch": 6.925,
"grad_norm": 0.02755405753850937,
"learning_rate": 0.0002,
"loss": 0.555654764175415,
"mean_token_accuracy": 0.7761149406433105,
"num_tokens": 20728186.0,
"step": 277
},
{
"entropy": 0.5622376352548599,
"epoch": 6.95,
"grad_norm": 0.038842860609292984,
"learning_rate": 0.0002,
"loss": 0.5644208192825317,
"mean_token_accuracy": 0.7724904865026474,
"num_tokens": 20803838.0,
"step": 278
},
{
"entropy": 0.5590371191501617,
"epoch": 6.975,
"grad_norm": 0.03130970522761345,
"learning_rate": 0.0002,
"loss": 0.5607836246490479,
"mean_token_accuracy": 0.7744860798120499,
"num_tokens": 20878758.0,
"step": 279
},
{
"entropy": 0.5657824128866196,
"epoch": 7.0,
"grad_norm": 0.03451741114258766,
"learning_rate": 0.0002,
"loss": 0.5687781572341919,
"mean_token_accuracy": 0.7700928151607513,
"num_tokens": 20952402.0,
"step": 280
},
{
"entropy": 0.5669686943292618,
"epoch": 7.025,
"grad_norm": 0.033809054642915726,
"learning_rate": 0.0002,
"loss": 0.555560827255249,
"mean_token_accuracy": 0.7754446268081665,
"num_tokens": 21027343.0,
"step": 281
},
{
"entropy": 0.5571761578321457,
"epoch": 7.05,
"grad_norm": 0.02909841760993004,
"learning_rate": 0.0002,
"loss": 0.5523079037666321,
"mean_token_accuracy": 0.7767569869756699,
"num_tokens": 21101355.0,
"step": 282
},
{
"entropy": 0.5467852652072906,
"epoch": 7.075,
"grad_norm": 0.03742019459605217,
"learning_rate": 0.0002,
"loss": 0.5528299808502197,
"mean_token_accuracy": 0.7753051668405533,
"num_tokens": 21175670.0,
"step": 283
},
{
"entropy": 0.5646944791078568,
"epoch": 7.1,
"grad_norm": 0.029561564326286316,
"learning_rate": 0.0002,
"loss": 0.5654648542404175,
"mean_token_accuracy": 0.7715721130371094,
"num_tokens": 21250590.0,
"step": 284
},
{
"entropy": 0.5694975554943085,
"epoch": 7.125,
"grad_norm": 0.043832119554281235,
"learning_rate": 0.0002,
"loss": 0.563732385635376,
"mean_token_accuracy": 0.771723747253418,
"num_tokens": 21325721.0,
"step": 285
},
{
"entropy": 0.5660099983215332,
"epoch": 7.15,
"grad_norm": 0.03258618339896202,
"learning_rate": 0.0002,
"loss": 0.5601391792297363,
"mean_token_accuracy": 0.7732871919870377,
"num_tokens": 21400578.0,
"step": 286
},
{
"entropy": 0.5517364293336868,
"epoch": 7.175,
"grad_norm": 0.04530012607574463,
"learning_rate": 0.0002,
"loss": 0.558700680732727,
"mean_token_accuracy": 0.7721482962369919,
"num_tokens": 21475582.0,
"step": 287
},
{
"entropy": 0.5505019277334213,
"epoch": 7.2,
"grad_norm": 0.035087864845991135,
"learning_rate": 0.0002,
"loss": 0.5531398057937622,
"mean_token_accuracy": 0.7760081589221954,
"num_tokens": 21550821.0,
"step": 288
},
{
"entropy": 0.559594452381134,
"epoch": 7.225,
"grad_norm": 0.034394703805446625,
"learning_rate": 0.0002,
"loss": 0.5592218637466431,
"mean_token_accuracy": 0.773568719625473,
"num_tokens": 21625873.0,
"step": 289
},
{
"entropy": 0.5724920481443405,
"epoch": 7.25,
"grad_norm": 0.033760059624910355,
"learning_rate": 0.0002,
"loss": 0.5668227672576904,
"mean_token_accuracy": 0.7702212035655975,
"num_tokens": 21700602.0,
"step": 290
},
{
"entropy": 0.56439508497715,
"epoch": 7.275,
"grad_norm": 0.03572908788919449,
"learning_rate": 0.0002,
"loss": 0.5633417367935181,
"mean_token_accuracy": 0.7723740786314011,
"num_tokens": 21774875.0,
"step": 291
},
{
"entropy": 0.548421323299408,
"epoch": 7.3,
"grad_norm": 0.04545460268855095,
"learning_rate": 0.0002,
"loss": 0.5489292144775391,
"mean_token_accuracy": 0.7780417054891586,
"num_tokens": 21849501.0,
"step": 292
},
{
"entropy": 0.5536051839590073,
"epoch": 7.325,
"grad_norm": 0.03099142387509346,
"learning_rate": 0.0002,
"loss": 0.5557237863540649,
"mean_token_accuracy": 0.7745026648044586,
"num_tokens": 21925199.0,
"step": 293
},
{
"entropy": 0.5548270344734192,
"epoch": 7.35,
"grad_norm": 0.04060740023851395,
"learning_rate": 0.0002,
"loss": 0.5544718503952026,
"mean_token_accuracy": 0.7760574668645859,
"num_tokens": 22000154.0,
"step": 294
},
{
"entropy": 0.5629658997058868,
"epoch": 7.375,
"grad_norm": 0.03493206575512886,
"learning_rate": 0.0002,
"loss": 0.557624101638794,
"mean_token_accuracy": 0.7730877846479416,
"num_tokens": 22074584.0,
"step": 295
},
{
"entropy": 0.5578918755054474,
"epoch": 7.4,
"grad_norm": 0.037077102810144424,
"learning_rate": 0.0002,
"loss": 0.5596894025802612,
"mean_token_accuracy": 0.7733047008514404,
"num_tokens": 22149636.0,
"step": 296
},
{
"entropy": 0.5517973154783249,
"epoch": 7.425,
"grad_norm": 0.03832925483584404,
"learning_rate": 0.0002,
"loss": 0.5568417906761169,
"mean_token_accuracy": 0.7738584578037262,
"num_tokens": 22224466.0,
"step": 297
},
{
"entropy": 0.5560635775327682,
"epoch": 7.45,
"grad_norm": 0.02942826971411705,
"learning_rate": 0.0002,
"loss": 0.5580307841300964,
"mean_token_accuracy": 0.7735044956207275,
"num_tokens": 22299483.0,
"step": 298
},
{
"entropy": 0.5802666395902634,
"epoch": 7.475,
"grad_norm": 0.038540106266736984,
"learning_rate": 0.0002,
"loss": 0.5760456323623657,
"mean_token_accuracy": 0.7662649601697922,
"num_tokens": 22374870.0,
"step": 299
},
{
"entropy": 0.5679387599229813,
"epoch": 7.5,
"grad_norm": 0.029141677543520927,
"learning_rate": 0.0002,
"loss": 0.564399778842926,
"mean_token_accuracy": 0.7710927873849869,
"num_tokens": 22449693.0,
"step": 300
},
{
"entropy": 0.5547898411750793,
"epoch": 7.525,
"grad_norm": 0.02980385534465313,
"learning_rate": 0.0002,
"loss": 0.5558938980102539,
"mean_token_accuracy": 0.7764756679534912,
"num_tokens": 22524828.0,
"step": 301
},
{
"entropy": 0.5566362589597702,
"epoch": 7.55,
"grad_norm": 0.036666952073574066,
"learning_rate": 0.0002,
"loss": 0.5618187785148621,
"mean_token_accuracy": 0.7723874747753143,
"num_tokens": 22599762.0,
"step": 302
},
{
"entropy": 0.5653972774744034,
"epoch": 7.575,
"grad_norm": 0.035354770720005035,
"learning_rate": 0.0002,
"loss": 0.5642956495285034,
"mean_token_accuracy": 0.7708256095647812,
"num_tokens": 22674686.0,
"step": 303
},
{
"entropy": 0.5595411062240601,
"epoch": 7.6,
"grad_norm": 0.03489721938967705,
"learning_rate": 0.0002,
"loss": 0.5577263832092285,
"mean_token_accuracy": 0.7742740511894226,
"num_tokens": 22749009.0,
"step": 304
},
{
"entropy": 0.5515602380037308,
"epoch": 7.625,
"grad_norm": 0.032466236501932144,
"learning_rate": 0.0002,
"loss": 0.5543727874755859,
"mean_token_accuracy": 0.7750429511070251,
"num_tokens": 22822984.0,
"step": 305
},
{
"entropy": 0.567086473107338,
"epoch": 7.65,
"grad_norm": 0.037166330963373184,
"learning_rate": 0.0002,
"loss": 0.5712989568710327,
"mean_token_accuracy": 0.768398255109787,
"num_tokens": 22897628.0,
"step": 306
},
{
"entropy": 0.570941150188446,
"epoch": 7.675,
"grad_norm": 0.03177010267972946,
"learning_rate": 0.0002,
"loss": 0.5668294429779053,
"mean_token_accuracy": 0.7703831493854523,
"num_tokens": 22972657.0,
"step": 307
},
{
"entropy": 0.5639981329441071,
"epoch": 7.7,
"grad_norm": 0.034557901322841644,
"learning_rate": 0.0002,
"loss": 0.5623838901519775,
"mean_token_accuracy": 0.7735652476549149,
"num_tokens": 23047874.0,
"step": 308
},
{
"entropy": 0.5679261088371277,
"epoch": 7.725,
"grad_norm": 0.028234630823135376,
"learning_rate": 0.0002,
"loss": 0.5686887502670288,
"mean_token_accuracy": 0.7698494493961334,
"num_tokens": 23122211.0,
"step": 309
},
{
"entropy": 0.5655084848403931,
"epoch": 7.75,
"grad_norm": 0.03173128515481949,
"learning_rate": 0.0002,
"loss": 0.5650713443756104,
"mean_token_accuracy": 0.7710306495428085,
"num_tokens": 23197631.0,
"step": 310
},
{
"entropy": 0.5513352751731873,
"epoch": 7.775,
"grad_norm": 0.030083199962973595,
"learning_rate": 0.0002,
"loss": 0.5536549687385559,
"mean_token_accuracy": 0.7755338102579117,
"num_tokens": 23272548.0,
"step": 311
},
{
"entropy": 0.5555614978075027,
"epoch": 7.8,
"grad_norm": 0.034044049680233,
"learning_rate": 0.0002,
"loss": 0.5587220788002014,
"mean_token_accuracy": 0.7734567075967789,
"num_tokens": 23347302.0,
"step": 312
},
{
"entropy": 0.5663987696170807,
"epoch": 7.825,
"grad_norm": 0.03760316222906113,
"learning_rate": 0.0002,
"loss": 0.5659680366516113,
"mean_token_accuracy": 0.7704599052667618,
"num_tokens": 23422273.0,
"step": 313
},
{
"entropy": 0.5548186749219894,
"epoch": 7.85,
"grad_norm": 0.03154882416129112,
"learning_rate": 0.0002,
"loss": 0.5523256659507751,
"mean_token_accuracy": 0.7759846299886703,
"num_tokens": 23497091.0,
"step": 314
},
{
"entropy": 0.5607025325298309,
"epoch": 7.875,
"grad_norm": 0.041530657559633255,
"learning_rate": 0.0002,
"loss": 0.5609087347984314,
"mean_token_accuracy": 0.772527739405632,
"num_tokens": 23571404.0,
"step": 315
},
{
"entropy": 0.549463763833046,
"epoch": 7.9,
"grad_norm": 0.030178798362612724,
"learning_rate": 0.0002,
"loss": 0.5497753620147705,
"mean_token_accuracy": 0.7761662304401398,
"num_tokens": 23646640.0,
"step": 316
},
{
"entropy": 0.5618870705366135,
"epoch": 7.925,
"grad_norm": 0.04211151972413063,
"learning_rate": 0.0002,
"loss": 0.5642685294151306,
"mean_token_accuracy": 0.7714849263429642,
"num_tokens": 23720939.0,
"step": 317
},
{
"entropy": 0.570905327796936,
"epoch": 7.95,
"grad_norm": 0.026979681104421616,
"learning_rate": 0.0002,
"loss": 0.569770336151123,
"mean_token_accuracy": 0.7690195441246033,
"num_tokens": 23795221.0,
"step": 318
},
{
"entropy": 0.5613491535186768,
"epoch": 7.975,
"grad_norm": 0.04255770891904831,
"learning_rate": 0.0002,
"loss": 0.559675931930542,
"mean_token_accuracy": 0.7733322232961655,
"num_tokens": 23871279.0,
"step": 319
},
{
"entropy": 0.5522212386131287,
"epoch": 8.0,
"grad_norm": 0.032483723014593124,
"learning_rate": 0.0002,
"loss": 0.5494213104248047,
"mean_token_accuracy": 0.777331531047821,
"num_tokens": 23945592.0,
"step": 320
},
{
"entropy": 0.5492678731679916,
"epoch": 8.025,
"grad_norm": 0.04212978109717369,
"learning_rate": 0.0002,
"loss": 0.5434910655021667,
"mean_token_accuracy": 0.7794834822416306,
"num_tokens": 24020279.0,
"step": 321
},
{
"entropy": 0.5525006651878357,
"epoch": 8.05,
"grad_norm": 0.04567183181643486,
"learning_rate": 0.0002,
"loss": 0.5581562519073486,
"mean_token_accuracy": 0.7742817401885986,
"num_tokens": 24095378.0,
"step": 322
},
{
"entropy": 0.5501823425292969,
"epoch": 8.075,
"grad_norm": 0.04195858910679817,
"learning_rate": 0.0002,
"loss": 0.5543116331100464,
"mean_token_accuracy": 0.7745549827814102,
"num_tokens": 24169761.0,
"step": 323
},
{
"entropy": 0.5598976612091064,
"epoch": 8.1,
"grad_norm": 0.041518036276102066,
"learning_rate": 0.0002,
"loss": 0.5529639720916748,
"mean_token_accuracy": 0.7760216742753983,
"num_tokens": 24244551.0,
"step": 324
},
{
"entropy": 0.5517577975988388,
"epoch": 8.125,
"grad_norm": 0.04003611207008362,
"learning_rate": 0.0002,
"loss": 0.547474205493927,
"mean_token_accuracy": 0.7773387134075165,
"num_tokens": 24319348.0,
"step": 325
},
{
"entropy": 0.5453294217586517,
"epoch": 8.15,
"grad_norm": 0.04722796007990837,
"learning_rate": 0.0002,
"loss": 0.5527043342590332,
"mean_token_accuracy": 0.7764124572277069,
"num_tokens": 24394923.0,
"step": 326
},
{
"entropy": 0.5439276248216629,
"epoch": 8.175,
"grad_norm": 0.050235629081726074,
"learning_rate": 0.0002,
"loss": 0.5492441654205322,
"mean_token_accuracy": 0.7766183167695999,
"num_tokens": 24469884.0,
"step": 327
},
{
"entropy": 0.5524363815784454,
"epoch": 8.2,
"grad_norm": 0.05216272920370102,
"learning_rate": 0.0002,
"loss": 0.5485007762908936,
"mean_token_accuracy": 0.7773555964231491,
"num_tokens": 24544406.0,
"step": 328
},
{
"entropy": 0.5342830568552017,
"epoch": 8.225,
"grad_norm": 0.03883667290210724,
"learning_rate": 0.0002,
"loss": 0.5314654111862183,
"mean_token_accuracy": 0.7834321856498718,
"num_tokens": 24618651.0,
"step": 329
},
{
"entropy": 0.5648667514324188,
"epoch": 8.25,
"grad_norm": 0.05192190781235695,
"learning_rate": 0.0002,
"loss": 0.5654504299163818,
"mean_token_accuracy": 0.7713068872690201,
"num_tokens": 24692797.0,
"step": 330
},
{
"entropy": 0.5633829087018967,
"epoch": 8.275,
"grad_norm": 0.061627499759197235,
"learning_rate": 0.0002,
"loss": 0.5625565052032471,
"mean_token_accuracy": 0.7720554023981094,
"num_tokens": 24767683.0,
"step": 331
},
{
"entropy": 0.5499114990234375,
"epoch": 8.3,
"grad_norm": 0.04107741639018059,
"learning_rate": 0.0002,
"loss": 0.548437237739563,
"mean_token_accuracy": 0.7773527503013611,
"num_tokens": 24843880.0,
"step": 332
},
{
"entropy": 0.5482420921325684,
"epoch": 8.325,
"grad_norm": 0.05437928065657616,
"learning_rate": 0.0002,
"loss": 0.5508846044540405,
"mean_token_accuracy": 0.776692345738411,
"num_tokens": 24918492.0,
"step": 333
},
{
"entropy": 0.5507365763187408,
"epoch": 8.35,
"grad_norm": 0.038063954561948776,
"learning_rate": 0.0002,
"loss": 0.5517404675483704,
"mean_token_accuracy": 0.7754537016153336,
"num_tokens": 24993390.0,
"step": 334
},
{
"entropy": 0.5636460483074188,
"epoch": 8.375,
"grad_norm": 0.044943079352378845,
"learning_rate": 0.0002,
"loss": 0.5621282458305359,
"mean_token_accuracy": 0.7718300223350525,
"num_tokens": 25067643.0,
"step": 335
},
{
"entropy": 0.5502006560564041,
"epoch": 8.4,
"grad_norm": 0.038005705922842026,
"learning_rate": 0.0002,
"loss": 0.5524753332138062,
"mean_token_accuracy": 0.7749505192041397,
"num_tokens": 25142656.0,
"step": 336
},
{
"entropy": 0.5396933555603027,
"epoch": 8.425,
"grad_norm": 0.04691820219159126,
"learning_rate": 0.0002,
"loss": 0.5444018840789795,
"mean_token_accuracy": 0.779260128736496,
"num_tokens": 25217599.0,
"step": 337
},
{
"entropy": 0.5635709166526794,
"epoch": 8.45,
"grad_norm": 0.046322260051965714,
"learning_rate": 0.0002,
"loss": 0.5574597120285034,
"mean_token_accuracy": 0.774439737200737,
"num_tokens": 25293321.0,
"step": 338
},
{
"entropy": 0.546722874045372,
"epoch": 8.475,
"grad_norm": 0.043173372745513916,
"learning_rate": 0.0002,
"loss": 0.5474981665611267,
"mean_token_accuracy": 0.7775770723819733,
"num_tokens": 25367884.0,
"step": 339
},
{
"entropy": 0.5400292277336121,
"epoch": 8.5,
"grad_norm": 0.05466064065694809,
"learning_rate": 0.0002,
"loss": 0.546923041343689,
"mean_token_accuracy": 0.7782380133867264,
"num_tokens": 25443693.0,
"step": 340
},
{
"entropy": 0.5547144860029221,
"epoch": 8.525,
"grad_norm": 0.03708970546722412,
"learning_rate": 0.0002,
"loss": 0.5485285520553589,
"mean_token_accuracy": 0.777634859085083,
"num_tokens": 25519449.0,
"step": 341
},
{
"entropy": 0.5555647015571594,
"epoch": 8.55,
"grad_norm": 0.044979583472013474,
"learning_rate": 0.0002,
"loss": 0.5532581806182861,
"mean_token_accuracy": 0.7749562114477158,
"num_tokens": 25594565.0,
"step": 342
},
{
"entropy": 0.5622061938047409,
"epoch": 8.575,
"grad_norm": 0.037068452686071396,
"learning_rate": 0.0002,
"loss": 0.5619022846221924,
"mean_token_accuracy": 0.7722371071577072,
"num_tokens": 25669210.0,
"step": 343
},
{
"entropy": 0.5621164441108704,
"epoch": 8.6,
"grad_norm": 0.04099290445446968,
"learning_rate": 0.0002,
"loss": 0.5597378015518188,
"mean_token_accuracy": 0.7729932218790054,
"num_tokens": 25743928.0,
"step": 344
},
{
"entropy": 0.5559934675693512,
"epoch": 8.625,
"grad_norm": 0.034955333918333054,
"learning_rate": 0.0002,
"loss": 0.5548402070999146,
"mean_token_accuracy": 0.774565801024437,
"num_tokens": 25819235.0,
"step": 345
},
{
"entropy": 0.5608330816030502,
"epoch": 8.65,
"grad_norm": 0.032942138612270355,
"learning_rate": 0.0002,
"loss": 0.5593307614326477,
"mean_token_accuracy": 0.7738174945116043,
"num_tokens": 25894146.0,
"step": 346
},
{
"entropy": 0.5570491552352905,
"epoch": 8.675,
"grad_norm": 0.037585385143756866,
"learning_rate": 0.0002,
"loss": 0.5622788071632385,
"mean_token_accuracy": 0.7726104408502579,
"num_tokens": 25968950.0,
"step": 347
},
{
"entropy": 0.5602799952030182,
"epoch": 8.7,
"grad_norm": 0.036275461316108704,
"learning_rate": 0.0002,
"loss": 0.5584805011749268,
"mean_token_accuracy": 0.7735689133405685,
"num_tokens": 26044155.0,
"step": 348
},
{
"entropy": 0.5549326539039612,
"epoch": 8.725,
"grad_norm": 0.03921646997332573,
"learning_rate": 0.0002,
"loss": 0.5547788739204407,
"mean_token_accuracy": 0.774893268942833,
"num_tokens": 26118350.0,
"step": 349
},
{
"entropy": 0.562700167298317,
"epoch": 8.75,
"grad_norm": 0.037997711449861526,
"learning_rate": 0.0002,
"loss": 0.5609459280967712,
"mean_token_accuracy": 0.7732319533824921,
"num_tokens": 26193288.0,
"step": 350
},
{
"entropy": 0.5581104457378387,
"epoch": 8.775,
"grad_norm": 0.03644339367747307,
"learning_rate": 0.0002,
"loss": 0.5592728853225708,
"mean_token_accuracy": 0.772967129945755,
"num_tokens": 26267183.0,
"step": 351
},
{
"entropy": 0.5485663414001465,
"epoch": 8.8,
"grad_norm": 0.03490961715579033,
"learning_rate": 0.0002,
"loss": 0.5506186485290527,
"mean_token_accuracy": 0.7772383987903595,
"num_tokens": 26342404.0,
"step": 352
},
{
"entropy": 0.5463830828666687,
"epoch": 8.825,
"grad_norm": 0.03406834974884987,
"learning_rate": 0.0002,
"loss": 0.543938398361206,
"mean_token_accuracy": 0.7797689437866211,
"num_tokens": 26416380.0,
"step": 353
},
{
"entropy": 0.5513378083705902,
"epoch": 8.85,
"grad_norm": 0.03450295329093933,
"learning_rate": 0.0002,
"loss": 0.5490238070487976,
"mean_token_accuracy": 0.7770363390445709,
"num_tokens": 26490636.0,
"step": 354
},
{
"entropy": 0.5578331649303436,
"epoch": 8.875,
"grad_norm": 0.03415544703602791,
"learning_rate": 0.0002,
"loss": 0.5596639513969421,
"mean_token_accuracy": 0.7740722298622131,
"num_tokens": 26564954.0,
"step": 355
},
{
"entropy": 0.5540540665388107,
"epoch": 8.9,
"grad_norm": 0.03938233479857445,
"learning_rate": 0.0002,
"loss": 0.5562814474105835,
"mean_token_accuracy": 0.7746336907148361,
"num_tokens": 26639707.0,
"step": 356
},
{
"entropy": 0.5420689284801483,
"epoch": 8.925,
"grad_norm": 0.04445737600326538,
"learning_rate": 0.0002,
"loss": 0.5478014945983887,
"mean_token_accuracy": 0.7780271470546722,
"num_tokens": 26713693.0,
"step": 357
},
{
"entropy": 0.55097496509552,
"epoch": 8.95,
"grad_norm": 0.03611644357442856,
"learning_rate": 0.0002,
"loss": 0.5501728057861328,
"mean_token_accuracy": 0.7780314683914185,
"num_tokens": 26788869.0,
"step": 358
},
{
"entropy": 0.5535710901021957,
"epoch": 8.975,
"grad_norm": 0.03289943188428879,
"learning_rate": 0.0002,
"loss": 0.5510231852531433,
"mean_token_accuracy": 0.7763938903808594,
"num_tokens": 26863133.0,
"step": 359
},
{
"entropy": 0.5588638633489609,
"epoch": 9.0,
"grad_norm": 0.03923680633306503,
"learning_rate": 0.0002,
"loss": 0.5572277307510376,
"mean_token_accuracy": 0.7751377373933792,
"num_tokens": 26938828.0,
"step": 360
},
{
"entropy": 0.5573904514312744,
"epoch": 9.025,
"grad_norm": 0.042408641427755356,
"learning_rate": 0.0002,
"loss": 0.546970009803772,
"mean_token_accuracy": 0.7769688963890076,
"num_tokens": 27013695.0,
"step": 361
},
{
"entropy": 0.5463464558124542,
"epoch": 9.05,
"grad_norm": 0.04984664544463158,
"learning_rate": 0.0002,
"loss": 0.5488175749778748,
"mean_token_accuracy": 0.7771357446908951,
"num_tokens": 27088413.0,
"step": 362
},
{
"entropy": 0.5289105176925659,
"epoch": 9.075,
"grad_norm": 0.04879127815365791,
"learning_rate": 0.0002,
"loss": 0.5313310623168945,
"mean_token_accuracy": 0.7846623361110687,
"num_tokens": 27162877.0,
"step": 363
},
{
"entropy": 0.5486065149307251,
"epoch": 9.1,
"grad_norm": 0.05812316760420799,
"learning_rate": 0.0002,
"loss": 0.5521052479743958,
"mean_token_accuracy": 0.7759816944599152,
"num_tokens": 27236796.0,
"step": 364
},
{
"entropy": 0.5488688349723816,
"epoch": 9.125,
"grad_norm": 0.048603836447000504,
"learning_rate": 0.0002,
"loss": 0.5465872287750244,
"mean_token_accuracy": 0.7791540026664734,
"num_tokens": 27311937.0,
"step": 365
},
{
"entropy": 0.5352853387594223,
"epoch": 9.15,
"grad_norm": 0.0544096976518631,
"learning_rate": 0.0002,
"loss": 0.5347145795822144,
"mean_token_accuracy": 0.7831054776906967,
"num_tokens": 27387311.0,
"step": 366
},
{
"entropy": 0.5446972846984863,
"epoch": 9.175,
"grad_norm": 0.07536739856004715,
"learning_rate": 0.0002,
"loss": 0.5502406358718872,
"mean_token_accuracy": 0.7761110365390778,
"num_tokens": 27462732.0,
"step": 367
},
{
"entropy": 0.5384257137775421,
"epoch": 9.2,
"grad_norm": 0.07809668034315109,
"learning_rate": 0.0002,
"loss": 0.5337420701980591,
"mean_token_accuracy": 0.7827627509832382,
"num_tokens": 27537612.0,
"step": 368
},
{
"entropy": 0.5450446158647537,
"epoch": 9.225,
"grad_norm": 0.05390315130352974,
"learning_rate": 0.0002,
"loss": 0.541022777557373,
"mean_token_accuracy": 0.7800815850496292,
"num_tokens": 27612425.0,
"step": 369
},
{
"entropy": 0.5376718789339066,
"epoch": 9.25,
"grad_norm": 0.050644826143980026,
"learning_rate": 0.0002,
"loss": 0.5411415100097656,
"mean_token_accuracy": 0.7793814241886139,
"num_tokens": 27686664.0,
"step": 370
},
{
"entropy": 0.5415088385343552,
"epoch": 9.275,
"grad_norm": 0.07354080677032471,
"learning_rate": 0.0002,
"loss": 0.5452766418457031,
"mean_token_accuracy": 0.7790811359882355,
"num_tokens": 27762025.0,
"step": 371
},
{
"entropy": 0.5448242127895355,
"epoch": 9.3,
"grad_norm": 0.0593232586979866,
"learning_rate": 0.0002,
"loss": 0.5438794493675232,
"mean_token_accuracy": 0.7780045717954636,
"num_tokens": 27835553.0,
"step": 372
},
{
"entropy": 0.5405709743499756,
"epoch": 9.325,
"grad_norm": 0.05473851040005684,
"learning_rate": 0.0002,
"loss": 0.5379279851913452,
"mean_token_accuracy": 0.782084509730339,
"num_tokens": 27910916.0,
"step": 373
},
{
"entropy": 0.5353046655654907,
"epoch": 9.35,
"grad_norm": 0.07823872566223145,
"learning_rate": 0.0002,
"loss": 0.5367236733436584,
"mean_token_accuracy": 0.7828928083181381,
"num_tokens": 27986761.0,
"step": 374
},
{
"entropy": 0.5484424829483032,
"epoch": 9.375,
"grad_norm": 0.09651726484298706,
"learning_rate": 0.0002,
"loss": 0.5527921915054321,
"mean_token_accuracy": 0.7761628329753876,
"num_tokens": 28061459.0,
"step": 375
},
{
"entropy": 0.551795557141304,
"epoch": 9.4,
"grad_norm": 0.04663221165537834,
"learning_rate": 0.0002,
"loss": 0.5416221022605896,
"mean_token_accuracy": 0.7804577797651291,
"num_tokens": 28136180.0,
"step": 376
},
{
"entropy": 0.5437692701816559,
"epoch": 9.425,
"grad_norm": 0.060796961188316345,
"learning_rate": 0.0002,
"loss": 0.5428634881973267,
"mean_token_accuracy": 0.7803646326065063,
"num_tokens": 28210743.0,
"step": 377
},
{
"entropy": 0.5389476418495178,
"epoch": 9.45,
"grad_norm": 0.06818708777427673,
"learning_rate": 0.0002,
"loss": 0.542535662651062,
"mean_token_accuracy": 0.779965728521347,
"num_tokens": 28285511.0,
"step": 378
},
{
"entropy": 0.5542739927768707,
"epoch": 9.475,
"grad_norm": 0.040479619055986404,
"learning_rate": 0.0002,
"loss": 0.5535589456558228,
"mean_token_accuracy": 0.776050016283989,
"num_tokens": 28361419.0,
"step": 379
},
{
"entropy": 0.544225737452507,
"epoch": 9.5,
"grad_norm": 0.061609551310539246,
"learning_rate": 0.0002,
"loss": 0.5452514290809631,
"mean_token_accuracy": 0.7790030539035797,
"num_tokens": 28436492.0,
"step": 380
},
{
"entropy": 0.546901598572731,
"epoch": 9.525,
"grad_norm": 0.0580863393843174,
"learning_rate": 0.0002,
"loss": 0.545783519744873,
"mean_token_accuracy": 0.7786774635314941,
"num_tokens": 28511395.0,
"step": 381
},
{
"entropy": 0.5370142608880997,
"epoch": 9.55,
"grad_norm": 0.052466463297605515,
"learning_rate": 0.0002,
"loss": 0.5396707057952881,
"mean_token_accuracy": 0.7794803082942963,
"num_tokens": 28585898.0,
"step": 382
},
{
"entropy": 0.5520068109035492,
"epoch": 9.575,
"grad_norm": 0.06656571477651596,
"learning_rate": 0.0002,
"loss": 0.555323600769043,
"mean_token_accuracy": 0.7742787003517151,
"num_tokens": 28660846.0,
"step": 383
},
{
"entropy": 0.5486234575510025,
"epoch": 9.6,
"grad_norm": 0.0534614734351635,
"learning_rate": 0.0002,
"loss": 0.5439633131027222,
"mean_token_accuracy": 0.7797495126724243,
"num_tokens": 28735594.0,
"step": 384
},
{
"entropy": 0.5434623211622238,
"epoch": 9.625,
"grad_norm": 0.045583903789520264,
"learning_rate": 0.0002,
"loss": 0.5419756174087524,
"mean_token_accuracy": 0.7793311029672623,
"num_tokens": 28809189.0,
"step": 385
},
{
"entropy": 0.5472258776426315,
"epoch": 9.65,
"grad_norm": 0.05377979576587677,
"learning_rate": 0.0002,
"loss": 0.5480138063430786,
"mean_token_accuracy": 0.7763115465641022,
"num_tokens": 28883637.0,
"step": 386
},
{
"entropy": 0.544014573097229,
"epoch": 9.675,
"grad_norm": 0.04192574322223663,
"learning_rate": 0.0002,
"loss": 0.544299304485321,
"mean_token_accuracy": 0.7778673022985458,
"num_tokens": 28957576.0,
"step": 387
},
{
"entropy": 0.5428405702114105,
"epoch": 9.7,
"grad_norm": 0.06100517511367798,
"learning_rate": 0.0002,
"loss": 0.5435603857040405,
"mean_token_accuracy": 0.7799983322620392,
"num_tokens": 29032062.0,
"step": 388
},
{
"entropy": 0.5476491451263428,
"epoch": 9.725,
"grad_norm": 0.048970699310302734,
"learning_rate": 0.0002,
"loss": 0.5444561839103699,
"mean_token_accuracy": 0.7781545221805573,
"num_tokens": 29106620.0,
"step": 389
},
{
"entropy": 0.5525311529636383,
"epoch": 9.75,
"grad_norm": 0.04579257220029831,
"learning_rate": 0.0002,
"loss": 0.5549685955047607,
"mean_token_accuracy": 0.7740297764539719,
"num_tokens": 29181675.0,
"step": 390
},
{
"entropy": 0.5356399416923523,
"epoch": 9.775,
"grad_norm": 0.05338006839156151,
"learning_rate": 0.0002,
"loss": 0.5426127314567566,
"mean_token_accuracy": 0.7810541093349457,
"num_tokens": 29256940.0,
"step": 391
},
{
"entropy": 0.5530804395675659,
"epoch": 9.8,
"grad_norm": 0.04246848449110985,
"learning_rate": 0.0002,
"loss": 0.5501898527145386,
"mean_token_accuracy": 0.7768332362174988,
"num_tokens": 29331834.0,
"step": 392
},
{
"entropy": 0.5455987602472305,
"epoch": 9.825,
"grad_norm": 0.05141966789960861,
"learning_rate": 0.0002,
"loss": 0.5416363477706909,
"mean_token_accuracy": 0.779405802488327,
"num_tokens": 29406597.0,
"step": 393
},
{
"entropy": 0.5400111973285675,
"epoch": 9.85,
"grad_norm": 0.04637204110622406,
"learning_rate": 0.0002,
"loss": 0.5422732830047607,
"mean_token_accuracy": 0.7795074433088303,
"num_tokens": 29481010.0,
"step": 394
},
{
"entropy": 0.5512913167476654,
"epoch": 9.875,
"grad_norm": 0.047507502138614655,
"learning_rate": 0.0002,
"loss": 0.5522125363349915,
"mean_token_accuracy": 0.7759748101234436,
"num_tokens": 29557413.0,
"step": 395
},
{
"entropy": 0.5526851117610931,
"epoch": 9.9,
"grad_norm": 0.05560845509171486,
"learning_rate": 0.0002,
"loss": 0.5494586229324341,
"mean_token_accuracy": 0.7773067951202393,
"num_tokens": 29632250.0,
"step": 396
},
{
"entropy": 0.5464700162410736,
"epoch": 9.925,
"grad_norm": 0.03767940029501915,
"learning_rate": 0.0002,
"loss": 0.5446071624755859,
"mean_token_accuracy": 0.779336228966713,
"num_tokens": 29707194.0,
"step": 397
},
{
"entropy": 0.5447115898132324,
"epoch": 9.95,
"grad_norm": 0.06558585911989212,
"learning_rate": 0.0002,
"loss": 0.5500915050506592,
"mean_token_accuracy": 0.7773743122816086,
"num_tokens": 29782815.0,
"step": 398
},
{
"entropy": 0.5433839708566666,
"epoch": 9.975,
"grad_norm": 0.04332485795021057,
"learning_rate": 0.0002,
"loss": 0.5444520711898804,
"mean_token_accuracy": 0.7788331806659698,
"num_tokens": 29857625.0,
"step": 399
},
{
"entropy": 0.5447465628385544,
"epoch": 10.0,
"grad_norm": 0.049522414803504944,
"learning_rate": 0.0002,
"loss": 0.5472792387008667,
"mean_token_accuracy": 0.7776744365692139,
"num_tokens": 29931942.0,
"step": 400
}
],
"logging_steps": 1,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6443458485716255e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}