Muqeeth's picture
Training in progress, step 52, checkpoint
d67980b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"episode": 26624,
"epoch": 0.22810143934201507,
"eval_steps": 500,
"global_step": 52,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 512,
"epoch": 0.004386566141192597,
"eps": 5,
"loss/policy_avg": -0.0010775011032819748,
"loss/value_avg": 2.152796983718872,
"lr": 3e-06,
"objective/entropy": -46.88261795043945,
"objective/kl": -2.2351741790771484e-07,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 4.597599983215332,
"objective/scores": 4.597599983215332,
"policy/approxkl_avg": 0.0034045230131596327,
"policy/clipfrac_avg": 0.05483449250459671,
"policy/entropy_avg": 0.840014636516571,
"step": 1,
"val/clipfrac_avg": 0.0006964995991438627,
"val/num_eos_tokens": 511,
"val/ratio": 0.9999260902404785,
"val/ratio_var": 6.746347025909927e-06
},
{
"episode": 1024,
"epoch": 0.008773132282385195,
"eps": 6,
"loss/policy_avg": 0.002331731840968132,
"loss/value_avg": 1.406424641609192,
"lr": 2.9882812500000002e-06,
"objective/entropy": 5.527715682983398,
"objective/kl": 0.9096249341964722,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 5.0396623611450195,
"objective/scores": 5.0396623611450195,
"policy/approxkl_avg": 0.0057543180882930756,
"policy/clipfrac_avg": 0.04763935133814812,
"policy/entropy_avg": 0.7740105390548706,
"step": 2,
"val/clipfrac_avg": 0.0013810684904456139,
"val/num_eos_tokens": 512,
"val/ratio": 1.0008430480957031,
"val/ratio_var": 2.5710842237458564e-05
},
{
"episode": 1536,
"epoch": 0.013159698423577794,
"eps": 6,
"loss/policy_avg": 0.004994675051420927,
"loss/value_avg": 1.4159741401672363,
"lr": 2.9765625e-06,
"objective/entropy": -27.37961196899414,
"objective/kl": 1.9729511737823486,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 4.9908952713012695,
"objective/scores": 4.9908952713012695,
"policy/approxkl_avg": 0.0038739778101444244,
"policy/clipfrac_avg": 0.0459367111325264,
"policy/entropy_avg": 0.6048773527145386,
"step": 3,
"val/clipfrac_avg": 0.0031044897623360157,
"val/num_eos_tokens": 512,
"val/ratio": 0.9995440244674683,
"val/ratio_var": 1.1859823644044809e-05
},
{
"episode": 2048,
"epoch": 0.01754626456477039,
"eps": 6,
"loss/policy_avg": 0.008640453219413757,
"loss/value_avg": 1.9971219301223755,
"lr": 2.96484375e-06,
"objective/entropy": -14.0813627243042,
"objective/kl": 2.724120616912842,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 5.457592487335205,
"objective/scores": 5.457592487335205,
"policy/approxkl_avg": 0.00431688129901886,
"policy/clipfrac_avg": 0.039867501705884933,
"policy/entropy_avg": 0.5995867252349854,
"step": 4,
"val/clipfrac_avg": 0.003971057012677193,
"val/num_eos_tokens": 512,
"val/ratio": 1.00046968460083,
"val/ratio_var": 1.5058150893310085e-05
},
{
"episode": 2560,
"epoch": 0.02193283070596299,
"eps": 6,
"loss/policy_avg": 0.008194871246814728,
"loss/value_avg": 2.207172393798828,
"lr": 2.953125e-06,
"objective/entropy": 0.22097766399383545,
"objective/kl": 3.6732287406921387,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 5.538914680480957,
"objective/scores": 5.538914680480957,
"policy/approxkl_avg": 0.005579844582825899,
"policy/clipfrac_avg": 0.04047102481126785,
"policy/entropy_avg": 0.554892361164093,
"step": 5,
"val/clipfrac_avg": 0.003971456084400415,
"val/num_eos_tokens": 512,
"val/ratio": 0.9994141459465027,
"val/ratio_var": 2.683511411305517e-05
},
{
"episode": 3072,
"epoch": 0.026319396847155587,
"eps": 6,
"loss/policy_avg": 0.008127043955028057,
"loss/value_avg": 1.9443745613098145,
"lr": 2.94140625e-06,
"objective/entropy": -4.540916442871094,
"objective/kl": 4.489357948303223,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 5.669499397277832,
"objective/scores": 5.669499397277832,
"policy/approxkl_avg": 0.00525766983628273,
"policy/clipfrac_avg": 0.040956489741802216,
"policy/entropy_avg": 0.5248321890830994,
"step": 6,
"val/clipfrac_avg": 0.0037213233299553394,
"val/num_eos_tokens": 512,
"val/ratio": 0.9998031854629517,
"val/ratio_var": 1.9533537852112204e-05
},
{
"episode": 3584,
"epoch": 0.030705962988348184,
"eps": 6,
"loss/policy_avg": 0.0053918734192848206,
"loss/value_avg": 1.4462230205535889,
"lr": 2.9296875e-06,
"objective/entropy": -18.436466217041016,
"objective/kl": 6.404374122619629,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.3423004150390625,
"objective/scores": 6.3423004150390625,
"policy/approxkl_avg": 0.004276195541024208,
"policy/clipfrac_avg": 0.041515737771987915,
"policy/entropy_avg": 0.5162253379821777,
"step": 7,
"val/clipfrac_avg": 0.0022959401831030846,
"val/num_eos_tokens": 512,
"val/ratio": 0.9992671608924866,
"val/ratio_var": 1.4565547644451726e-05
},
{
"episode": 4096,
"epoch": 0.03509252912954078,
"eps": 6,
"loss/policy_avg": 0.009467152878642082,
"loss/value_avg": 1.0647578239440918,
"lr": 2.91796875e-06,
"objective/entropy": -52.239105224609375,
"objective/kl": 9.6763334274292,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.394425392150879,
"objective/scores": 6.394425392150879,
"policy/approxkl_avg": 0.003496184479445219,
"policy/clipfrac_avg": 0.042081158608198166,
"policy/entropy_avg": 0.6195869445800781,
"step": 8,
"val/clipfrac_avg": 0.0018653525039553642,
"val/num_eos_tokens": 506,
"val/ratio": 0.9998865127563477,
"val/ratio_var": 9.700558621261735e-06
},
{
"episode": 4608,
"epoch": 0.03947909527073338,
"eps": 6,
"loss/policy_avg": -0.003379741683602333,
"loss/value_avg": 0.8534075617790222,
"lr": 2.90625e-06,
"objective/entropy": -37.56623077392578,
"objective/kl": 17.91680145263672,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.905078887939453,
"objective/scores": 6.905078887939453,
"policy/approxkl_avg": 0.00449257530272007,
"policy/clipfrac_avg": 0.04568759351968765,
"policy/entropy_avg": 0.8169364929199219,
"step": 9,
"val/clipfrac_avg": 0.0014340076595544815,
"val/num_eos_tokens": 499,
"val/ratio": 0.9995401501655579,
"val/ratio_var": 5.685657924914267e-06
},
{
"episode": 5120,
"epoch": 0.04386566141192598,
"eps": 5,
"loss/policy_avg": -0.005874279886484146,
"loss/value_avg": 0.8848444819450378,
"lr": 2.89453125e-06,
"objective/entropy": -13.25861930847168,
"objective/kl": 33.02040100097656,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.507898807525635,
"objective/scores": 6.507898807525635,
"policy/approxkl_avg": 0.005827038548886776,
"policy/clipfrac_avg": 0.04495222494006157,
"policy/entropy_avg": 0.957590639591217,
"step": 10,
"val/clipfrac_avg": 0.0012696427293121815,
"val/num_eos_tokens": 450,
"val/ratio": 0.9994459748268127,
"val/ratio_var": 1.387734209856717e-05
},
{
"episode": 5632,
"epoch": 0.048252227553118573,
"eps": 5,
"loss/policy_avg": -0.014378623105585575,
"loss/value_avg": 0.9888654947280884,
"lr": 2.8828125e-06,
"objective/entropy": 1.6588702201843262,
"objective/kl": 43.53641891479492,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.313203811645508,
"objective/scores": 6.313203811645508,
"policy/approxkl_avg": 0.007957661524415016,
"policy/clipfrac_avg": 0.04707237333059311,
"policy/entropy_avg": 1.0334728956222534,
"step": 11,
"val/clipfrac_avg": 0.0014614008832722902,
"val/num_eos_tokens": 388,
"val/ratio": 1.0005269050598145,
"val/ratio_var": 1.659244480833877e-05
},
{
"episode": 6144,
"epoch": 0.052638793694311174,
"eps": 5,
"loss/policy_avg": -0.022208962589502335,
"loss/value_avg": 0.9836709499359131,
"lr": 2.87109375e-06,
"objective/entropy": -5.948256492614746,
"objective/kl": 43.305381774902344,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.90786600112915,
"objective/scores": 6.90786600112915,
"policy/approxkl_avg": 0.006931103765964508,
"policy/clipfrac_avg": 0.04469376057386398,
"policy/entropy_avg": 1.1865200996398926,
"step": 12,
"val/clipfrac_avg": 0.000864459783770144,
"val/num_eos_tokens": 400,
"val/ratio": 0.9998042583465576,
"val/ratio_var": 1.6377791325794533e-05
},
{
"episode": 6656,
"epoch": 0.05702535983550377,
"eps": 5,
"loss/policy_avg": -0.02128465473651886,
"loss/value_avg": 0.9731428623199463,
"lr": 2.859375e-06,
"objective/entropy": -8.351584434509277,
"objective/kl": 41.11138916015625,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.677196502685547,
"objective/scores": 6.677196502685547,
"policy/approxkl_avg": 0.006365190725773573,
"policy/clipfrac_avg": 0.04388276860117912,
"policy/entropy_avg": 1.2417210340499878,
"step": 13,
"val/clipfrac_avg": 0.0005712928250432014,
"val/num_eos_tokens": 383,
"val/ratio": 1.0003130435943604,
"val/ratio_var": 1.1928386811632663e-05
},
{
"episode": 7168,
"epoch": 0.06141192597669637,
"eps": 5,
"loss/policy_avg": -0.02828969433903694,
"loss/value_avg": 0.9882746934890747,
"lr": 2.84765625e-06,
"objective/entropy": -17.76400375366211,
"objective/kl": 37.17643356323242,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 6.9648332595825195,
"objective/scores": 6.9648332595825195,
"policy/approxkl_avg": 0.005724512506276369,
"policy/clipfrac_avg": 0.04208240285515785,
"policy/entropy_avg": 1.2095826864242554,
"step": 14,
"val/clipfrac_avg": 0.0012288970174267888,
"val/num_eos_tokens": 400,
"val/ratio": 0.9997619390487671,
"val/ratio_var": 1.3792546269542072e-05
},
{
"episode": 7680,
"epoch": 0.06579849211788896,
"eps": 5,
"loss/policy_avg": -0.038550965487957,
"loss/value_avg": 0.9785877466201782,
"lr": 2.8359375e-06,
"objective/entropy": -28.61048126220703,
"objective/kl": 32.39018630981445,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 7.317845344543457,
"objective/scores": 7.317845344543457,
"policy/approxkl_avg": 0.005465254653245211,
"policy/clipfrac_avg": 0.03845732659101486,
"policy/entropy_avg": 1.1608960628509521,
"step": 15,
"val/clipfrac_avg": 0.00025615841150283813,
"val/num_eos_tokens": 419,
"val/ratio": 1.0000056028366089,
"val/ratio_var": 1.2135635188315064e-05
},
{
"episode": 8192,
"epoch": 0.07018505825908156,
"eps": 5,
"loss/policy_avg": -0.042696814984083176,
"loss/value_avg": 0.8722034692764282,
"lr": 2.82421875e-06,
"objective/entropy": -39.38507843017578,
"objective/kl": 29.655807495117188,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 7.833863735198975,
"objective/scores": 7.833863735198975,
"policy/approxkl_avg": 0.004913633689284325,
"policy/clipfrac_avg": 0.04184533655643463,
"policy/entropy_avg": 1.0446637868881226,
"step": 16,
"val/clipfrac_avg": 0.00022731353237759322,
"val/num_eos_tokens": 456,
"val/ratio": 0.9998592138290405,
"val/ratio_var": 8.872881153365597e-06
},
{
"episode": 8704,
"epoch": 0.07457162440027416,
"eps": 5,
"loss/policy_avg": -0.030250171199440956,
"loss/value_avg": 0.73753422498703,
"lr": 2.8125e-06,
"objective/entropy": -48.727821350097656,
"objective/kl": 25.98404312133789,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 7.985202312469482,
"objective/scores": 7.985202312469482,
"policy/approxkl_avg": 0.004400444217026234,
"policy/clipfrac_avg": 0.04398694634437561,
"policy/entropy_avg": 1.066787600517273,
"step": 17,
"val/clipfrac_avg": 0.000578696490265429,
"val/num_eos_tokens": 479,
"val/ratio": 0.9993942975997925,
"val/ratio_var": 9.115313332586084e-06
},
{
"episode": 9216,
"epoch": 0.07895819054146676,
"eps": 5,
"loss/policy_avg": -0.013007078319787979,
"loss/value_avg": 0.6708067059516907,
"lr": 2.80078125e-06,
"objective/entropy": -53.65514373779297,
"objective/kl": 26.27431869506836,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.508003234863281,
"objective/scores": 8.508003234863281,
"policy/approxkl_avg": 0.004381729289889336,
"policy/clipfrac_avg": 0.042573459446430206,
"policy/entropy_avg": 0.9618682265281677,
"step": 18,
"val/clipfrac_avg": 0.0002770860446617007,
"val/num_eos_tokens": 496,
"val/ratio": 0.9999381899833679,
"val/ratio_var": 5.949283149675466e-06
},
{
"episode": 9728,
"epoch": 0.08334475668265935,
"eps": 5,
"loss/policy_avg": -0.0033736806362867355,
"loss/value_avg": 0.536386251449585,
"lr": 2.7890625e-06,
"objective/entropy": -56.456016540527344,
"objective/kl": 25.067127227783203,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.319454193115234,
"objective/scores": 8.319454193115234,
"policy/approxkl_avg": 0.004075043834745884,
"policy/clipfrac_avg": 0.04519989341497421,
"policy/entropy_avg": 0.9269654154777527,
"step": 19,
"val/clipfrac_avg": 0.0003078244626522064,
"val/num_eos_tokens": 501,
"val/ratio": 0.9994814395904541,
"val/ratio_var": 9.405189302924555e-06
},
{
"episode": 10240,
"epoch": 0.08773132282385196,
"eps": 5,
"loss/policy_avg": 0.004913418088108301,
"loss/value_avg": 0.5358260869979858,
"lr": 2.77734375e-06,
"objective/entropy": -59.48634338378906,
"objective/kl": 24.29659652709961,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.499679565429688,
"objective/scores": 8.499679565429688,
"policy/approxkl_avg": 0.0036819700617343187,
"policy/clipfrac_avg": 0.043816424906253815,
"policy/entropy_avg": 0.9329521059989929,
"step": 20,
"val/clipfrac_avg": 0.00022827752400189638,
"val/num_eos_tokens": 508,
"val/ratio": 1.0002903938293457,
"val/ratio_var": 7.933602319099009e-06
},
{
"episode": 10752,
"epoch": 0.09211788896504455,
"eps": 5,
"loss/policy_avg": 0.0012558624148368835,
"loss/value_avg": 0.5767730474472046,
"lr": 2.765625e-06,
"objective/entropy": -61.19065856933594,
"objective/kl": 25.785268783569336,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.370229721069336,
"objective/scores": 8.370229721069336,
"policy/approxkl_avg": 0.0040285466238856316,
"policy/clipfrac_avg": 0.04525526985526085,
"policy/entropy_avg": 0.949712336063385,
"step": 21,
"val/clipfrac_avg": 0.00048373237950727344,
"val/num_eos_tokens": 507,
"val/ratio": 0.9995108246803284,
"val/ratio_var": 7.613601610501064e-06
},
{
"episode": 11264,
"epoch": 0.09650445510623715,
"eps": 5,
"loss/policy_avg": 0.007160295266658068,
"loss/value_avg": 0.6456326842308044,
"lr": 2.75390625e-06,
"objective/entropy": -61.760406494140625,
"objective/kl": 25.645444869995117,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.45798110961914,
"objective/scores": 8.45798110961914,
"policy/approxkl_avg": 0.003935192245990038,
"policy/clipfrac_avg": 0.04455619305372238,
"policy/entropy_avg": 0.9833183884620667,
"step": 22,
"val/clipfrac_avg": 0.0008266369113698602,
"val/num_eos_tokens": 508,
"val/ratio": 0.9999653697013855,
"val/ratio_var": 6.215088887984166e-06
},
{
"episode": 11776,
"epoch": 0.10089102124742974,
"eps": 5,
"loss/policy_avg": 0.014776970259845257,
"loss/value_avg": 0.7030456066131592,
"lr": 2.7421875e-06,
"objective/entropy": -60.44242858886719,
"objective/kl": 26.446081161499023,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.580012321472168,
"objective/scores": 8.580012321472168,
"policy/approxkl_avg": 0.003829989116638899,
"policy/clipfrac_avg": 0.04393254220485687,
"policy/entropy_avg": 1.067291259765625,
"step": 23,
"val/clipfrac_avg": 0.0012957865837961435,
"val/num_eos_tokens": 510,
"val/ratio": 1.0001041889190674,
"val/ratio_var": 7.751174962322693e-06
},
{
"episode": 12288,
"epoch": 0.10527758738862235,
"eps": 5,
"loss/policy_avg": 0.006113366223871708,
"loss/value_avg": 0.6351609230041504,
"lr": 2.73046875e-06,
"objective/entropy": -56.61647415161133,
"objective/kl": 30.12872886657715,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.864429473876953,
"objective/scores": 8.864429473876953,
"policy/approxkl_avg": 0.004433917347341776,
"policy/clipfrac_avg": 0.04813031852245331,
"policy/entropy_avg": 1.052201747894287,
"step": 24,
"val/clipfrac_avg": 0.0007775876438245177,
"val/num_eos_tokens": 510,
"val/ratio": 0.9999730587005615,
"val/ratio_var": 2.6995268854079768e-05
},
{
"episode": 12800,
"epoch": 0.10966415352981494,
"eps": 5,
"loss/policy_avg": 0.010391147807240486,
"loss/value_avg": 0.6229045391082764,
"lr": 2.71875e-06,
"objective/entropy": -55.23644256591797,
"objective/kl": 32.96910095214844,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.8991117477417,
"objective/scores": 8.8991117477417,
"policy/approxkl_avg": 0.004299290012568235,
"policy/clipfrac_avg": 0.04626629129052162,
"policy/entropy_avg": 1.0567516088485718,
"step": 25,
"val/clipfrac_avg": 0.0009714451734907925,
"val/num_eos_tokens": 510,
"val/ratio": 1.0003576278686523,
"val/ratio_var": 2.934046096925158e-05
},
{
"episode": 13312,
"epoch": 0.11405071967100754,
"eps": 5,
"loss/policy_avg": 0.005770000629127026,
"loss/value_avg": 0.6085966229438782,
"lr": 2.70703125e-06,
"objective/entropy": -47.69157409667969,
"objective/kl": 39.49090576171875,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.737468719482422,
"objective/scores": 8.737468719482422,
"policy/approxkl_avg": 0.0053903451189398766,
"policy/clipfrac_avg": 0.05019207298755646,
"policy/entropy_avg": 1.0364032983779907,
"step": 26,
"val/clipfrac_avg": 0.001478280988521874,
"val/num_eos_tokens": 504,
"val/ratio": 0.9998453259468079,
"val/ratio_var": 9.86140457825968e-06
},
{
"episode": 13824,
"epoch": 0.11843728581220014,
"eps": 5,
"loss/policy_avg": -0.007628859020769596,
"loss/value_avg": 0.6527573466300964,
"lr": 2.6953125e-06,
"objective/entropy": -39.07084655761719,
"objective/kl": 47.1235237121582,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.737848281860352,
"objective/scores": 8.737848281860352,
"policy/approxkl_avg": 0.005542443133890629,
"policy/clipfrac_avg": 0.046551190316677094,
"policy/entropy_avg": 0.9338756203651428,
"step": 27,
"val/clipfrac_avg": 0.0006895526312291622,
"val/num_eos_tokens": 481,
"val/ratio": 0.9999536275863647,
"val/ratio_var": 1.1857218851218931e-05
},
{
"episode": 14336,
"epoch": 0.12282385195339274,
"eps": 5,
"loss/policy_avg": -0.013978306204080582,
"loss/value_avg": 0.7715175151824951,
"lr": 2.68359375e-06,
"objective/entropy": -29.249040603637695,
"objective/kl": 58.577266693115234,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.21330738067627,
"objective/scores": 8.21330738067627,
"policy/approxkl_avg": 0.006117562763392925,
"policy/clipfrac_avg": 0.0474059171974659,
"policy/entropy_avg": 0.8380322456359863,
"step": 28,
"val/clipfrac_avg": 0.0009613880538381636,
"val/num_eos_tokens": 456,
"val/ratio": 1.0004549026489258,
"val/ratio_var": 1.6783484170446172e-05
},
{
"episode": 14848,
"epoch": 0.12721041809458533,
"eps": 5,
"loss/policy_avg": -0.00955754891037941,
"loss/value_avg": 0.7996537089347839,
"lr": 2.671875e-06,
"objective/entropy": -16.704195022583008,
"objective/kl": 67.1770248413086,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 7.992954254150391,
"objective/scores": 7.992954254150391,
"policy/approxkl_avg": 0.006946917623281479,
"policy/clipfrac_avg": 0.04584059491753578,
"policy/entropy_avg": 0.7783851623535156,
"step": 29,
"val/clipfrac_avg": 0.0007248380570672452,
"val/num_eos_tokens": 428,
"val/ratio": 1.0003103017807007,
"val/ratio_var": 1.1794147212640382e-05
},
{
"episode": 15360,
"epoch": 0.13159698423577793,
"eps": 5,
"loss/policy_avg": -0.01961817592382431,
"loss/value_avg": 0.7010769844055176,
"lr": 2.66015625e-06,
"objective/entropy": -17.91378402709961,
"objective/kl": 65.80282592773438,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.037736892700195,
"objective/scores": 8.037736892700195,
"policy/approxkl_avg": 0.006708525121212006,
"policy/clipfrac_avg": 0.04874643683433533,
"policy/entropy_avg": 0.7650750279426575,
"step": 30,
"val/clipfrac_avg": 0.0005629804218187928,
"val/num_eos_tokens": 437,
"val/ratio": 0.9999438524246216,
"val/ratio_var": 1.5312107279896736e-05
},
{
"episode": 15872,
"epoch": 0.13598355037697052,
"eps": 5,
"loss/policy_avg": -0.008987879380583763,
"loss/value_avg": 0.5842263698577881,
"lr": 2.6484375e-06,
"objective/entropy": -24.265296936035156,
"objective/kl": 60.957061767578125,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.903877258300781,
"objective/scores": 8.903877258300781,
"policy/approxkl_avg": 0.00701293908059597,
"policy/clipfrac_avg": 0.05128619819879532,
"policy/entropy_avg": 0.8259672522544861,
"step": 31,
"val/clipfrac_avg": 0.0004929137649014592,
"val/num_eos_tokens": 477,
"val/ratio": 0.9996868371963501,
"val/ratio_var": 1.6662374036968686e-05
},
{
"episode": 16384,
"epoch": 0.1403701165181631,
"eps": 5,
"loss/policy_avg": -0.004771184176206589,
"loss/value_avg": 0.558784008026123,
"lr": 2.63671875e-06,
"objective/entropy": -26.573974609375,
"objective/kl": 58.13153839111328,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 8.980989456176758,
"objective/scores": 8.980989456176758,
"policy/approxkl_avg": 0.006675435695797205,
"policy/clipfrac_avg": 0.05361879989504814,
"policy/entropy_avg": 0.8607890009880066,
"step": 32,
"val/clipfrac_avg": 0.00043037798604927957,
"val/num_eos_tokens": 491,
"val/ratio": 0.9996814727783203,
"val/ratio_var": 1.2863994015788194e-05
},
{
"episode": 16896,
"epoch": 0.14475668265935573,
"eps": 5,
"loss/policy_avg": 0.000527138588950038,
"loss/value_avg": 0.5865890979766846,
"lr": 2.6250000000000003e-06,
"objective/entropy": -30.55011749267578,
"objective/kl": 54.622886657714844,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.211483001708984,
"objective/scores": 9.211483001708984,
"policy/approxkl_avg": 0.006334663834422827,
"policy/clipfrac_avg": 0.054518453776836395,
"policy/entropy_avg": 0.9017385244369507,
"step": 33,
"val/clipfrac_avg": 0.0004077432386111468,
"val/num_eos_tokens": 504,
"val/ratio": 1.000108003616333,
"val/ratio_var": 1.4822944649495184e-05
},
{
"episode": 17408,
"epoch": 0.14914324880054833,
"eps": 5,
"loss/policy_avg": -0.0033687106333673,
"loss/value_avg": 0.6050545573234558,
"lr": 2.61328125e-06,
"objective/entropy": -33.91044616699219,
"objective/kl": 52.96797180175781,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.475627899169922,
"objective/scores": 9.475627899169922,
"policy/approxkl_avg": 0.00646405853331089,
"policy/clipfrac_avg": 0.05471920967102051,
"policy/entropy_avg": 0.9273003339767456,
"step": 34,
"val/clipfrac_avg": 0.0001560871460242197,
"val/num_eos_tokens": 504,
"val/ratio": 1.000005841255188,
"val/ratio_var": 1.5097434697963763e-05
},
{
"episode": 17920,
"epoch": 0.15352981494174092,
"eps": 5,
"loss/policy_avg": 0.006220666225999594,
"loss/value_avg": 0.5275569558143616,
"lr": 2.6015625e-06,
"objective/entropy": -36.52558898925781,
"objective/kl": 49.886573791503906,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.341217041015625,
"objective/scores": 9.341217041015625,
"policy/approxkl_avg": 0.0063529182225465775,
"policy/clipfrac_avg": 0.0546303354203701,
"policy/entropy_avg": 0.977063775062561,
"step": 35,
"val/clipfrac_avg": 0.00036523916060104966,
"val/num_eos_tokens": 511,
"val/ratio": 1.000399112701416,
"val/ratio_var": 1.2374664038361516e-05
},
{
"episode": 18432,
"epoch": 0.15791638108293352,
"eps": 5,
"loss/policy_avg": 0.006454125978052616,
"loss/value_avg": 0.500439465045929,
"lr": 2.5898437500000003e-06,
"objective/entropy": -40.24166488647461,
"objective/kl": 49.15092468261719,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.569242477416992,
"objective/scores": 9.569242477416992,
"policy/approxkl_avg": 0.005947995465248823,
"policy/clipfrac_avg": 0.05309908464550972,
"policy/entropy_avg": 1.0032403469085693,
"step": 36,
"val/clipfrac_avg": 0.00021327620197553188,
"val/num_eos_tokens": 509,
"val/ratio": 1.0006883144378662,
"val/ratio_var": 1.43506422318751e-05
},
{
"episode": 18944,
"epoch": 0.1623029472241261,
"eps": 5,
"loss/policy_avg": 0.007721163332462311,
"loss/value_avg": 0.4169340431690216,
"lr": 2.578125e-06,
"objective/entropy": -43.73438262939453,
"objective/kl": 47.38279724121094,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.466113090515137,
"objective/scores": 9.466113090515137,
"policy/approxkl_avg": 0.0057074325159192085,
"policy/clipfrac_avg": 0.05311349779367447,
"policy/entropy_avg": 1.0183720588684082,
"step": 37,
"val/clipfrac_avg": 0.0005003074184060097,
"val/num_eos_tokens": 510,
"val/ratio": 0.9993772506713867,
"val/ratio_var": 1.3632562513521407e-05
},
{
"episode": 19456,
"epoch": 0.1666895133653187,
"eps": 5,
"loss/policy_avg": 0.005469637922942638,
"loss/value_avg": 0.35471123456954956,
"lr": 2.56640625e-06,
"objective/entropy": -46.80841827392578,
"objective/kl": 47.71710205078125,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.39622688293457,
"objective/scores": 9.39622688293457,
"policy/approxkl_avg": 0.005291135516017675,
"policy/clipfrac_avg": 0.053695324808359146,
"policy/entropy_avg": 1.0006449222564697,
"step": 38,
"val/clipfrac_avg": 0.00037036644062027335,
"val/num_eos_tokens": 510,
"val/ratio": 1.0000135898590088,
"val/ratio_var": 9.616092029318679e-06
},
{
"episode": 19968,
"epoch": 0.1710760795065113,
"eps": 5,
"loss/policy_avg": 0.007845397107303143,
"loss/value_avg": 0.3138006925582886,
"lr": 2.5546875000000003e-06,
"objective/entropy": -49.33262634277344,
"objective/kl": 45.213348388671875,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.463075637817383,
"objective/scores": 9.463075637817383,
"policy/approxkl_avg": 0.0048331525176763535,
"policy/clipfrac_avg": 0.05174440145492554,
"policy/entropy_avg": 0.9827993512153625,
"step": 39,
"val/clipfrac_avg": 0.0006140347104519606,
"val/num_eos_tokens": 511,
"val/ratio": 0.9993776082992554,
"val/ratio_var": 1.478405738453148e-05
},
{
"episode": 20480,
"epoch": 0.17546264564770392,
"eps": 5,
"loss/policy_avg": 0.009613022208213806,
"loss/value_avg": 0.2865651249885559,
"lr": 2.54296875e-06,
"objective/entropy": -21.233320236206055,
"objective/kl": 45.59048843383789,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.34876537322998,
"objective/scores": 9.34876537322998,
"policy/approxkl_avg": 0.00592905143275857,
"policy/clipfrac_avg": 0.048290204256772995,
"policy/entropy_avg": 0.8851035833358765,
"step": 40,
"val/clipfrac_avg": 0.0003385603195056319,
"val/num_eos_tokens": 512,
"val/ratio": 1.0003713369369507,
"val/ratio_var": 2.0009263607789762e-05
},
{
"episode": 20992,
"epoch": 0.1798492117888965,
"eps": 5,
"loss/policy_avg": 0.013228874653577805,
"loss/value_avg": 0.3579314947128296,
"lr": 2.53125e-06,
"objective/entropy": -12.196764945983887,
"objective/kl": 43.65990447998047,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.419660568237305,
"objective/scores": 9.419660568237305,
"policy/approxkl_avg": 0.006829770281910896,
"policy/clipfrac_avg": 0.049680888652801514,
"policy/entropy_avg": 0.8308127522468567,
"step": 41,
"val/clipfrac_avg": 0.00030408138991333544,
"val/num_eos_tokens": 512,
"val/ratio": 0.9990015029907227,
"val/ratio_var": 1.62386004376458e-05
},
{
"episode": 21504,
"epoch": 0.1842357779300891,
"eps": 5,
"loss/policy_avg": 0.011590891517698765,
"loss/value_avg": 0.41505149006843567,
"lr": 2.5195312500000003e-06,
"objective/entropy": -50.10555648803711,
"objective/kl": 43.93341064453125,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.235706329345703,
"objective/scores": 9.235706329345703,
"policy/approxkl_avg": 0.005015837028622627,
"policy/clipfrac_avg": 0.05047174170613289,
"policy/entropy_avg": 0.9143767356872559,
"step": 42,
"val/clipfrac_avg": 0.0005529467016458511,
"val/num_eos_tokens": 510,
"val/ratio": 1.0000828504562378,
"val/ratio_var": 1.0489389751455747e-05
},
{
"episode": 22016,
"epoch": 0.1886223440712817,
"eps": 5,
"loss/policy_avg": 0.012541696429252625,
"loss/value_avg": 0.42272669076919556,
"lr": 2.5078125e-06,
"objective/entropy": -28.646146774291992,
"objective/kl": 41.60773468017578,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.567056655883789,
"objective/scores": 9.567056655883789,
"policy/approxkl_avg": 0.005777922924607992,
"policy/clipfrac_avg": 0.04737301170825958,
"policy/entropy_avg": 0.8459863662719727,
"step": 43,
"val/clipfrac_avg": 0.0008770625572651625,
"val/num_eos_tokens": 512,
"val/ratio": 1.0003621578216553,
"val/ratio_var": 1.7744689102983102e-05
},
{
"episode": 22528,
"epoch": 0.1930089102124743,
"eps": 5,
"loss/policy_avg": 0.010437501594424248,
"loss/value_avg": 0.44466620683670044,
"lr": 2.49609375e-06,
"objective/entropy": -47.16653823852539,
"objective/kl": 43.96792984008789,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.598703384399414,
"objective/scores": 9.598703384399414,
"policy/approxkl_avg": 0.004931645467877388,
"policy/clipfrac_avg": 0.05061531811952591,
"policy/entropy_avg": 0.8815011978149414,
"step": 44,
"val/clipfrac_avg": 0.0006886773044243455,
"val/num_eos_tokens": 510,
"val/ratio": 1.0000885725021362,
"val/ratio_var": 1.353884090349311e-05
},
{
"episode": 23040,
"epoch": 0.1973954763536669,
"eps": 5,
"loss/policy_avg": 0.008954339660704136,
"loss/value_avg": 0.3882572650909424,
"lr": 2.4843750000000002e-06,
"objective/entropy": -45.80089569091797,
"objective/kl": 43.658973693847656,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.753758430480957,
"objective/scores": 9.753758430480957,
"policy/approxkl_avg": 0.0052681779488921165,
"policy/clipfrac_avg": 0.04858952760696411,
"policy/entropy_avg": 0.8804545998573303,
"step": 45,
"val/clipfrac_avg": 0.00039701920468360186,
"val/num_eos_tokens": 510,
"val/ratio": 1.0002995729446411,
"val/ratio_var": 1.4173986528476235e-05
},
{
"episode": 23552,
"epoch": 0.20178204249485948,
"eps": 5,
"loss/policy_avg": 0.008618440479040146,
"loss/value_avg": 0.38125473260879517,
"lr": 2.47265625e-06,
"objective/entropy": -43.78852844238281,
"objective/kl": 45.42605209350586,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.775838851928711,
"objective/scores": 9.775838851928711,
"policy/approxkl_avg": 0.004843084141612053,
"policy/clipfrac_avg": 0.04662889242172241,
"policy/entropy_avg": 0.8556495904922485,
"step": 46,
"val/clipfrac_avg": 0.0007612881599925458,
"val/num_eos_tokens": 511,
"val/ratio": 0.9999899864196777,
"val/ratio_var": 1.0652936907717958e-05
},
{
"episode": 24064,
"epoch": 0.2061686086360521,
"eps": 5,
"loss/policy_avg": 0.00817467924207449,
"loss/value_avg": 0.31694096326828003,
"lr": 2.4609375e-06,
"objective/entropy": -40.105125427246094,
"objective/kl": 47.7975959777832,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.824735641479492,
"objective/scores": 9.824735641479492,
"policy/approxkl_avg": 0.0051739090122282505,
"policy/clipfrac_avg": 0.0448913611471653,
"policy/entropy_avg": 0.8633439540863037,
"step": 47,
"val/clipfrac_avg": 0.0006875419057905674,
"val/num_eos_tokens": 511,
"val/ratio": 0.9996302723884583,
"val/ratio_var": 9.456825864617713e-06
},
{
"episode": 24576,
"epoch": 0.2105551747772447,
"eps": 5,
"loss/policy_avg": 0.005447334609925747,
"loss/value_avg": 0.29413723945617676,
"lr": 2.4492187500000002e-06,
"objective/entropy": -35.29613494873047,
"objective/kl": 51.42682647705078,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 10.033798217773438,
"objective/scores": 10.033798217773438,
"policy/approxkl_avg": 0.005674813874065876,
"policy/clipfrac_avg": 0.0471038892865181,
"policy/entropy_avg": 0.8564262390136719,
"step": 48,
"val/clipfrac_avg": 0.0006171433487907052,
"val/num_eos_tokens": 511,
"val/ratio": 1.000201940536499,
"val/ratio_var": 1.2321422218519729e-05
},
{
"episode": 25088,
"epoch": 0.2149417409184373,
"eps": 5,
"loss/policy_avg": 0.004657389596104622,
"loss/value_avg": 0.2838543653488159,
"lr": 2.4375e-06,
"objective/entropy": -30.200895309448242,
"objective/kl": 56.05735778808594,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.83310317993164,
"objective/scores": 9.83310317993164,
"policy/approxkl_avg": 0.005895932205021381,
"policy/clipfrac_avg": 0.045708563178777695,
"policy/entropy_avg": 0.8755910396575928,
"step": 49,
"val/clipfrac_avg": 0.00047186214942485094,
"val/num_eos_tokens": 509,
"val/ratio": 1.0001733303070068,
"val/ratio_var": 1.263204376300564e-05
},
{
"episode": 25600,
"epoch": 0.21932830705962988,
"eps": 5,
"loss/policy_avg": 0.006238073576241732,
"loss/value_avg": 0.282005250453949,
"lr": 2.42578125e-06,
"objective/entropy": -25.376914978027344,
"objective/kl": 60.16917419433594,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.97581672668457,
"objective/scores": 9.97581672668457,
"policy/approxkl_avg": 0.006535097025334835,
"policy/clipfrac_avg": 0.04995926469564438,
"policy/entropy_avg": 0.8645215034484863,
"step": 50,
"val/clipfrac_avg": 0.000910063972696662,
"val/num_eos_tokens": 508,
"val/ratio": 1.0001347064971924,
"val/ratio_var": 1.2472723028622568e-05
},
{
"episode": 26112,
"epoch": 0.22371487320082248,
"eps": 5,
"loss/policy_avg": -0.00042364001274108887,
"loss/value_avg": 0.2829969525337219,
"lr": 2.4140625000000002e-06,
"objective/entropy": -19.165199279785156,
"objective/kl": 66.85098266601562,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 10.05324935913086,
"objective/scores": 10.05324935913086,
"policy/approxkl_avg": 0.006855464540421963,
"policy/clipfrac_avg": 0.04632896929979324,
"policy/entropy_avg": 0.8457518815994263,
"step": 51,
"val/clipfrac_avg": 0.000306067755445838,
"val/num_eos_tokens": 504,
"val/ratio": 0.9993095397949219,
"val/ratio_var": 1.3966228834760841e-05
},
{
"episode": 26624,
"epoch": 0.22810143934201507,
"eps": 5,
"loss/policy_avg": 0.000534743070602417,
"loss/value_avg": 0.30065596103668213,
"lr": 2.40234375e-06,
"objective/entropy": -15.858919143676758,
"objective/kl": 69.84648132324219,
"objective/non_score_reward": 0.0,
"objective/rlhf_reward": 9.9315185546875,
"objective/scores": 9.9315185546875,
"policy/approxkl_avg": 0.007337586954236031,
"policy/clipfrac_avg": 0.04750536382198334,
"policy/entropy_avg": 0.823326587677002,
"step": 52,
"val/clipfrac_avg": 0.00029861342045478523,
"val/num_eos_tokens": 495,
"val/ratio": 0.999434232711792,
"val/ratio_var": 1.7595832105143927e-05
}
],
"logging_steps": 100,
"max_steps": 256,
"num_input_tokens_seen": 0,
"num_train_epochs": 1.122960932145305,
"save_steps": 52,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}