Safetensors
qwen2
quest-corruption-7b-s375-v3-GRPO / trainer_state.json
kalomaze's picture
Add checkpoint-375: GRPO-trained corruption repair model
734f5ed verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 375,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 376.9687805175781,
"epoch": 0.0026666666666666666,
"grad_norm": 0.8290089342545105,
"kl": 0.00012969970703125,
"learning_rate": 2e-06,
"loss": 0.1243,
"reward": 0.26896461844444275,
"reward_std": 0.2736624479293823,
"rewards/length_reward": 0.026339290663599968,
"rewards/similarity_reward": 0.24262532591819763,
"step": 1
},
{
"completion_length": 363.27679443359375,
"epoch": 0.005333333333333333,
"grad_norm": 0.8474061525539559,
"kl": 0.00015354156494140625,
"learning_rate": 2e-06,
"loss": 0.1524,
"reward": 0.1628378927707672,
"reward_std": 0.21853798627853394,
"rewards/length_reward": 0.0178571455180645,
"rewards/similarity_reward": 0.14498072862625122,
"step": 2
},
{
"completion_length": 378.4732360839844,
"epoch": 0.008,
"grad_norm": 0.8192733588218007,
"kl": 0.0001430511474609375,
"learning_rate": 2e-06,
"loss": 0.1852,
"reward": 0.27797675132751465,
"reward_std": 0.23766961693763733,
"rewards/length_reward": 0.02633928880095482,
"rewards/similarity_reward": 0.25163745880126953,
"step": 3
},
{
"completion_length": 359.21429443359375,
"epoch": 0.010666666666666666,
"grad_norm": 0.845054880446133,
"kl": 0.00015544891357421875,
"learning_rate": 2e-06,
"loss": 0.0616,
"reward": 0.1538199633359909,
"reward_std": 0.17373259365558624,
"rewards/length_reward": 0.013839286752045155,
"rewards/similarity_reward": 0.13998067378997803,
"step": 4
},
{
"completion_length": 336.15625,
"epoch": 0.013333333333333334,
"grad_norm": 0.8717553512231005,
"kl": 0.00014972686767578125,
"learning_rate": 2e-06,
"loss": 0.1209,
"reward": 0.21149994432926178,
"reward_std": 0.2120143473148346,
"rewards/length_reward": 0.01830357313156128,
"rewards/similarity_reward": 0.1931963711977005,
"step": 5
},
{
"completion_length": 373.6026916503906,
"epoch": 0.016,
"grad_norm": 0.7610025457969676,
"kl": 0.00013446807861328125,
"learning_rate": 2e-06,
"loss": 0.1513,
"reward": 0.1827303022146225,
"reward_std": 0.24866001307964325,
"rewards/length_reward": 0.02187500335276127,
"rewards/similarity_reward": 0.16085529327392578,
"step": 6
},
{
"completion_length": 341.7589416503906,
"epoch": 0.018666666666666668,
"grad_norm": 0.8626611704865026,
"kl": 0.0001583099365234375,
"learning_rate": 2e-06,
"loss": 0.1271,
"reward": 0.19529196619987488,
"reward_std": 0.2814559042453766,
"rewards/length_reward": 0.021428575739264488,
"rewards/similarity_reward": 0.17386338114738464,
"step": 7
},
{
"completion_length": 424.98663330078125,
"epoch": 0.021333333333333333,
"grad_norm": 0.7146043340468313,
"kl": 0.00017452239990234375,
"learning_rate": 2e-06,
"loss": 0.1881,
"reward": 0.21610300242900848,
"reward_std": 0.2689198851585388,
"rewards/length_reward": 0.01830357313156128,
"rewards/similarity_reward": 0.197799414396286,
"step": 8
},
{
"completion_length": 348.2321472167969,
"epoch": 0.024,
"grad_norm": 0.7606250954270842,
"kl": 0.000186920166015625,
"learning_rate": 2e-06,
"loss": 0.1276,
"reward": 0.20473892986774445,
"reward_std": 0.23727914690971375,
"rewards/length_reward": 0.02008928917348385,
"rewards/similarity_reward": 0.18464964628219604,
"step": 9
},
{
"completion_length": 355.54913330078125,
"epoch": 0.02666666666666667,
"grad_norm": 0.8173370703071066,
"kl": 0.00018310546875,
"learning_rate": 2e-06,
"loss": 0.0787,
"reward": 0.227640300989151,
"reward_std": 0.2539962828159332,
"rewards/length_reward": 0.01741071790456772,
"rewards/similarity_reward": 0.21022957563400269,
"step": 10
},
{
"completion_length": 343.7232360839844,
"epoch": 0.029333333333333333,
"grad_norm": 0.9104553890156574,
"kl": 0.00019550323486328125,
"learning_rate": 2e-06,
"loss": 0.0575,
"reward": 0.25083568692207336,
"reward_std": 0.2815442383289337,
"rewards/length_reward": 0.02767857536673546,
"rewards/similarity_reward": 0.22315707802772522,
"step": 11
},
{
"completion_length": 355.0535888671875,
"epoch": 0.032,
"grad_norm": 0.8007928014475878,
"kl": 0.0003528594970703125,
"learning_rate": 2e-06,
"loss": 0.1448,
"reward": 0.2787685990333557,
"reward_std": 0.25941261649131775,
"rewards/length_reward": 0.025892863050103188,
"rewards/similarity_reward": 0.25287577509880066,
"step": 12
},
{
"completion_length": 395.4107360839844,
"epoch": 0.034666666666666665,
"grad_norm": 0.7050603406845205,
"kl": 0.000255584716796875,
"learning_rate": 2e-06,
"loss": 0.1127,
"reward": 0.31717172265052795,
"reward_std": 0.2762907147407532,
"rewards/length_reward": 0.02946428954601288,
"rewards/similarity_reward": 0.28770744800567627,
"step": 13
},
{
"completion_length": 361.05804443359375,
"epoch": 0.037333333333333336,
"grad_norm": 0.9360978406768153,
"kl": 0.0003376007080078125,
"learning_rate": 2e-06,
"loss": 0.142,
"reward": 0.24003884196281433,
"reward_std": 0.27974435687065125,
"rewards/length_reward": 0.021428575739264488,
"rewards/similarity_reward": 0.2186102569103241,
"step": 14
},
{
"completion_length": 398.0044860839844,
"epoch": 0.04,
"grad_norm": 0.7389563411116621,
"kl": 0.0003604888916015625,
"learning_rate": 2e-06,
"loss": 0.1137,
"reward": 0.23077349364757538,
"reward_std": 0.24957218766212463,
"rewards/length_reward": 0.0178571455180645,
"rewards/similarity_reward": 0.21291638910770416,
"step": 15
},
{
"completion_length": 366.4196472167969,
"epoch": 0.042666666666666665,
"grad_norm": 0.7079515986093292,
"kl": 0.0004177093505859375,
"learning_rate": 2e-06,
"loss": 0.1596,
"reward": 0.16758890450000763,
"reward_std": 0.1997506469488144,
"rewards/length_reward": 0.0178571455180645,
"rewards/similarity_reward": 0.14973175525665283,
"step": 16
},
{
"completion_length": 385.87054443359375,
"epoch": 0.04533333333333334,
"grad_norm": 0.7793857856999354,
"kl": 0.000385284423828125,
"learning_rate": 2e-06,
"loss": 0.1863,
"reward": 0.2408275306224823,
"reward_std": 0.28883570432662964,
"rewards/length_reward": 0.022321434691548347,
"rewards/similarity_reward": 0.2185060679912567,
"step": 17
},
{
"completion_length": 373.0446472167969,
"epoch": 0.048,
"grad_norm": 0.8181367418694138,
"kl": 0.0005035400390625,
"learning_rate": 2e-06,
"loss": 0.0998,
"reward": 0.2590915262699127,
"reward_std": 0.26667794585227966,
"rewards/length_reward": 0.02678571827709675,
"rewards/similarity_reward": 0.23230580985546112,
"step": 18
},
{
"completion_length": 314.1026916503906,
"epoch": 0.050666666666666665,
"grad_norm": 0.8473993736940718,
"kl": 0.000438690185546875,
"learning_rate": 2e-06,
"loss": 0.0861,
"reward": 0.34537845849990845,
"reward_std": 0.26645439863204956,
"rewards/length_reward": 0.03705357387661934,
"rewards/similarity_reward": 0.3083249032497406,
"step": 19
},
{
"completion_length": 354.93304443359375,
"epoch": 0.05333333333333334,
"grad_norm": 0.8882148635006984,
"kl": 0.00057220458984375,
"learning_rate": 2e-06,
"loss": 0.1732,
"reward": 0.3256897032260895,
"reward_std": 0.25436195731163025,
"rewards/length_reward": 0.025446433573961258,
"rewards/similarity_reward": 0.3002432584762573,
"step": 20
},
{
"completion_length": 354.1250305175781,
"epoch": 0.056,
"grad_norm": 0.8096248306365297,
"kl": 0.000713348388671875,
"learning_rate": 2e-06,
"loss": 0.1809,
"reward": 0.42166247963905334,
"reward_std": 0.2462671995162964,
"rewards/length_reward": 0.0401785746216774,
"rewards/similarity_reward": 0.38148391246795654,
"step": 21
},
{
"completion_length": 379.8258972167969,
"epoch": 0.058666666666666666,
"grad_norm": 0.8324528992208251,
"kl": 0.000713348388671875,
"learning_rate": 2e-06,
"loss": 0.1888,
"reward": 0.4674707353115082,
"reward_std": 0.28602704405784607,
"rewards/length_reward": 0.03883929178118706,
"rewards/similarity_reward": 0.42863139510154724,
"step": 22
},
{
"completion_length": 336.1964416503906,
"epoch": 0.06133333333333333,
"grad_norm": 0.883659092626158,
"kl": 0.001129150390625,
"learning_rate": 2e-06,
"loss": 0.1224,
"reward": 0.3572904169559479,
"reward_std": 0.2817726135253906,
"rewards/length_reward": 0.030803577974438667,
"rewards/similarity_reward": 0.32648688554763794,
"step": 23
},
{
"completion_length": 379.8750305175781,
"epoch": 0.064,
"grad_norm": 0.7759947978320788,
"kl": 0.00135040283203125,
"learning_rate": 2e-06,
"loss": 0.163,
"reward": 0.34559932351112366,
"reward_std": 0.26974016427993774,
"rewards/length_reward": 0.03705357387661934,
"rewards/similarity_reward": 0.3085457384586334,
"step": 24
},
{
"completion_length": 334.71875,
"epoch": 0.06666666666666667,
"grad_norm": 0.8358157724868338,
"kl": 0.001068115234375,
"learning_rate": 2e-06,
"loss": 0.1326,
"reward": 0.3908008337020874,
"reward_std": 0.3024666905403137,
"rewards/length_reward": 0.03928571566939354,
"rewards/similarity_reward": 0.35151511430740356,
"step": 25
},
{
"completion_length": 372.55804443359375,
"epoch": 0.06933333333333333,
"grad_norm": 0.7610834907565935,
"kl": 0.0027008056640625,
"learning_rate": 2e-06,
"loss": 0.1688,
"reward": 0.2864897847175598,
"reward_std": 0.2402629852294922,
"rewards/length_reward": 0.02500000409781933,
"rewards/similarity_reward": 0.2614898085594177,
"step": 26
},
{
"completion_length": 332.1651916503906,
"epoch": 0.072,
"grad_norm": 0.9340372327621089,
"kl": 0.00152587890625,
"learning_rate": 2e-06,
"loss": 0.1936,
"reward": 0.32003700733184814,
"reward_std": 0.28589487075805664,
"rewards/length_reward": 0.0334821492433548,
"rewards/similarity_reward": 0.28655487298965454,
"step": 27
},
{
"completion_length": 421.08929443359375,
"epoch": 0.07466666666666667,
"grad_norm": 0.7506727526601732,
"kl": 0.00148773193359375,
"learning_rate": 2e-06,
"loss": 0.1452,
"reward": 0.39091238379478455,
"reward_std": 0.21896174550056458,
"rewards/length_reward": 0.03482143208384514,
"rewards/similarity_reward": 0.3560909032821655,
"step": 28
},
{
"completion_length": 381.3348388671875,
"epoch": 0.07733333333333334,
"grad_norm": 0.6952200929063435,
"kl": 0.0017242431640625,
"learning_rate": 2e-06,
"loss": 0.1113,
"reward": 0.36936715245246887,
"reward_std": 0.23492401838302612,
"rewards/length_reward": 0.030357148498296738,
"rewards/similarity_reward": 0.33900997042655945,
"step": 29
},
{
"completion_length": 334.6607360839844,
"epoch": 0.08,
"grad_norm": 0.9106612624614471,
"kl": 0.00170135498046875,
"learning_rate": 2e-06,
"loss": 0.0756,
"reward": 0.4293070435523987,
"reward_std": 0.2823811173439026,
"rewards/length_reward": 0.0401785746216774,
"rewards/similarity_reward": 0.3891284763813019,
"step": 30
},
{
"completion_length": 461.99554443359375,
"epoch": 0.08266666666666667,
"grad_norm": 0.661013366993289,
"kl": 0.0020751953125,
"learning_rate": 2e-06,
"loss": 0.2489,
"reward": 0.3686121702194214,
"reward_std": 0.2422313094139099,
"rewards/length_reward": 0.028571434319019318,
"rewards/similarity_reward": 0.3400407135486603,
"step": 31
},
{
"completion_length": 354.83038330078125,
"epoch": 0.08533333333333333,
"grad_norm": 0.9261837111640477,
"kl": 0.0028533935546875,
"learning_rate": 2e-06,
"loss": 0.3225,
"reward": 0.3788739740848541,
"reward_std": 0.26962369680404663,
"rewards/length_reward": 0.03482143208384514,
"rewards/similarity_reward": 0.34405258297920227,
"step": 32
},
{
"completion_length": 355.5357360839844,
"epoch": 0.088,
"grad_norm": 0.8643961614615921,
"kl": 0.0026397705078125,
"learning_rate": 2e-06,
"loss": 0.1893,
"reward": 0.3799653649330139,
"reward_std": 0.2525205910205841,
"rewards/length_reward": 0.03303571790456772,
"rewards/similarity_reward": 0.346929669380188,
"step": 33
},
{
"completion_length": 303.2901916503906,
"epoch": 0.09066666666666667,
"grad_norm": 0.9531070627714385,
"kl": 0.002532958984375,
"learning_rate": 2e-06,
"loss": 0.0718,
"reward": 0.435234397649765,
"reward_std": 0.2490427941083908,
"rewards/length_reward": 0.03883928805589676,
"rewards/similarity_reward": 0.39639511704444885,
"step": 34
},
{
"completion_length": 391.89288330078125,
"epoch": 0.09333333333333334,
"grad_norm": 0.8111465127450542,
"kl": 0.0019683837890625,
"learning_rate": 2e-06,
"loss": 0.1647,
"reward": 0.4038808047771454,
"reward_std": 0.31206637620925903,
"rewards/length_reward": 0.03883928805589676,
"rewards/similarity_reward": 0.3650415241718292,
"step": 35
},
{
"completion_length": 359.9196472167969,
"epoch": 0.096,
"grad_norm": 0.8281785424636492,
"kl": 0.00335693359375,
"learning_rate": 2e-06,
"loss": 0.1875,
"reward": 0.4340634346008301,
"reward_std": 0.27002009749412537,
"rewards/length_reward": 0.04151785373687744,
"rewards/similarity_reward": 0.39254552125930786,
"step": 36
},
{
"completion_length": 287.6875,
"epoch": 0.09866666666666667,
"grad_norm": 1.053364246590671,
"kl": 0.00518798828125,
"learning_rate": 2e-06,
"loss": 0.085,
"reward": 0.47383809089660645,
"reward_std": 0.22637499868869781,
"rewards/length_reward": 0.04374999925494194,
"rewards/similarity_reward": 0.430088073015213,
"step": 37
},
{
"completion_length": 398.1160888671875,
"epoch": 0.10133333333333333,
"grad_norm": 0.7805356469167566,
"kl": 0.00341796875,
"learning_rate": 2e-06,
"loss": 0.2197,
"reward": 0.4928036332130432,
"reward_std": 0.24267539381980896,
"rewards/length_reward": 0.050446417182683945,
"rewards/similarity_reward": 0.4423570930957794,
"step": 38
},
{
"completion_length": 312.7276916503906,
"epoch": 0.104,
"grad_norm": 0.819085900048414,
"kl": 0.00177001953125,
"learning_rate": 2e-06,
"loss": 0.1426,
"reward": 0.5058891177177429,
"reward_std": 0.24202971160411835,
"rewards/length_reward": 0.04776785522699356,
"rewards/similarity_reward": 0.4581212103366852,
"step": 39
},
{
"completion_length": 386.8214416503906,
"epoch": 0.10666666666666667,
"grad_norm": 0.7650271991129984,
"kl": 0.0030364990234375,
"learning_rate": 2e-06,
"loss": 0.1882,
"reward": 0.4022373855113983,
"reward_std": 0.24781934916973114,
"rewards/length_reward": 0.0401785746216774,
"rewards/similarity_reward": 0.3620587885379791,
"step": 40
},
{
"completion_length": 250.4107208251953,
"epoch": 0.10933333333333334,
"grad_norm": 0.9803970599540968,
"kl": 0.003875732421875,
"learning_rate": 2e-06,
"loss": -0.0008,
"reward": 0.5008234977722168,
"reward_std": 0.22121772170066833,
"rewards/length_reward": 0.050446417182683945,
"rewards/similarity_reward": 0.45037704706192017,
"step": 41
},
{
"completion_length": 317.1160888671875,
"epoch": 0.112,
"grad_norm": 0.8986374178737812,
"kl": 0.00341796875,
"learning_rate": 2e-06,
"loss": 0.2065,
"reward": 0.45162686705589294,
"reward_std": 0.27914097905158997,
"rewards/length_reward": 0.04598214104771614,
"rewards/similarity_reward": 0.4056447148323059,
"step": 42
},
{
"completion_length": 313.4821472167969,
"epoch": 0.11466666666666667,
"grad_norm": 0.9439169733302692,
"kl": 0.0040283203125,
"learning_rate": 2e-06,
"loss": 0.1189,
"reward": 0.5015469789505005,
"reward_std": 0.2071218341588974,
"rewards/length_reward": 0.050892848521471024,
"rewards/similarity_reward": 0.4506540596485138,
"step": 43
},
{
"completion_length": 308.23663330078125,
"epoch": 0.11733333333333333,
"grad_norm": 0.843848181524653,
"kl": 0.004150390625,
"learning_rate": 2e-06,
"loss": 0.1423,
"reward": 0.49464964866638184,
"reward_std": 0.19760312139987946,
"rewards/length_reward": 0.05223213508725166,
"rewards/similarity_reward": 0.4424174726009369,
"step": 44
},
{
"completion_length": 322.51788330078125,
"epoch": 0.12,
"grad_norm": 0.808670899867239,
"kl": 0.004730224609375,
"learning_rate": 2e-06,
"loss": 0.2162,
"reward": 0.45207658410072327,
"reward_std": 0.22255302965641022,
"rewards/length_reward": 0.03883929178118706,
"rewards/similarity_reward": 0.4132373034954071,
"step": 45
},
{
"completion_length": 306.9732360839844,
"epoch": 0.12266666666666666,
"grad_norm": 0.9215129819151354,
"kl": 0.0038604736328125,
"learning_rate": 2e-06,
"loss": 0.1492,
"reward": 0.5217949151992798,
"reward_std": 0.24197062849998474,
"rewards/length_reward": 0.04598213732242584,
"rewards/similarity_reward": 0.47581273317337036,
"step": 46
},
{
"completion_length": 305.45538330078125,
"epoch": 0.12533333333333332,
"grad_norm": 0.8829840145792894,
"kl": 0.00518798828125,
"learning_rate": 2e-06,
"loss": 0.2397,
"reward": 0.471711665391922,
"reward_std": 0.15981332957744598,
"rewards/length_reward": 0.049553561955690384,
"rewards/similarity_reward": 0.42215806245803833,
"step": 47
},
{
"completion_length": 294.4419860839844,
"epoch": 0.128,
"grad_norm": 0.8246638858466566,
"kl": 0.004241943359375,
"learning_rate": 2e-06,
"loss": 0.0913,
"reward": 0.4828924238681793,
"reward_std": 0.19750112295150757,
"rewards/length_reward": 0.053124990314245224,
"rewards/similarity_reward": 0.429767370223999,
"step": 48
},
{
"completion_length": 332.45538330078125,
"epoch": 0.13066666666666665,
"grad_norm": 0.7904388485187617,
"kl": 0.004913330078125,
"learning_rate": 2e-06,
"loss": 0.1529,
"reward": 0.45801258087158203,
"reward_std": 0.2542867660522461,
"rewards/length_reward": 0.04196428507566452,
"rewards/similarity_reward": 0.4160482585430145,
"step": 49
},
{
"completion_length": 281.77679443359375,
"epoch": 0.13333333333333333,
"grad_norm": 0.9845645428183626,
"kl": 0.00433349609375,
"learning_rate": 2e-06,
"loss": 0.0801,
"reward": 0.5682670474052429,
"reward_std": 0.23296673595905304,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5030884742736816,
"step": 50
},
{
"completion_length": 316.7410888671875,
"epoch": 0.136,
"grad_norm": 0.9300429591791828,
"kl": 0.00592041015625,
"learning_rate": 2e-06,
"loss": 0.103,
"reward": 0.44630467891693115,
"reward_std": 0.12811601161956787,
"rewards/length_reward": 0.04821427911520004,
"rewards/similarity_reward": 0.3980904519557953,
"step": 51
},
{
"completion_length": 296.8973388671875,
"epoch": 0.13866666666666666,
"grad_norm": 0.8592422567531082,
"kl": 0.005218505859375,
"learning_rate": 2e-06,
"loss": 0.1159,
"reward": 0.5130535960197449,
"reward_std": 0.1873682290315628,
"rewards/length_reward": 0.0491071380674839,
"rewards/similarity_reward": 0.4639464318752289,
"step": 52
},
{
"completion_length": 284.0401916503906,
"epoch": 0.14133333333333334,
"grad_norm": 0.8593724590061699,
"kl": 0.0048828125,
"learning_rate": 2e-06,
"loss": 0.0087,
"reward": 0.5250208973884583,
"reward_std": 0.21563619375228882,
"rewards/length_reward": 0.057142842561006546,
"rewards/similarity_reward": 0.46787798404693604,
"step": 53
},
{
"completion_length": 268.21429443359375,
"epoch": 0.144,
"grad_norm": 0.9220995505083402,
"kl": 0.005645751953125,
"learning_rate": 2e-06,
"loss": 0.0234,
"reward": 0.6078009605407715,
"reward_std": 0.18404294550418854,
"rewards/length_reward": 0.04508928582072258,
"rewards/similarity_reward": 0.5627117156982422,
"step": 54
},
{
"completion_length": 332.2232360839844,
"epoch": 0.14666666666666667,
"grad_norm": 0.8153244083746986,
"kl": 0.004852294921875,
"learning_rate": 2e-06,
"loss": 0.1089,
"reward": 0.5709711313247681,
"reward_std": 0.18112631142139435,
"rewards/length_reward": 0.058035701513290405,
"rewards/similarity_reward": 0.5129354596138,
"step": 55
},
{
"completion_length": 298.62054443359375,
"epoch": 0.14933333333333335,
"grad_norm": 0.861900790561817,
"kl": 0.00567626953125,
"learning_rate": 2e-06,
"loss": 0.1037,
"reward": 0.5129757523536682,
"reward_std": 0.21154648065567017,
"rewards/length_reward": 0.05401784926652908,
"rewards/similarity_reward": 0.45895785093307495,
"step": 56
},
{
"completion_length": 243.4241180419922,
"epoch": 0.152,
"grad_norm": 0.928994891862699,
"kl": 0.004241943359375,
"learning_rate": 2e-06,
"loss": 0.0328,
"reward": 0.6340307593345642,
"reward_std": 0.16285859048366547,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.56840580701828,
"step": 57
},
{
"completion_length": 293.0669860839844,
"epoch": 0.15466666666666667,
"grad_norm": 0.89777989101008,
"kl": 0.00579833984375,
"learning_rate": 2e-06,
"loss": 0.0313,
"reward": 0.5502158999443054,
"reward_std": 0.1914074867963791,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.48503735661506653,
"step": 58
},
{
"completion_length": 348.3973388671875,
"epoch": 0.15733333333333333,
"grad_norm": 0.830581459672137,
"kl": 0.005706787109375,
"learning_rate": 2e-06,
"loss": 0.1262,
"reward": 0.5427281260490417,
"reward_std": 0.18273915350437164,
"rewards/length_reward": 0.04508928582072258,
"rewards/similarity_reward": 0.4976387917995453,
"step": 59
},
{
"completion_length": 300.43304443359375,
"epoch": 0.16,
"grad_norm": 0.7841145484798535,
"kl": 0.007080078125,
"learning_rate": 2e-06,
"loss": 0.0356,
"reward": 0.6150097846984863,
"reward_std": 0.15837538242340088,
"rewards/length_reward": 0.06383926421403885,
"rewards/similarity_reward": 0.5511705279350281,
"step": 60
},
{
"completion_length": 291.64288330078125,
"epoch": 0.16266666666666665,
"grad_norm": 0.9759946887460155,
"kl": 0.005645751953125,
"learning_rate": 2e-06,
"loss": 0.1347,
"reward": 0.6720048785209656,
"reward_std": 0.16562286019325256,
"rewards/length_reward": 0.06428569555282593,
"rewards/similarity_reward": 0.6077191233634949,
"step": 61
},
{
"completion_length": 300.3973388671875,
"epoch": 0.16533333333333333,
"grad_norm": 0.8353754395282778,
"kl": 0.00592041015625,
"learning_rate": 2e-06,
"loss": 0.1508,
"reward": 0.6174642443656921,
"reward_std": 0.1775916963815689,
"rewards/length_reward": 0.05848212540149689,
"rewards/similarity_reward": 0.5589820742607117,
"step": 62
},
{
"completion_length": 299.0,
"epoch": 0.168,
"grad_norm": 0.8434412806636016,
"kl": 0.0068359375,
"learning_rate": 2e-06,
"loss": 0.0787,
"reward": 0.5795109272003174,
"reward_std": 0.18212977051734924,
"rewards/length_reward": 0.056249987334012985,
"rewards/similarity_reward": 0.5232609510421753,
"step": 63
},
{
"completion_length": 314.3883972167969,
"epoch": 0.17066666666666666,
"grad_norm": 1.1818234014256608,
"kl": 0.005706787109375,
"learning_rate": 2e-06,
"loss": 0.2756,
"reward": 0.5499185919761658,
"reward_std": 0.22555634379386902,
"rewards/length_reward": 0.04776785522699356,
"rewards/similarity_reward": 0.5021507143974304,
"step": 64
},
{
"completion_length": 259.1785888671875,
"epoch": 0.17333333333333334,
"grad_norm": 0.9529921486665629,
"kl": 0.006439208984375,
"learning_rate": 2e-06,
"loss": 0.0704,
"reward": 0.5430376529693604,
"reward_std": 0.2042228877544403,
"rewards/length_reward": 0.052232131361961365,
"rewards/similarity_reward": 0.4908054769039154,
"step": 65
},
{
"completion_length": 258.0714416503906,
"epoch": 0.176,
"grad_norm": 1.1488507934693044,
"kl": 0.006805419921875,
"learning_rate": 2e-06,
"loss": 0.1057,
"reward": 0.570214033126831,
"reward_std": 0.160283625125885,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.505035400390625,
"step": 66
},
{
"completion_length": 330.90179443359375,
"epoch": 0.17866666666666667,
"grad_norm": 0.912379625363708,
"kl": 0.0062255859375,
"learning_rate": 2e-06,
"loss": 0.1291,
"reward": 0.49484553933143616,
"reward_std": 0.21234968304634094,
"rewards/length_reward": 0.04553570970892906,
"rewards/similarity_reward": 0.4493098556995392,
"step": 67
},
{
"completion_length": 266.46429443359375,
"epoch": 0.18133333333333335,
"grad_norm": 0.9382639131370187,
"kl": 0.00909423828125,
"learning_rate": 2e-06,
"loss": 0.0685,
"reward": 0.5787621736526489,
"reward_std": 0.17865508794784546,
"rewards/length_reward": 0.054464273154735565,
"rewards/similarity_reward": 0.5242978930473328,
"step": 68
},
{
"completion_length": 285.25,
"epoch": 0.184,
"grad_norm": 0.8385679542137942,
"kl": 0.00555419921875,
"learning_rate": 2e-06,
"loss": 0.046,
"reward": 0.6689252257347107,
"reward_std": 0.16466915607452393,
"rewards/length_reward": 0.06651783734560013,
"rewards/similarity_reward": 0.6024073958396912,
"step": 69
},
{
"completion_length": 247.68751525878906,
"epoch": 0.18666666666666668,
"grad_norm": 1.01200025847724,
"kl": 0.00860595703125,
"learning_rate": 2e-06,
"loss": 0.1382,
"reward": 0.4780524969100952,
"reward_std": 0.19645950198173523,
"rewards/length_reward": 0.04196428507566452,
"rewards/similarity_reward": 0.4360882043838501,
"step": 70
},
{
"completion_length": 307.24554443359375,
"epoch": 0.18933333333333333,
"grad_norm": 0.8185082695628789,
"kl": 0.00787353515625,
"learning_rate": 2e-06,
"loss": 0.0749,
"reward": 0.5303549766540527,
"reward_std": 0.1896388977766037,
"rewards/length_reward": 0.056696414947509766,
"rewards/similarity_reward": 0.4736584722995758,
"step": 71
},
{
"completion_length": 329.15179443359375,
"epoch": 0.192,
"grad_norm": 0.8562549539520792,
"kl": 0.00823974609375,
"learning_rate": 2e-06,
"loss": 0.1642,
"reward": 0.5008990168571472,
"reward_std": 0.17187656462192535,
"rewards/length_reward": 0.050892848521471024,
"rewards/similarity_reward": 0.4500061273574829,
"step": 72
},
{
"completion_length": 253.59376525878906,
"epoch": 0.19466666666666665,
"grad_norm": 0.8806238339574037,
"kl": 0.006591796875,
"learning_rate": 2e-06,
"loss": 0.1082,
"reward": 0.7047773003578186,
"reward_std": 0.12662379443645477,
"rewards/length_reward": 0.07232140004634857,
"rewards/similarity_reward": 0.6324558258056641,
"step": 73
},
{
"completion_length": 302.7321472167969,
"epoch": 0.19733333333333333,
"grad_norm": 0.888373625390179,
"kl": 0.01300048828125,
"learning_rate": 2e-06,
"loss": 0.0384,
"reward": 0.5046581625938416,
"reward_std": 0.18071489036083221,
"rewards/length_reward": 0.0491071380674839,
"rewards/similarity_reward": 0.45555105805397034,
"step": 74
},
{
"completion_length": 309.0982360839844,
"epoch": 0.2,
"grad_norm": 0.8352994571315709,
"kl": 0.0081787109375,
"learning_rate": 2e-06,
"loss": 0.1895,
"reward": 0.6111252903938293,
"reward_std": 0.19863885641098022,
"rewards/length_reward": 0.054464273154735565,
"rewards/similarity_reward": 0.5566610097885132,
"step": 75
},
{
"completion_length": 255.25001525878906,
"epoch": 0.20266666666666666,
"grad_norm": 1.0786021298964794,
"kl": 0.0089111328125,
"learning_rate": 2e-06,
"loss": 0.0384,
"reward": 0.5118966102600098,
"reward_std": 0.1661101132631302,
"rewards/length_reward": 0.05223213508725166,
"rewards/similarity_reward": 0.4596644341945648,
"step": 76
},
{
"completion_length": 299.92413330078125,
"epoch": 0.20533333333333334,
"grad_norm": 0.887828324089484,
"kl": 0.0128173828125,
"learning_rate": 2e-06,
"loss": 0.1341,
"reward": 0.5058793425559998,
"reward_std": 0.2038315385580063,
"rewards/length_reward": 0.05044642463326454,
"rewards/similarity_reward": 0.45543283224105835,
"step": 77
},
{
"completion_length": 300.26788330078125,
"epoch": 0.208,
"grad_norm": 0.960422578229874,
"kl": 0.01080322265625,
"learning_rate": 2e-06,
"loss": 0.2173,
"reward": 0.5535677075386047,
"reward_std": 0.16259299218654633,
"rewards/length_reward": 0.06205355376005173,
"rewards/similarity_reward": 0.49151411652565,
"step": 78
},
{
"completion_length": 256.21875,
"epoch": 0.21066666666666667,
"grad_norm": 0.9394611442130687,
"kl": 0.01171875,
"learning_rate": 2e-06,
"loss": 0.031,
"reward": 0.6236703991889954,
"reward_std": 0.16783180832862854,
"rewards/length_reward": 0.05982141196727753,
"rewards/similarity_reward": 0.5638489127159119,
"step": 79
},
{
"completion_length": 279.9419860839844,
"epoch": 0.21333333333333335,
"grad_norm": 1.0860808591863038,
"kl": 0.0096435546875,
"learning_rate": 2e-06,
"loss": 0.2206,
"reward": 0.5311146974563599,
"reward_std": 0.20672693848609924,
"rewards/length_reward": 0.052678562700748444,
"rewards/similarity_reward": 0.47843608260154724,
"step": 80
},
{
"completion_length": 302.7008972167969,
"epoch": 0.216,
"grad_norm": 0.7695067843560371,
"kl": 0.00653076171875,
"learning_rate": 2e-06,
"loss": 0.0387,
"reward": 0.5848192572593689,
"reward_std": 0.20639710128307343,
"rewards/length_reward": 0.060267843306064606,
"rewards/similarity_reward": 0.5245514512062073,
"step": 81
},
{
"completion_length": 247.18751525878906,
"epoch": 0.21866666666666668,
"grad_norm": 1.0172061111487714,
"kl": 0.009765625,
"learning_rate": 2e-06,
"loss": 0.0528,
"reward": 0.6127398610115051,
"reward_std": 0.13182979822158813,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5471147894859314,
"step": 82
},
{
"completion_length": 303.40625,
"epoch": 0.22133333333333333,
"grad_norm": 0.7610369219853271,
"kl": 0.00921630859375,
"learning_rate": 2e-06,
"loss": 0.1034,
"reward": 0.5650977492332458,
"reward_std": 0.16646917164325714,
"rewards/length_reward": 0.056696418672800064,
"rewards/similarity_reward": 0.5084013342857361,
"step": 83
},
{
"completion_length": 296.625,
"epoch": 0.224,
"grad_norm": 0.8819878638905205,
"kl": 0.00653076171875,
"learning_rate": 2e-06,
"loss": 0.0509,
"reward": 0.6824392676353455,
"reward_std": 0.13198219239711761,
"rewards/length_reward": 0.07678568363189697,
"rewards/similarity_reward": 0.6056535243988037,
"step": 84
},
{
"completion_length": 264.4375,
"epoch": 0.22666666666666666,
"grad_norm": 0.9860703009968039,
"kl": 0.0103759765625,
"learning_rate": 2e-06,
"loss": 0.1082,
"reward": 0.5452346205711365,
"reward_std": 0.18002980947494507,
"rewards/length_reward": 0.055357132107019424,
"rewards/similarity_reward": 0.48987752199172974,
"step": 85
},
{
"completion_length": 275.1696472167969,
"epoch": 0.22933333333333333,
"grad_norm": 0.876007599982239,
"kl": 0.007720947265625,
"learning_rate": 2e-06,
"loss": 0.1075,
"reward": 0.5654360055923462,
"reward_std": 0.15497317910194397,
"rewards/length_reward": 0.050892848521471024,
"rewards/similarity_reward": 0.5145430564880371,
"step": 86
},
{
"completion_length": 268.3883972167969,
"epoch": 0.232,
"grad_norm": 0.9877196366166759,
"kl": 0.00872802734375,
"learning_rate": 2e-06,
"loss": 0.0664,
"reward": 0.6144102811813354,
"reward_std": 0.1374298632144928,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5469995141029358,
"step": 87
},
{
"completion_length": 339.67413330078125,
"epoch": 0.23466666666666666,
"grad_norm": 0.6868950012188707,
"kl": 0.006072998046875,
"learning_rate": 2e-06,
"loss": 0.0478,
"reward": 0.6562062501907349,
"reward_std": 0.1523490995168686,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5843312740325928,
"step": 88
},
{
"completion_length": 262.4598388671875,
"epoch": 0.23733333333333334,
"grad_norm": 0.8224596138062096,
"kl": 0.006378173828125,
"learning_rate": 2e-06,
"loss": 0.101,
"reward": 0.6333271265029907,
"reward_std": 0.170791357755661,
"rewards/length_reward": 0.06785711646080017,
"rewards/similarity_reward": 0.5654700398445129,
"step": 89
},
{
"completion_length": 256.42413330078125,
"epoch": 0.24,
"grad_norm": 0.9173507116779652,
"kl": 0.00946044921875,
"learning_rate": 2e-06,
"loss": 0.0437,
"reward": 0.5871028900146484,
"reward_std": 0.16378919780254364,
"rewards/length_reward": 0.04732142388820648,
"rewards/similarity_reward": 0.5397815108299255,
"step": 90
},
{
"completion_length": 350.6562805175781,
"epoch": 0.24266666666666667,
"grad_norm": 0.7989839039096065,
"kl": 0.01251220703125,
"learning_rate": 2e-06,
"loss": 0.0614,
"reward": 0.5199065208435059,
"reward_std": 0.20764127373695374,
"rewards/length_reward": 0.05044642463326454,
"rewards/similarity_reward": 0.4694600999355316,
"step": 91
},
{
"completion_length": 249.0044708251953,
"epoch": 0.24533333333333332,
"grad_norm": 0.9605121844841826,
"kl": 0.01123046875,
"learning_rate": 2e-06,
"loss": 0.036,
"reward": 0.6044343113899231,
"reward_std": 0.164906844496727,
"rewards/length_reward": 0.057589270174503326,
"rewards/similarity_reward": 0.5468449592590332,
"step": 92
},
{
"completion_length": 312.2232360839844,
"epoch": 0.248,
"grad_norm": 0.836379585280954,
"kl": 0.007781982421875,
"learning_rate": 2e-06,
"loss": 0.0948,
"reward": 0.5776776075363159,
"reward_std": 0.15271225571632385,
"rewards/length_reward": 0.055803555995225906,
"rewards/similarity_reward": 0.5218740701675415,
"step": 93
},
{
"completion_length": 311.3482360839844,
"epoch": 0.25066666666666665,
"grad_norm": 0.7945534599494852,
"kl": 0.00897216796875,
"learning_rate": 2e-06,
"loss": 0.1441,
"reward": 0.6111860275268555,
"reward_std": 0.13805179297924042,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5442216992378235,
"step": 94
},
{
"completion_length": 234.93751525878906,
"epoch": 0.25333333333333335,
"grad_norm": 1.2028189171950667,
"kl": 0.00909423828125,
"learning_rate": 2e-06,
"loss": 0.1766,
"reward": 0.5961614847183228,
"reward_std": 0.16394107043743134,
"rewards/length_reward": 0.05982141196727753,
"rewards/similarity_reward": 0.5363399982452393,
"step": 95
},
{
"completion_length": 305.6964416503906,
"epoch": 0.256,
"grad_norm": 0.8067577376172387,
"kl": 0.00982666015625,
"learning_rate": 2e-06,
"loss": 0.0671,
"reward": 0.656367301940918,
"reward_std": 0.12278923392295837,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.5876173377037048,
"step": 96
},
{
"completion_length": 327.71429443359375,
"epoch": 0.25866666666666666,
"grad_norm": 0.770922327161602,
"kl": 0.0064697265625,
"learning_rate": 2e-06,
"loss": 0.0279,
"reward": 0.5429174304008484,
"reward_std": 0.16670171916484833,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.47729235887527466,
"step": 97
},
{
"completion_length": 256.62054443359375,
"epoch": 0.2613333333333333,
"grad_norm": 1.051628532134925,
"kl": 0.0072021484375,
"learning_rate": 2e-06,
"loss": 0.0872,
"reward": 0.5515283346176147,
"reward_std": 0.17894278466701508,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.48947471380233765,
"step": 98
},
{
"completion_length": 268.59375,
"epoch": 0.264,
"grad_norm": 0.8240697442290599,
"kl": 0.008544921875,
"learning_rate": 2e-06,
"loss": 0.0672,
"reward": 0.6131307482719421,
"reward_std": 0.17423538863658905,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.5510770678520203,
"step": 99
},
{
"completion_length": 257.37054443359375,
"epoch": 0.26666666666666666,
"grad_norm": 131.12733526048441,
"kl": 0.1923828125,
"learning_rate": 2e-06,
"loss": 0.0817,
"reward": 0.6513006687164307,
"reward_std": 0.14835356175899506,
"rewards/length_reward": 0.06830354779958725,
"rewards/similarity_reward": 0.5829971432685852,
"step": 100
},
{
"completion_length": 307.1473388671875,
"epoch": 0.2693333333333333,
"grad_norm": 0.837570841896231,
"kl": 0.0054931640625,
"learning_rate": 2e-06,
"loss": 0.0194,
"reward": 0.6764991283416748,
"reward_std": 0.1323472112417221,
"rewards/length_reward": 0.07410712540149689,
"rewards/similarity_reward": 0.6023918986320496,
"step": 101
},
{
"completion_length": 265.5044860839844,
"epoch": 0.272,
"grad_norm": 0.7713890189466205,
"kl": 0.00830078125,
"learning_rate": 2e-06,
"loss": 0.0212,
"reward": 0.6779460310935974,
"reward_std": 0.12496771663427353,
"rewards/length_reward": 0.07276783138513565,
"rewards/similarity_reward": 0.6051782369613647,
"step": 102
},
{
"completion_length": 280.46429443359375,
"epoch": 0.27466666666666667,
"grad_norm": 0.8825358886169125,
"kl": 0.008056640625,
"learning_rate": 2e-06,
"loss": 0.1086,
"reward": 0.5848525166511536,
"reward_std": 0.11382713168859482,
"rewards/length_reward": 0.06071426719427109,
"rewards/similarity_reward": 0.5241381525993347,
"step": 103
},
{
"completion_length": 320.3571472167969,
"epoch": 0.2773333333333333,
"grad_norm": 0.8471202564701443,
"kl": 0.0069580078125,
"learning_rate": 2e-06,
"loss": 0.0388,
"reward": 0.6058804988861084,
"reward_std": 0.15757833421230316,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5407018661499023,
"step": 104
},
{
"completion_length": 287.0848388671875,
"epoch": 0.28,
"grad_norm": 0.885756988877436,
"kl": 0.00860595703125,
"learning_rate": 2e-06,
"loss": 0.089,
"reward": 0.6150888204574585,
"reward_std": 0.1344638168811798,
"rewards/length_reward": 0.057589273899793625,
"rewards/similarity_reward": 0.5574995875358582,
"step": 105
},
{
"completion_length": 293.1339416503906,
"epoch": 0.2826666666666667,
"grad_norm": 0.9299759085944364,
"kl": 0.01336669921875,
"learning_rate": 2e-06,
"loss": 0.0428,
"reward": 0.560218334197998,
"reward_std": 0.2031860500574112,
"rewards/length_reward": 0.058035701513290405,
"rewards/similarity_reward": 0.50218266248703,
"step": 106
},
{
"completion_length": 292.75,
"epoch": 0.2853333333333333,
"grad_norm": 0.8374882655316597,
"kl": 0.00848388671875,
"learning_rate": 2e-06,
"loss": 0.0683,
"reward": 0.6374148726463318,
"reward_std": 0.15000107884407043,
"rewards/length_reward": 0.06919640302658081,
"rewards/similarity_reward": 0.5682184100151062,
"step": 107
},
{
"completion_length": 263.3571472167969,
"epoch": 0.288,
"grad_norm": 1.0433586800088648,
"kl": 0.0078125,
"learning_rate": 2e-06,
"loss": 0.0913,
"reward": 0.5456939935684204,
"reward_std": 0.1411367952823639,
"rewards/length_reward": 0.056696414947509766,
"rewards/similarity_reward": 0.48899757862091064,
"step": 108
},
{
"completion_length": 263.9107360839844,
"epoch": 0.2906666666666667,
"grad_norm": 0.9650468316923807,
"kl": 0.01129150390625,
"learning_rate": 2e-06,
"loss": 0.119,
"reward": 0.6117041110992432,
"reward_std": 0.13907021284103394,
"rewards/length_reward": 0.06205355376005173,
"rewards/similarity_reward": 0.5496505498886108,
"step": 109
},
{
"completion_length": 235.0848388671875,
"epoch": 0.29333333333333333,
"grad_norm": 0.9205848620805003,
"kl": 0.009521484375,
"learning_rate": 2e-06,
"loss": 0.006,
"reward": 0.5724084377288818,
"reward_std": 0.12264589220285416,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5027655959129333,
"step": 110
},
{
"completion_length": 280.9419860839844,
"epoch": 0.296,
"grad_norm": 0.8242814043162366,
"kl": 0.00836181640625,
"learning_rate": 2e-06,
"loss": 0.1358,
"reward": 0.6025325059890747,
"reward_std": 0.13276302814483643,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5351218581199646,
"step": 111
},
{
"completion_length": 261.55804443359375,
"epoch": 0.2986666666666667,
"grad_norm": 0.8979430693793525,
"kl": 0.0145263671875,
"learning_rate": 2e-06,
"loss": 0.0898,
"reward": 0.5723416805267334,
"reward_std": 0.11434419453144073,
"rewards/length_reward": 0.06428569555282593,
"rewards/similarity_reward": 0.5080559253692627,
"step": 112
},
{
"completion_length": 273.8482360839844,
"epoch": 0.30133333333333334,
"grad_norm": 0.8994640436743108,
"kl": 0.0084228515625,
"learning_rate": 2e-06,
"loss": 0.0884,
"reward": 0.6239952445030212,
"reward_std": 0.15253794193267822,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5588168501853943,
"step": 113
},
{
"completion_length": 265.29913330078125,
"epoch": 0.304,
"grad_norm": 0.8511084352415984,
"kl": 0.015625,
"learning_rate": 2e-06,
"loss": 0.0217,
"reward": 0.5796217322349548,
"reward_std": 0.16319997608661652,
"rewards/length_reward": 0.06785712391138077,
"rewards/similarity_reward": 0.511764645576477,
"step": 114
},
{
"completion_length": 227.94644165039062,
"epoch": 0.30666666666666664,
"grad_norm": 1.0504493112285522,
"kl": 0.013427734375,
"learning_rate": 2e-06,
"loss": 0.092,
"reward": 0.611219584941864,
"reward_std": 0.1473054587841034,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5455944538116455,
"step": 115
},
{
"completion_length": 312.2008972167969,
"epoch": 0.30933333333333335,
"grad_norm": 0.8491847097599164,
"kl": 0.009033203125,
"learning_rate": 2e-06,
"loss": 0.0818,
"reward": 0.5205245018005371,
"reward_std": 0.18279042840003967,
"rewards/length_reward": 0.055803555995225906,
"rewards/similarity_reward": 0.46472102403640747,
"step": 116
},
{
"completion_length": 285.4151916503906,
"epoch": 0.312,
"grad_norm": 0.948136046714223,
"kl": 0.01251220703125,
"learning_rate": 2e-06,
"loss": 0.1402,
"reward": 0.5244685411453247,
"reward_std": 0.1221655011177063,
"rewards/length_reward": 0.058482129126787186,
"rewards/similarity_reward": 0.4659864008426666,
"step": 117
},
{
"completion_length": 291.8973388671875,
"epoch": 0.31466666666666665,
"grad_norm": 0.8327937599541795,
"kl": 0.00823974609375,
"learning_rate": 2e-06,
"loss": 0.0416,
"reward": 0.6440633535385132,
"reward_std": 0.14113157987594604,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5770990252494812,
"step": 118
},
{
"completion_length": 276.3035888671875,
"epoch": 0.31733333333333336,
"grad_norm": 1.0522569506493296,
"kl": 0.01007080078125,
"learning_rate": 2e-06,
"loss": 0.2508,
"reward": 0.501847505569458,
"reward_std": 0.16830717027187347,
"rewards/length_reward": 0.051785703748464584,
"rewards/similarity_reward": 0.4500618278980255,
"step": 119
},
{
"completion_length": 231.46876525878906,
"epoch": 0.32,
"grad_norm": 1.0564887037389263,
"kl": 0.01544189453125,
"learning_rate": 2e-06,
"loss": 0.1258,
"reward": 0.5212039351463318,
"reward_std": 0.1660899519920349,
"rewards/length_reward": 0.053124986588954926,
"rewards/similarity_reward": 0.46807900071144104,
"step": 120
},
{
"completion_length": 274.55804443359375,
"epoch": 0.32266666666666666,
"grad_norm": 0.892927807825851,
"kl": 0.006988525390625,
"learning_rate": 2e-06,
"loss": 0.0887,
"reward": 0.5594373941421509,
"reward_std": 0.13949331641197205,
"rewards/length_reward": 0.06651782989501953,
"rewards/similarity_reward": 0.49291953444480896,
"step": 121
},
{
"completion_length": 309.36163330078125,
"epoch": 0.3253333333333333,
"grad_norm": 0.8555023561165935,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.1512,
"reward": 0.5939301252365112,
"reward_std": 0.16705819964408875,
"rewards/length_reward": 0.061160698533058167,
"rewards/similarity_reward": 0.5327693819999695,
"step": 122
},
{
"completion_length": 264.3973388671875,
"epoch": 0.328,
"grad_norm": 0.9083757893001095,
"kl": 0.00775146484375,
"learning_rate": 2e-06,
"loss": 0.1449,
"reward": 0.6276513934135437,
"reward_std": 0.14763577282428741,
"rewards/length_reward": 0.06160712614655495,
"rewards/similarity_reward": 0.5660442113876343,
"step": 123
},
{
"completion_length": 267.96875,
"epoch": 0.33066666666666666,
"grad_norm": 0.9115344595637944,
"kl": 0.01336669921875,
"learning_rate": 2e-06,
"loss": 0.0998,
"reward": 0.6213053464889526,
"reward_std": 0.16126255691051483,
"rewards/length_reward": 0.05848212540149689,
"rewards/similarity_reward": 0.5628232359886169,
"step": 124
},
{
"completion_length": 275.2276916503906,
"epoch": 0.3333333333333333,
"grad_norm": 0.8933631069209625,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.0232,
"reward": 0.6394702792167664,
"reward_std": 0.17729975283145905,
"rewards/length_reward": 0.061160698533058167,
"rewards/similarity_reward": 0.5783094763755798,
"step": 125
},
{
"completion_length": 280.0848388671875,
"epoch": 0.336,
"grad_norm": 0.9959640208447441,
"kl": 0.0203857421875,
"learning_rate": 2e-06,
"loss": 0.066,
"reward": 0.5415524840354919,
"reward_std": 0.18598264455795288,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.47414183616638184,
"step": 126
},
{
"completion_length": 294.37054443359375,
"epoch": 0.33866666666666667,
"grad_norm": 0.944172883238238,
"kl": 0.0078125,
"learning_rate": 2e-06,
"loss": 0.0825,
"reward": 0.6250823736190796,
"reward_std": 0.1783696711063385,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5581181049346924,
"step": 127
},
{
"completion_length": 252.44644165039062,
"epoch": 0.3413333333333333,
"grad_norm": 0.8622018142523461,
"kl": 0.01190185546875,
"learning_rate": 2e-06,
"loss": 0.0136,
"reward": 0.5941780209541321,
"reward_std": 0.1297590583562851,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5289995074272156,
"step": 128
},
{
"completion_length": 275.1696472167969,
"epoch": 0.344,
"grad_norm": 0.997627840820869,
"kl": 0.00933837890625,
"learning_rate": 2e-06,
"loss": 0.1634,
"reward": 0.5641010999679565,
"reward_std": 0.13691328465938568,
"rewards/length_reward": 0.057142842561006546,
"rewards/similarity_reward": 0.5069582462310791,
"step": 129
},
{
"completion_length": 291.78125,
"epoch": 0.3466666666666667,
"grad_norm": 0.9141566771741596,
"kl": 0.00885009765625,
"learning_rate": 2e-06,
"loss": 0.233,
"reward": 0.5903910398483276,
"reward_std": 0.14815300703048706,
"rewards/length_reward": 0.06249998137354851,
"rewards/similarity_reward": 0.5278909802436829,
"step": 130
},
{
"completion_length": 294.58929443359375,
"epoch": 0.34933333333333333,
"grad_norm": 0.9307314460988763,
"kl": 0.01153564453125,
"learning_rate": 2e-06,
"loss": 0.0979,
"reward": 0.5972681045532227,
"reward_std": 0.16272346675395966,
"rewards/length_reward": 0.061160698533058167,
"rewards/similarity_reward": 0.5361074805259705,
"step": 131
},
{
"completion_length": 240.2232208251953,
"epoch": 0.352,
"grad_norm": 0.9959808951952684,
"kl": 0.01092529296875,
"learning_rate": 2e-06,
"loss": 0.0428,
"reward": 0.6472880244255066,
"reward_std": 0.15316687524318695,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5816629528999329,
"step": 132
},
{
"completion_length": 237.1607208251953,
"epoch": 0.3546666666666667,
"grad_norm": 0.8515521500324365,
"kl": 0.01251220703125,
"learning_rate": 2e-06,
"loss": 0.0683,
"reward": 0.631507396697998,
"reward_std": 0.15118519961833954,
"rewards/length_reward": 0.06428569555282593,
"rewards/similarity_reward": 0.5672216415405273,
"step": 133
},
{
"completion_length": 307.5,
"epoch": 0.35733333333333334,
"grad_norm": 0.719487956498844,
"kl": 0.0059814453125,
"learning_rate": 2e-06,
"loss": 0.0352,
"reward": 0.6587818264961243,
"reward_std": 0.14100806415081024,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5869067907333374,
"step": 134
},
{
"completion_length": 305.2589416503906,
"epoch": 0.36,
"grad_norm": 1.0877799003245066,
"kl": 0.0169677734375,
"learning_rate": 2e-06,
"loss": 0.0955,
"reward": 0.5947835445404053,
"reward_std": 0.12429028749465942,
"rewards/length_reward": 0.06071426719427109,
"rewards/similarity_reward": 0.5340692400932312,
"step": 135
},
{
"completion_length": 223.81251525878906,
"epoch": 0.3626666666666667,
"grad_norm": 1.056046449389469,
"kl": 0.00665283203125,
"learning_rate": 2e-06,
"loss": 0.0631,
"reward": 0.6106573343276978,
"reward_std": 0.13982126116752625,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5454786419868469,
"step": 136
},
{
"completion_length": 302.52679443359375,
"epoch": 0.36533333333333334,
"grad_norm": 0.8473080240759754,
"kl": 0.01116943359375,
"learning_rate": 2e-06,
"loss": 0.121,
"reward": 0.5767890810966492,
"reward_std": 0.1566361039876938,
"rewards/length_reward": 0.06071426719427109,
"rewards/similarity_reward": 0.5160747766494751,
"step": 137
},
{
"completion_length": 294.9285888671875,
"epoch": 0.368,
"grad_norm": 0.8165111113975745,
"kl": 0.00677490234375,
"learning_rate": 2e-06,
"loss": -0.0013,
"reward": 0.6466237902641296,
"reward_std": 0.11831733584403992,
"rewards/length_reward": 0.07142855226993561,
"rewards/similarity_reward": 0.5751951336860657,
"step": 138
},
{
"completion_length": 297.4107360839844,
"epoch": 0.37066666666666664,
"grad_norm": 0.8905760527062927,
"kl": 0.00653076171875,
"learning_rate": 2e-06,
"loss": 0.0894,
"reward": 0.6628533601760864,
"reward_std": 0.10040118545293808,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5994604229927063,
"step": 139
},
{
"completion_length": 282.9508972167969,
"epoch": 0.37333333333333335,
"grad_norm": 0.9489224311946435,
"kl": 0.01007080078125,
"learning_rate": 2e-06,
"loss": 0.0747,
"reward": 0.5422684550285339,
"reward_std": 0.18701300024986267,
"rewards/length_reward": 0.055357132107019424,
"rewards/similarity_reward": 0.4869112968444824,
"step": 140
},
{
"completion_length": 286.9419860839844,
"epoch": 0.376,
"grad_norm": 1.5325112007084152,
"kl": 0.0205078125,
"learning_rate": 2e-06,
"loss": 0.0749,
"reward": 0.6462909579277039,
"reward_std": 0.1564369648694992,
"rewards/length_reward": 0.06919640302658081,
"rewards/similarity_reward": 0.5770944356918335,
"step": 141
},
{
"completion_length": 233.50001525878906,
"epoch": 0.37866666666666665,
"grad_norm": 1.1124358172264561,
"kl": 0.01336669921875,
"learning_rate": 2e-06,
"loss": 0.1038,
"reward": 0.6390895247459412,
"reward_std": 0.11253345012664795,
"rewards/length_reward": 0.05982141196727753,
"rewards/similarity_reward": 0.5792680978775024,
"step": 142
},
{
"completion_length": 308.0401916503906,
"epoch": 0.38133333333333336,
"grad_norm": 0.8437782349764958,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.0439,
"reward": 0.6860373616218567,
"reward_std": 0.08081385493278503,
"rewards/length_reward": 0.07901783287525177,
"rewards/similarity_reward": 0.6070196032524109,
"step": 143
},
{
"completion_length": 274.54913330078125,
"epoch": 0.384,
"grad_norm": 0.9174096594145076,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.0938,
"reward": 0.6485283970832825,
"reward_std": 0.1347315013408661,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5739747881889343,
"step": 144
},
{
"completion_length": 257.5535888671875,
"epoch": 0.38666666666666666,
"grad_norm": 0.9244068415305253,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.056,
"reward": 0.6137626767158508,
"reward_std": 0.13303914666175842,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5481376647949219,
"step": 145
},
{
"completion_length": 255.62501525878906,
"epoch": 0.3893333333333333,
"grad_norm": 0.8596867360926773,
"kl": 0.011474609375,
"learning_rate": 2e-06,
"loss": 0.0491,
"reward": 0.6294366717338562,
"reward_std": 0.13696229457855225,
"rewards/length_reward": 0.0741071105003357,
"rewards/similarity_reward": 0.5553295016288757,
"step": 146
},
{
"completion_length": 321.3258972167969,
"epoch": 0.392,
"grad_norm": 0.7611409673177786,
"kl": 0.01171875,
"learning_rate": 2e-06,
"loss": 0.0851,
"reward": 0.6082260012626648,
"reward_std": 0.11479248106479645,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5385831594467163,
"step": 147
},
{
"completion_length": 256.5223388671875,
"epoch": 0.39466666666666667,
"grad_norm": 1.017158083005092,
"kl": 0.01165771484375,
"learning_rate": 2e-06,
"loss": 0.0583,
"reward": 0.5373588800430298,
"reward_std": 0.1524331271648407,
"rewards/length_reward": 0.060267843306064606,
"rewards/similarity_reward": 0.477090984582901,
"step": 148
},
{
"completion_length": 247.34376525878906,
"epoch": 0.3973333333333333,
"grad_norm": 1.100826516879252,
"kl": 0.011474609375,
"learning_rate": 2e-06,
"loss": 0.1543,
"reward": 0.6250883340835571,
"reward_std": 0.1562027931213379,
"rewards/length_reward": 0.06785711646080017,
"rewards/similarity_reward": 0.5572311878204346,
"step": 149
},
{
"completion_length": 271.61163330078125,
"epoch": 0.4,
"grad_norm": 0.85368826964619,
"kl": 0.00897216796875,
"learning_rate": 2e-06,
"loss": 0.0525,
"reward": 0.5830017924308777,
"reward_std": 0.1454438865184784,
"rewards/length_reward": 0.06160712614655495,
"rewards/similarity_reward": 0.5213946104049683,
"step": 150
},
{
"completion_length": 274.40179443359375,
"epoch": 0.4026666666666667,
"grad_norm": 0.9117887687890662,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": -0.0237,
"reward": 0.6028919219970703,
"reward_std": 0.15602950751781464,
"rewards/length_reward": 0.06651782989501953,
"rewards/similarity_reward": 0.536374032497406,
"step": 151
},
{
"completion_length": 269.2008972167969,
"epoch": 0.4053333333333333,
"grad_norm": 0.8208276830838094,
"kl": 0.010009765625,
"learning_rate": 2e-06,
"loss": 0.014,
"reward": 0.665276825428009,
"reward_std": 0.1257169246673584,
"rewards/length_reward": 0.08035711199045181,
"rewards/similarity_reward": 0.5849196910858154,
"step": 152
},
{
"completion_length": 264.4732360839844,
"epoch": 0.408,
"grad_norm": 0.9062154210012625,
"kl": 0.013427734375,
"learning_rate": 2e-06,
"loss": 0.0701,
"reward": 0.6374659538269043,
"reward_std": 0.1712835431098938,
"rewards/length_reward": 0.06830354779958725,
"rewards/similarity_reward": 0.5691623091697693,
"step": 153
},
{
"completion_length": 325.1071472167969,
"epoch": 0.4106666666666667,
"grad_norm": 0.8808738957904011,
"kl": 0.0089111328125,
"learning_rate": 2e-06,
"loss": 0.1136,
"reward": 0.6423187255859375,
"reward_std": 0.1033661887049675,
"rewards/length_reward": 0.06383926421403885,
"rewards/similarity_reward": 0.578479528427124,
"step": 154
},
{
"completion_length": 281.3035888671875,
"epoch": 0.41333333333333333,
"grad_norm": 0.8449149570191646,
"kl": 0.012451171875,
"learning_rate": 2e-06,
"loss": 0.0893,
"reward": 0.6530374884605408,
"reward_std": 0.12996266782283783,
"rewards/length_reward": 0.06116069480776787,
"rewards/similarity_reward": 0.5918766856193542,
"step": 155
},
{
"completion_length": 297.02679443359375,
"epoch": 0.416,
"grad_norm": 0.8274002453741087,
"kl": 0.008056640625,
"learning_rate": 2e-06,
"loss": 0.0593,
"reward": 0.7200801372528076,
"reward_std": 0.12102329730987549,
"rewards/length_reward": 0.07901783287525177,
"rewards/similarity_reward": 0.6410622596740723,
"step": 156
},
{
"completion_length": 245.7991180419922,
"epoch": 0.4186666666666667,
"grad_norm": 1.0463728826517769,
"kl": 0.0145263671875,
"learning_rate": 2e-06,
"loss": 0.1192,
"reward": 0.6804168820381165,
"reward_std": 0.1330643892288208,
"rewards/length_reward": 0.0741071105003357,
"rewards/similarity_reward": 0.6063097715377808,
"step": 157
},
{
"completion_length": 272.2410888671875,
"epoch": 0.42133333333333334,
"grad_norm": 0.8424445256337731,
"kl": 0.0152587890625,
"learning_rate": 2e-06,
"loss": 0.0411,
"reward": 0.6152999401092529,
"reward_std": 0.18344512581825256,
"rewards/length_reward": 0.06428569555282593,
"rewards/similarity_reward": 0.5510141849517822,
"step": 158
},
{
"completion_length": 283.90625,
"epoch": 0.424,
"grad_norm": 0.944378171141832,
"kl": 0.0128173828125,
"learning_rate": 2e-06,
"loss": 0.0973,
"reward": 0.6098131537437439,
"reward_std": 0.14866778254508972,
"rewards/length_reward": 0.05982141196727753,
"rewards/similarity_reward": 0.5499916672706604,
"step": 159
},
{
"completion_length": 251.37054443359375,
"epoch": 0.4266666666666667,
"grad_norm": 0.867614538281579,
"kl": 0.01190185546875,
"learning_rate": 2e-06,
"loss": 0.0069,
"reward": 0.6304399371147156,
"reward_std": 0.12713229656219482,
"rewards/length_reward": 0.07276783138513565,
"rewards/similarity_reward": 0.5576720237731934,
"step": 160
},
{
"completion_length": 254.6607208251953,
"epoch": 0.42933333333333334,
"grad_norm": 1.0006767726840313,
"kl": 0.01226806640625,
"learning_rate": 2e-06,
"loss": 0.0167,
"reward": 0.6183627843856812,
"reward_std": 0.12064019590616226,
"rewards/length_reward": 0.057589273899793625,
"rewards/similarity_reward": 0.560773491859436,
"step": 161
},
{
"completion_length": 278.9821472167969,
"epoch": 0.432,
"grad_norm": 0.7754115998151179,
"kl": 0.0108642578125,
"learning_rate": 2e-06,
"loss": 0.0624,
"reward": 0.6279152035713196,
"reward_std": 0.1739441603422165,
"rewards/length_reward": 0.06651783734560013,
"rewards/similarity_reward": 0.5613973140716553,
"step": 162
},
{
"completion_length": 261.51788330078125,
"epoch": 0.43466666666666665,
"grad_norm": 0.9381033539462706,
"kl": 0.011962890625,
"learning_rate": 2e-06,
"loss": 0.1291,
"reward": 0.6449581384658813,
"reward_std": 0.13920167088508606,
"rewards/length_reward": 0.07589282840490341,
"rewards/similarity_reward": 0.569065272808075,
"step": 163
},
{
"completion_length": 311.8169860839844,
"epoch": 0.43733333333333335,
"grad_norm": 0.959860639301872,
"kl": 0.0084228515625,
"learning_rate": 2e-06,
"loss": 0.0207,
"reward": 0.6583375930786133,
"reward_std": 0.1428201049566269,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5882483720779419,
"step": 164
},
{
"completion_length": 261.5758972167969,
"epoch": 0.44,
"grad_norm": 0.8574273981386299,
"kl": 0.009033203125,
"learning_rate": 2e-06,
"loss": 0.1177,
"reward": 0.6945616006851196,
"reward_std": 0.12529133260250092,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.6164366006851196,
"step": 165
},
{
"completion_length": 281.45538330078125,
"epoch": 0.44266666666666665,
"grad_norm": 0.8588941157426009,
"kl": 0.01220703125,
"learning_rate": 2e-06,
"loss": 0.0615,
"reward": 0.7305233478546143,
"reward_std": 0.12011624127626419,
"rewards/length_reward": 0.0808035358786583,
"rewards/similarity_reward": 0.6497198343276978,
"step": 166
},
{
"completion_length": 259.64288330078125,
"epoch": 0.44533333333333336,
"grad_norm": 1.137909715907424,
"kl": 0.01031494140625,
"learning_rate": 2e-06,
"loss": 0.2205,
"reward": 0.5699202418327332,
"reward_std": 0.1761079728603363,
"rewards/length_reward": 0.06294640898704529,
"rewards/similarity_reward": 0.5069737434387207,
"step": 167
},
{
"completion_length": 294.875,
"epoch": 0.448,
"grad_norm": 1.0016080727138688,
"kl": 0.01171875,
"learning_rate": 2e-06,
"loss": 0.1368,
"reward": 0.6165792942047119,
"reward_std": 0.12841306626796722,
"rewards/length_reward": 0.064732126891613,
"rewards/similarity_reward": 0.5518471002578735,
"step": 168
},
{
"completion_length": 308.37054443359375,
"epoch": 0.45066666666666666,
"grad_norm": 1.2557098703938632,
"kl": 0.0162353515625,
"learning_rate": 2e-06,
"loss": 0.0777,
"reward": 0.6842705607414246,
"reward_std": 0.09632124751806259,
"rewards/length_reward": 0.08124997466802597,
"rewards/similarity_reward": 0.6030204892158508,
"step": 169
},
{
"completion_length": 281.4821472167969,
"epoch": 0.4533333333333333,
"grad_norm": 0.8773655821391068,
"kl": 0.009033203125,
"learning_rate": 2e-06,
"loss": 0.0602,
"reward": 0.6346014738082886,
"reward_std": 0.14319205284118652,
"rewards/length_reward": 0.06116069480776787,
"rewards/similarity_reward": 0.5734407901763916,
"step": 170
},
{
"completion_length": 281.8125,
"epoch": 0.456,
"grad_norm": 0.9145529663215465,
"kl": 0.01275634765625,
"learning_rate": 2e-06,
"loss": 0.1688,
"reward": 0.5963006019592285,
"reward_std": 0.15331213176250458,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5293362736701965,
"step": 171
},
{
"completion_length": 280.8571472167969,
"epoch": 0.45866666666666667,
"grad_norm": 3.878703302716922,
"kl": 0.0269775390625,
"learning_rate": 2e-06,
"loss": 0.0219,
"reward": 0.6730906963348389,
"reward_std": 0.11424030363559723,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.604340672492981,
"step": 172
},
{
"completion_length": 277.9910888671875,
"epoch": 0.4613333333333333,
"grad_norm": 1.0003205251640386,
"kl": 0.01141357421875,
"learning_rate": 2e-06,
"loss": 0.1181,
"reward": 0.6080780029296875,
"reward_std": 0.14715011417865753,
"rewards/length_reward": 0.06651782989501953,
"rewards/similarity_reward": 0.541560173034668,
"step": 173
},
{
"completion_length": 257.0133972167969,
"epoch": 0.464,
"grad_norm": 0.8626427313272481,
"kl": 0.00921630859375,
"learning_rate": 2e-06,
"loss": 0.0332,
"reward": 0.6258962154388428,
"reward_std": 0.13628825545310974,
"rewards/length_reward": 0.07098211348056793,
"rewards/similarity_reward": 0.554914116859436,
"step": 174
},
{
"completion_length": 267.1294860839844,
"epoch": 0.4666666666666667,
"grad_norm": 0.9448046232693003,
"kl": 0.0128173828125,
"learning_rate": 2e-06,
"loss": 0.0686,
"reward": 0.582805871963501,
"reward_std": 0.13543623685836792,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5082523226737976,
"step": 175
},
{
"completion_length": 230.3169708251953,
"epoch": 0.4693333333333333,
"grad_norm": 1.0336066582105279,
"kl": 0.01611328125,
"learning_rate": 2e-06,
"loss": 0.0188,
"reward": 0.5923266410827637,
"reward_std": 0.15992802381515503,
"rewards/length_reward": 0.06607140600681305,
"rewards/similarity_reward": 0.5262552499771118,
"step": 176
},
{
"completion_length": 259.2857360839844,
"epoch": 0.472,
"grad_norm": 0.8515404437990851,
"kl": 0.01422119140625,
"learning_rate": 2e-06,
"loss": -0.0031,
"reward": 0.6366464495658875,
"reward_std": 0.14244325459003448,
"rewards/length_reward": 0.06785711646080017,
"rewards/similarity_reward": 0.5687893033027649,
"step": 177
},
{
"completion_length": 269.0535888671875,
"epoch": 0.4746666666666667,
"grad_norm": 1.0508009846238586,
"kl": 0.01202392578125,
"learning_rate": 2e-06,
"loss": 0.1414,
"reward": 0.6338518857955933,
"reward_std": 0.13359463214874268,
"rewards/length_reward": 0.06651783734560013,
"rewards/similarity_reward": 0.5673341155052185,
"step": 178
},
{
"completion_length": 254.2053680419922,
"epoch": 0.47733333333333333,
"grad_norm": 1.0742338846656552,
"kl": 0.00799560546875,
"learning_rate": 2e-06,
"loss": 0.0226,
"reward": 0.6352322697639465,
"reward_std": 0.16355818510055542,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5633572936058044,
"step": 179
},
{
"completion_length": 258.8482360839844,
"epoch": 0.48,
"grad_norm": 1.1021168870169997,
"kl": 0.0213623046875,
"learning_rate": 2e-06,
"loss": 0.0656,
"reward": 0.594104528427124,
"reward_std": 0.1770821362733841,
"rewards/length_reward": 0.060714274644851685,
"rewards/similarity_reward": 0.5333902835845947,
"step": 180
},
{
"completion_length": 256.8125,
"epoch": 0.4826666666666667,
"grad_norm": 0.8501219854036921,
"kl": 0.009033203125,
"learning_rate": 2e-06,
"loss": -0.0019,
"reward": 0.6487245559692383,
"reward_std": 0.13405689597129822,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5741709470748901,
"step": 181
},
{
"completion_length": 256.02679443359375,
"epoch": 0.48533333333333334,
"grad_norm": 1.0385629776489995,
"kl": 0.012939453125,
"learning_rate": 2e-06,
"loss": 0.0721,
"reward": 0.6005666255950928,
"reward_std": 0.18559977412223816,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5331559181213379,
"step": 182
},
{
"completion_length": 245.2723388671875,
"epoch": 0.488,
"grad_norm": 0.9856929099072189,
"kl": 0.0140380859375,
"learning_rate": 2e-06,
"loss": 0.0559,
"reward": 0.6080025434494019,
"reward_std": 0.12059400230646133,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5428239703178406,
"step": 183
},
{
"completion_length": 275.40179443359375,
"epoch": 0.49066666666666664,
"grad_norm": 1.0764117312018395,
"kl": 0.0147705078125,
"learning_rate": 2e-06,
"loss": 0.1746,
"reward": 0.5906988382339478,
"reward_std": 0.13717274367809296,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5273059606552124,
"step": 184
},
{
"completion_length": 235.33929443359375,
"epoch": 0.49333333333333335,
"grad_norm": 1.216465109274426,
"kl": 0.015869140625,
"learning_rate": 2e-06,
"loss": 0.0441,
"reward": 0.6832688450813293,
"reward_std": 0.12071473151445389,
"rewards/length_reward": 0.07767853885889053,
"rewards/similarity_reward": 0.6055901646614075,
"step": 185
},
{
"completion_length": 271.0401916503906,
"epoch": 0.496,
"grad_norm": 0.9417708264398014,
"kl": 0.0113525390625,
"learning_rate": 2e-06,
"loss": 0.0103,
"reward": 0.7055503726005554,
"reward_std": 0.09871623665094376,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.6274253129959106,
"step": 186
},
{
"completion_length": 288.40179443359375,
"epoch": 0.49866666666666665,
"grad_norm": 0.7904664413572577,
"kl": 0.0113525390625,
"learning_rate": 2e-06,
"loss": 0.0943,
"reward": 0.6588479280471802,
"reward_std": 0.1498415172100067,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5883120894432068,
"step": 187
},
{
"completion_length": 262.2857360839844,
"epoch": 0.5013333333333333,
"grad_norm": 1.0107893927701763,
"kl": 0.0113525390625,
"learning_rate": 2e-06,
"loss": 0.0639,
"reward": 0.5783969759941101,
"reward_std": 0.1660866141319275,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5109862685203552,
"step": 188
},
{
"completion_length": 243.23214721679688,
"epoch": 0.504,
"grad_norm": 0.9914068826603122,
"kl": 0.0263671875,
"learning_rate": 2e-06,
"loss": 0.0183,
"reward": 0.5762468576431274,
"reward_std": 0.1855197250843048,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5088360905647278,
"step": 189
},
{
"completion_length": 279.8973388671875,
"epoch": 0.5066666666666667,
"grad_norm": 0.8186202175206256,
"kl": 0.01031494140625,
"learning_rate": 2e-06,
"loss": 0.0181,
"reward": 0.6954742074012756,
"reward_std": 0.08623984456062317,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.624938428401947,
"step": 190
},
{
"completion_length": 257.21429443359375,
"epoch": 0.5093333333333333,
"grad_norm": 0.877876828642467,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": 0.0328,
"reward": 0.6462003588676453,
"reward_std": 0.11538383364677429,
"rewards/length_reward": 0.06830354779958725,
"rewards/similarity_reward": 0.577896773815155,
"step": 191
},
{
"completion_length": 257.7946472167969,
"epoch": 0.512,
"grad_norm": 0.8857490639900779,
"kl": 0.01214599609375,
"learning_rate": 2e-06,
"loss": 0.0477,
"reward": 0.6250221133232117,
"reward_std": 0.15633754432201385,
"rewards/length_reward": 0.06830354779958725,
"rewards/similarity_reward": 0.5567185282707214,
"step": 192
},
{
"completion_length": 303.4464416503906,
"epoch": 0.5146666666666667,
"grad_norm": 0.8375649728004798,
"kl": 0.00897216796875,
"learning_rate": 2e-06,
"loss": 0.0444,
"reward": 0.6938925981521606,
"reward_std": 0.13664484024047852,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.6184461116790771,
"step": 193
},
{
"completion_length": 239.02679443359375,
"epoch": 0.5173333333333333,
"grad_norm": 0.9796244769392795,
"kl": 0.0169677734375,
"learning_rate": 2e-06,
"loss": 0.0277,
"reward": 0.6308580636978149,
"reward_std": 0.09844722598791122,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.5554116368293762,
"step": 194
},
{
"completion_length": 220.04019165039062,
"epoch": 0.52,
"grad_norm": 1.1138163852092198,
"kl": 0.0211181640625,
"learning_rate": 2e-06,
"loss": 0.101,
"reward": 0.5808507204055786,
"reward_std": 0.14026090502738953,
"rewards/length_reward": 0.056696418672800064,
"rewards/similarity_reward": 0.5241542458534241,
"step": 195
},
{
"completion_length": 310.6294860839844,
"epoch": 0.5226666666666666,
"grad_norm": 0.7596161810526226,
"kl": 0.012451171875,
"learning_rate": 2e-06,
"loss": 0.0671,
"reward": 0.6398200988769531,
"reward_std": 0.16089944541454315,
"rewards/length_reward": 0.07633925974369049,
"rewards/similarity_reward": 0.5634807348251343,
"step": 196
},
{
"completion_length": 293.65179443359375,
"epoch": 0.5253333333333333,
"grad_norm": 1.3057397068251875,
"kl": 0.0137939453125,
"learning_rate": 2e-06,
"loss": 0.0229,
"reward": 0.69722580909729,
"reward_std": 0.10665407031774521,
"rewards/length_reward": 0.07901783287525177,
"rewards/similarity_reward": 0.6182078719139099,
"step": 197
},
{
"completion_length": 271.8794860839844,
"epoch": 0.528,
"grad_norm": 0.9454287770215252,
"kl": 0.01190185546875,
"learning_rate": 2e-06,
"loss": 0.0697,
"reward": 0.6203178763389587,
"reward_std": 0.1754215508699417,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5569249987602234,
"step": 198
},
{
"completion_length": 259.3571472167969,
"epoch": 0.5306666666666666,
"grad_norm": 0.8381899247069013,
"kl": 0.011474609375,
"learning_rate": 2e-06,
"loss": 0.0283,
"reward": 0.6383811831474304,
"reward_std": 0.11189709603786469,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.5629347562789917,
"step": 199
},
{
"completion_length": 258.0401916503906,
"epoch": 0.5333333333333333,
"grad_norm": 1.0128439289515407,
"kl": 0.012451171875,
"learning_rate": 2e-06,
"loss": 0.1465,
"reward": 0.5912656784057617,
"reward_std": 0.14550404250621796,
"rewards/length_reward": 0.058482129126787186,
"rewards/similarity_reward": 0.5327835083007812,
"step": 200
},
{
"completion_length": 311.27679443359375,
"epoch": 0.536,
"grad_norm": 0.86779018830801,
"kl": 0.00909423828125,
"learning_rate": 2e-06,
"loss": 0.0985,
"reward": 0.588790237903595,
"reward_std": 0.140910342335701,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5169152021408081,
"step": 201
},
{
"completion_length": 223.9598388671875,
"epoch": 0.5386666666666666,
"grad_norm": 0.9896401672605407,
"kl": 0.01263427734375,
"learning_rate": 2e-06,
"loss": 0.0605,
"reward": 0.5413497686386108,
"reward_std": 0.13121715188026428,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.4761711657047272,
"step": 202
},
{
"completion_length": 245.66964721679688,
"epoch": 0.5413333333333333,
"grad_norm": 0.8825950131925253,
"kl": 0.01300048828125,
"learning_rate": 2e-06,
"loss": 0.0168,
"reward": 0.632973849773407,
"reward_std": 0.15790660679340363,
"rewards/length_reward": 0.07678568363189697,
"rewards/similarity_reward": 0.5561880469322205,
"step": 203
},
{
"completion_length": 214.50894165039062,
"epoch": 0.544,
"grad_norm": 0.9691668766051184,
"kl": 0.0133056640625,
"learning_rate": 2e-06,
"loss": 0.0135,
"reward": 0.6077868938446045,
"reward_std": 0.12028573453426361,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5381439328193665,
"step": 204
},
{
"completion_length": 292.73663330078125,
"epoch": 0.5466666666666666,
"grad_norm": 0.7967760450327859,
"kl": 0.00982666015625,
"learning_rate": 2e-06,
"loss": 0.0266,
"reward": 0.6232799887657166,
"reward_std": 0.10140591114759445,
"rewards/length_reward": 0.07410712540149689,
"rewards/similarity_reward": 0.5491728186607361,
"step": 205
},
{
"completion_length": 221.25001525878906,
"epoch": 0.5493333333333333,
"grad_norm": 1.1638296703356164,
"kl": 0.01385498046875,
"learning_rate": 2e-06,
"loss": 0.1653,
"reward": 0.5745998620986938,
"reward_std": 0.1350637972354889,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5071890950202942,
"step": 206
},
{
"completion_length": 265.58929443359375,
"epoch": 0.552,
"grad_norm": 0.845270302637572,
"kl": 0.01031494140625,
"learning_rate": 2e-06,
"loss": 0.0111,
"reward": 0.6836676001548767,
"reward_std": 0.10602893680334091,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.6131318211555481,
"step": 207
},
{
"completion_length": 295.58929443359375,
"epoch": 0.5546666666666666,
"grad_norm": 0.9323286808595849,
"kl": 0.00921630859375,
"learning_rate": 2e-06,
"loss": 0.0749,
"reward": 0.5889706611633301,
"reward_std": 0.1236046850681305,
"rewards/length_reward": 0.06517855823040009,
"rewards/similarity_reward": 0.5237920880317688,
"step": 208
},
{
"completion_length": 293.58038330078125,
"epoch": 0.5573333333333333,
"grad_norm": 0.9233381319586115,
"kl": 0.01055908203125,
"learning_rate": 2e-06,
"loss": 0.2,
"reward": 0.6363462805747986,
"reward_std": 0.12041884660720825,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.5675963759422302,
"step": 209
},
{
"completion_length": 266.3660888671875,
"epoch": 0.56,
"grad_norm": 0.9921663239986533,
"kl": 0.01214599609375,
"learning_rate": 2e-06,
"loss": 0.0664,
"reward": 0.5890473127365112,
"reward_std": 0.14096976816654205,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5256544351577759,
"step": 210
},
{
"completion_length": 257.46875,
"epoch": 0.5626666666666666,
"grad_norm": 0.9774355514560761,
"kl": 0.01080322265625,
"learning_rate": 2e-06,
"loss": 0.0824,
"reward": 0.5761434435844421,
"reward_std": 0.18852439522743225,
"rewards/length_reward": 0.061160698533058167,
"rewards/similarity_reward": 0.5149827003479004,
"step": 211
},
{
"completion_length": 214.66964721679688,
"epoch": 0.5653333333333334,
"grad_norm": 1.0333338701683394,
"kl": 0.00994873046875,
"learning_rate": 2e-06,
"loss": 0.1128,
"reward": 0.6288223266601562,
"reward_std": 0.1170286163687706,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5591794848442078,
"step": 212
},
{
"completion_length": 284.0089416503906,
"epoch": 0.568,
"grad_norm": 0.9552749101564338,
"kl": 0.01422119140625,
"learning_rate": 2e-06,
"loss": 0.1971,
"reward": 0.5316947102546692,
"reward_std": 0.14774499833583832,
"rewards/length_reward": 0.06607140600681305,
"rewards/similarity_reward": 0.46562325954437256,
"step": 213
},
{
"completion_length": 264.84375,
"epoch": 0.5706666666666667,
"grad_norm": 0.9328609206359839,
"kl": 0.0128173828125,
"learning_rate": 2e-06,
"loss": 0.0611,
"reward": 0.61468505859375,
"reward_std": 0.0943475142121315,
"rewards/length_reward": 0.08214282244443893,
"rewards/similarity_reward": 0.5325421690940857,
"step": 214
},
{
"completion_length": 288.65625,
"epoch": 0.5733333333333334,
"grad_norm": 0.9888552258575887,
"kl": 0.0147705078125,
"learning_rate": 2e-06,
"loss": 0.1398,
"reward": 0.5841876864433289,
"reward_std": 0.10097295790910721,
"rewards/length_reward": 0.06294640898704529,
"rewards/similarity_reward": 0.5212411880493164,
"step": 215
},
{
"completion_length": 270.28125,
"epoch": 0.576,
"grad_norm": 0.8803342156226522,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": 0.0359,
"reward": 0.6344039440155029,
"reward_std": 0.17091530561447144,
"rewards/length_reward": 0.06607141345739365,
"rewards/similarity_reward": 0.5683325529098511,
"step": 216
},
{
"completion_length": 255.12501525878906,
"epoch": 0.5786666666666667,
"grad_norm": 0.8979196392383272,
"kl": 0.027099609375,
"learning_rate": 2e-06,
"loss": -0.0078,
"reward": 0.661945641040802,
"reward_std": 0.1631477326154709,
"rewards/length_reward": 0.07767854630947113,
"rewards/similarity_reward": 0.5842669606208801,
"step": 217
},
{
"completion_length": 263.58038330078125,
"epoch": 0.5813333333333334,
"grad_norm": 1.180022786404114,
"kl": 0.0191650390625,
"learning_rate": 2e-06,
"loss": 0.1111,
"reward": 0.5408477187156677,
"reward_std": 0.11287137866020203,
"rewards/length_reward": 0.060267843306064606,
"rewards/similarity_reward": 0.4805798828601837,
"step": 218
},
{
"completion_length": 293.24554443359375,
"epoch": 0.584,
"grad_norm": 1.082130352994329,
"kl": 0.0113525390625,
"learning_rate": 2e-06,
"loss": 0.166,
"reward": 0.6229541301727295,
"reward_std": 0.18498755991458893,
"rewards/length_reward": 0.06294640898704529,
"rewards/similarity_reward": 0.5600076913833618,
"step": 219
},
{
"completion_length": 255.9553680419922,
"epoch": 0.5866666666666667,
"grad_norm": 0.8518142779942337,
"kl": 0.0216064453125,
"learning_rate": 2e-06,
"loss": -0.0418,
"reward": 0.6351791620254517,
"reward_std": 0.1440075933933258,
"rewards/length_reward": 0.06919640302658081,
"rewards/similarity_reward": 0.5659827589988708,
"step": 220
},
{
"completion_length": 268.2589416503906,
"epoch": 0.5893333333333334,
"grad_norm": 0.8005051959777295,
"kl": 0.00799560546875,
"learning_rate": 2e-06,
"loss": 0.029,
"reward": 0.6807352304458618,
"reward_std": 0.11082387715578079,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.6061817407608032,
"step": 221
},
{
"completion_length": 267.5089416503906,
"epoch": 0.592,
"grad_norm": 0.8727360410582777,
"kl": 0.00927734375,
"learning_rate": 2e-06,
"loss": 0.0333,
"reward": 0.6831346154212952,
"reward_std": 0.09732881933450699,
"rewards/length_reward": 0.07633925974369049,
"rewards/similarity_reward": 0.6067953109741211,
"step": 222
},
{
"completion_length": 271.2008972167969,
"epoch": 0.5946666666666667,
"grad_norm": 0.7543972270797626,
"kl": 0.00921630859375,
"learning_rate": 2e-06,
"loss": 0.0251,
"reward": 0.7254729866981506,
"reward_std": 0.13280263543128967,
"rewards/length_reward": 0.07723211497068405,
"rewards/similarity_reward": 0.6482407450675964,
"step": 223
},
{
"completion_length": 276.33038330078125,
"epoch": 0.5973333333333334,
"grad_norm": 0.8477057601765857,
"kl": 0.01470947265625,
"learning_rate": 2e-06,
"loss": 0.093,
"reward": 0.6127163171768188,
"reward_std": 0.14752325415611267,
"rewards/length_reward": 0.06160712614655495,
"rewards/similarity_reward": 0.5511091351509094,
"step": 224
},
{
"completion_length": 313.02679443359375,
"epoch": 0.6,
"grad_norm": 0.9444843501933834,
"kl": 0.01953125,
"learning_rate": 2e-06,
"loss": 0.1058,
"reward": 0.6097243428230286,
"reward_std": 0.1704142987728119,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5427600741386414,
"step": 225
},
{
"completion_length": 288.37054443359375,
"epoch": 0.6026666666666667,
"grad_norm": 0.8579103953599808,
"kl": 0.01373291015625,
"learning_rate": 2e-06,
"loss": 0.1122,
"reward": 0.6366080641746521,
"reward_std": 0.12147609889507294,
"rewards/length_reward": 0.07232140004634857,
"rewards/similarity_reward": 0.5642866492271423,
"step": 226
},
{
"completion_length": 310.1607360839844,
"epoch": 0.6053333333333333,
"grad_norm": 0.759533963504491,
"kl": 0.01312255859375,
"learning_rate": 2e-06,
"loss": 0.0699,
"reward": 0.6739456057548523,
"reward_std": 0.10951042920351028,
"rewards/length_reward": 0.07276783138513565,
"rewards/similarity_reward": 0.6011778116226196,
"step": 227
},
{
"completion_length": 239.0848388671875,
"epoch": 0.608,
"grad_norm": 0.9025930213219101,
"kl": 0.01104736328125,
"learning_rate": 2e-06,
"loss": 0.0215,
"reward": 0.6142429709434509,
"reward_std": 0.08829416334629059,
"rewards/length_reward": 0.07633925974369049,
"rewards/similarity_reward": 0.5379037261009216,
"step": 228
},
{
"completion_length": 278.3125,
"epoch": 0.6106666666666667,
"grad_norm": 0.886301576249163,
"kl": 0.01104736328125,
"learning_rate": 2e-06,
"loss": 0.0334,
"reward": 0.7060741186141968,
"reward_std": 0.11311851441860199,
"rewards/length_reward": 0.07723211497068405,
"rewards/similarity_reward": 0.6288419961929321,
"step": 229
},
{
"completion_length": 251.18751525878906,
"epoch": 0.6133333333333333,
"grad_norm": 2.476356671041086,
"kl": 0.0244140625,
"learning_rate": 2e-06,
"loss": 0.0096,
"reward": 0.5848217010498047,
"reward_std": 0.11717528849840164,
"rewards/length_reward": 0.061160698533058167,
"rewards/similarity_reward": 0.5236610770225525,
"step": 230
},
{
"completion_length": 236.29019165039062,
"epoch": 0.616,
"grad_norm": 0.9163834681525471,
"kl": 0.0108642578125,
"learning_rate": 2e-06,
"loss": 0.0201,
"reward": 0.6646043658256531,
"reward_std": 0.11276809126138687,
"rewards/length_reward": 0.07991068810224533,
"rewards/similarity_reward": 0.5846936702728271,
"step": 231
},
{
"completion_length": 317.0089416503906,
"epoch": 0.6186666666666667,
"grad_norm": 0.8636877609886525,
"kl": 0.013427734375,
"learning_rate": 2e-06,
"loss": 0.036,
"reward": 0.5721753835678101,
"reward_std": 0.16120396554470062,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.506996750831604,
"step": 232
},
{
"completion_length": 307.7008972167969,
"epoch": 0.6213333333333333,
"grad_norm": 1.3729033520790577,
"kl": 0.0166015625,
"learning_rate": 2e-06,
"loss": 0.0383,
"reward": 0.6260521411895752,
"reward_std": 0.11853621900081635,
"rewards/length_reward": 0.07767853885889053,
"rewards/similarity_reward": 0.5483735799789429,
"step": 233
},
{
"completion_length": 283.5714416503906,
"epoch": 0.624,
"grad_norm": 0.8530397710423918,
"kl": 0.016357421875,
"learning_rate": 2e-06,
"loss": 0.0206,
"reward": 0.6967118382453918,
"reward_std": 0.1562497317790985,
"rewards/length_reward": 0.07857140153646469,
"rewards/similarity_reward": 0.618140459060669,
"step": 234
},
{
"completion_length": 272.6160888671875,
"epoch": 0.6266666666666667,
"grad_norm": 0.8799568084373302,
"kl": 0.01153564453125,
"learning_rate": 2e-06,
"loss": 0.0181,
"reward": 0.6851814985275269,
"reward_std": 0.10234292596578598,
"rewards/length_reward": 0.07142854481935501,
"rewards/similarity_reward": 0.6137529611587524,
"step": 235
},
{
"completion_length": 316.4776916503906,
"epoch": 0.6293333333333333,
"grad_norm": 0.8224273598135922,
"kl": 0.01416015625,
"learning_rate": 2e-06,
"loss": 0.1293,
"reward": 0.5818712115287781,
"reward_std": 0.1467462033033371,
"rewards/length_reward": 0.059374988079071045,
"rewards/similarity_reward": 0.5224961638450623,
"step": 236
},
{
"completion_length": 276.1651916503906,
"epoch": 0.632,
"grad_norm": 0.8214244182848573,
"kl": 0.01373291015625,
"learning_rate": 2e-06,
"loss": 0.0324,
"reward": 0.6201799511909485,
"reward_std": 0.14638349413871765,
"rewards/length_reward": 0.07098212093114853,
"rewards/similarity_reward": 0.5491978526115417,
"step": 237
},
{
"completion_length": 314.15179443359375,
"epoch": 0.6346666666666667,
"grad_norm": 0.8751337279602847,
"kl": 0.011474609375,
"learning_rate": 2e-06,
"loss": 0.1279,
"reward": 0.6131877899169922,
"reward_std": 0.15269529819488525,
"rewards/length_reward": 0.06383927166461945,
"rewards/similarity_reward": 0.5493485331535339,
"step": 238
},
{
"completion_length": 241.5982208251953,
"epoch": 0.6373333333333333,
"grad_norm": 0.9065686563133485,
"kl": 0.01300048828125,
"learning_rate": 2e-06,
"loss": 0.053,
"reward": 0.6322412490844727,
"reward_std": 0.13913773000240326,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5648305416107178,
"step": 239
},
{
"completion_length": 282.3973388671875,
"epoch": 0.64,
"grad_norm": 0.8759378979761268,
"kl": 0.0130615234375,
"learning_rate": 2e-06,
"loss": 0.0351,
"reward": 0.6563798785209656,
"reward_std": 0.11333189904689789,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5858440399169922,
"step": 240
},
{
"completion_length": 258.2946472167969,
"epoch": 0.6426666666666667,
"grad_norm": 0.8083399774508907,
"kl": 0.0115966796875,
"learning_rate": 2e-06,
"loss": -0.0089,
"reward": 0.7068819403648376,
"reward_std": 0.10833070427179337,
"rewards/length_reward": 0.08392854034900665,
"rewards/similarity_reward": 0.6229532957077026,
"step": 241
},
{
"completion_length": 226.43751525878906,
"epoch": 0.6453333333333333,
"grad_norm": 1.113129598732782,
"kl": 0.01708984375,
"learning_rate": 2e-06,
"loss": 0.0985,
"reward": 0.47489413619041443,
"reward_std": 0.14593513309955597,
"rewards/length_reward": 0.053124986588954926,
"rewards/similarity_reward": 0.4217691719532013,
"step": 242
},
{
"completion_length": 254.5982208251953,
"epoch": 0.648,
"grad_norm": 1.0574663727866278,
"kl": 0.01171875,
"learning_rate": 2e-06,
"loss": 0.0655,
"reward": 0.585192084312439,
"reward_std": 0.1674540489912033,
"rewards/length_reward": 0.05937498062849045,
"rewards/similarity_reward": 0.5258170962333679,
"step": 243
},
{
"completion_length": 245.9866180419922,
"epoch": 0.6506666666666666,
"grad_norm": 0.9866678905813414,
"kl": 0.01385498046875,
"learning_rate": 2e-06,
"loss": 0.0992,
"reward": 0.6301730871200562,
"reward_std": 0.11110112071037292,
"rewards/length_reward": 0.07142854481935501,
"rewards/similarity_reward": 0.5587445497512817,
"step": 244
},
{
"completion_length": 276.9419860839844,
"epoch": 0.6533333333333333,
"grad_norm": 0.8334751875894263,
"kl": 0.01226806640625,
"learning_rate": 2e-06,
"loss": 0.0199,
"reward": 0.734417736530304,
"reward_std": 0.13278159499168396,
"rewards/length_reward": 0.07767854630947113,
"rewards/similarity_reward": 0.6567391157150269,
"step": 245
},
{
"completion_length": 224.4241180419922,
"epoch": 0.656,
"grad_norm": 1.039042606617133,
"kl": 0.0211181640625,
"learning_rate": 2e-06,
"loss": 0.1146,
"reward": 0.6315779089927673,
"reward_std": 0.1330062597990036,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5610421895980835,
"step": 246
},
{
"completion_length": 252.9732208251953,
"epoch": 0.6586666666666666,
"grad_norm": 0.8538008305966633,
"kl": 0.01275634765625,
"learning_rate": 2e-06,
"loss": 0.0224,
"reward": 0.6646360754966736,
"reward_std": 0.12239360809326172,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5900824666023254,
"step": 247
},
{
"completion_length": 218.7366180419922,
"epoch": 0.6613333333333333,
"grad_norm": 0.963162350651896,
"kl": 0.01507568359375,
"learning_rate": 2e-06,
"loss": 0.0349,
"reward": 0.6808683276176453,
"reward_std": 0.13527972996234894,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.6089933514595032,
"step": 248
},
{
"completion_length": 254.5982208251953,
"epoch": 0.664,
"grad_norm": 0.9385464548992294,
"kl": 0.0179443359375,
"learning_rate": 2e-06,
"loss": 0.0288,
"reward": 0.631544828414917,
"reward_std": 0.13531894981861115,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.5627948045730591,
"step": 249
},
{
"completion_length": 291.37054443359375,
"epoch": 0.6666666666666666,
"grad_norm": 0.9140004673035673,
"kl": 0.01708984375,
"learning_rate": 2e-06,
"loss": 0.0837,
"reward": 0.5439311861991882,
"reward_std": 0.13576674461364746,
"rewards/length_reward": 0.05848212540149689,
"rewards/similarity_reward": 0.48544901609420776,
"step": 250
},
{
"completion_length": 302.15179443359375,
"epoch": 0.6693333333333333,
"grad_norm": 0.8827607842873424,
"kl": 0.0137939453125,
"learning_rate": 2e-06,
"loss": 0.1511,
"reward": 0.521554172039032,
"reward_std": 0.13812950253486633,
"rewards/length_reward": 0.06249998137354851,
"rewards/similarity_reward": 0.459054172039032,
"step": 251
},
{
"completion_length": 256.67413330078125,
"epoch": 0.672,
"grad_norm": 0.9030494234496588,
"kl": 0.015869140625,
"learning_rate": 2e-06,
"loss": 0.0098,
"reward": 0.6373765468597412,
"reward_std": 0.16011908650398254,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5721979141235352,
"step": 252
},
{
"completion_length": 263.4464416503906,
"epoch": 0.6746666666666666,
"grad_norm": 0.9917515330394924,
"kl": 0.0157470703125,
"learning_rate": 2e-06,
"loss": 0.1071,
"reward": 0.5835117697715759,
"reward_std": 0.15427549183368683,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.5214581489562988,
"step": 253
},
{
"completion_length": 263.1875,
"epoch": 0.6773333333333333,
"grad_norm": 0.8527537949516601,
"kl": 0.01007080078125,
"learning_rate": 2e-06,
"loss": 0.0704,
"reward": 0.659292459487915,
"reward_std": 0.11287476867437363,
"rewards/length_reward": 0.07723211497068405,
"rewards/similarity_reward": 0.5820602774620056,
"step": 254
},
{
"completion_length": 247.35714721679688,
"epoch": 0.68,
"grad_norm": 0.9306878660258886,
"kl": 0.018798828125,
"learning_rate": 2e-06,
"loss": 0.1071,
"reward": 0.5428202748298645,
"reward_std": 0.14576061069965363,
"rewards/length_reward": 0.055357132107019424,
"rewards/similarity_reward": 0.4874631464481354,
"step": 255
},
{
"completion_length": 304.77679443359375,
"epoch": 0.6826666666666666,
"grad_norm": 0.8122651765327112,
"kl": 0.01123046875,
"learning_rate": 2e-06,
"loss": 0.0722,
"reward": 0.6534665822982788,
"reward_std": 0.17661263048648834,
"rewards/length_reward": 0.06651783734560013,
"rewards/similarity_reward": 0.5869486927986145,
"step": 256
},
{
"completion_length": 316.0,
"epoch": 0.6853333333333333,
"grad_norm": 0.7637865876113388,
"kl": 0.01153564453125,
"learning_rate": 2e-06,
"loss": 0.0805,
"reward": 0.6809090375900269,
"reward_std": 0.12792253494262695,
"rewards/length_reward": 0.07499997317790985,
"rewards/similarity_reward": 0.6059090495109558,
"step": 257
},
{
"completion_length": 267.7410888671875,
"epoch": 0.688,
"grad_norm": 0.908951786987476,
"kl": 0.0123291015625,
"learning_rate": 2e-06,
"loss": 0.0919,
"reward": 0.6498162150382996,
"reward_std": 0.12118736654520035,
"rewards/length_reward": 0.057589273899793625,
"rewards/similarity_reward": 0.5922268629074097,
"step": 258
},
{
"completion_length": 255.13394165039062,
"epoch": 0.6906666666666667,
"grad_norm": 0.9868527698980504,
"kl": 0.01116943359375,
"learning_rate": 2e-06,
"loss": 0.0518,
"reward": 0.6177918910980225,
"reward_std": 0.1076013594865799,
"rewards/length_reward": 0.07232140004634857,
"rewards/similarity_reward": 0.5454704165458679,
"step": 259
},
{
"completion_length": 211.96429443359375,
"epoch": 0.6933333333333334,
"grad_norm": 1.1390879691759828,
"kl": 0.01251220703125,
"learning_rate": 2e-06,
"loss": 0.0376,
"reward": 0.6133698225021362,
"reward_std": 0.1394728273153305,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5414947867393494,
"step": 260
},
{
"completion_length": 226.0491180419922,
"epoch": 0.696,
"grad_norm": 0.9181631341556423,
"kl": 0.01458740234375,
"learning_rate": 2e-06,
"loss": 0.0146,
"reward": 0.6710724234580994,
"reward_std": 0.12615807354450226,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.6041080951690674,
"step": 261
},
{
"completion_length": 290.71875,
"epoch": 0.6986666666666667,
"grad_norm": 0.9242817884998483,
"kl": 0.010986328125,
"learning_rate": 2e-06,
"loss": 0.0492,
"reward": 0.758361279964447,
"reward_std": 0.0939282700419426,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.680236279964447,
"step": 262
},
{
"completion_length": 295.2276916503906,
"epoch": 0.7013333333333334,
"grad_norm": 0.9268240432998979,
"kl": 0.01434326171875,
"learning_rate": 2e-06,
"loss": 0.0777,
"reward": 0.5887910723686218,
"reward_std": 0.15629605948925018,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5236124992370605,
"step": 263
},
{
"completion_length": 197.97769165039062,
"epoch": 0.704,
"grad_norm": 0.9100124092555839,
"kl": 0.01190185546875,
"learning_rate": 2e-06,
"loss": 0.0176,
"reward": 0.5667382478713989,
"reward_std": 0.1061021164059639,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.4993274211883545,
"step": 264
},
{
"completion_length": 242.96429443359375,
"epoch": 0.7066666666666667,
"grad_norm": 0.9638821889698813,
"kl": 0.0205078125,
"learning_rate": 2e-06,
"loss": 0.0458,
"reward": 0.6227900981903076,
"reward_std": 0.1025083139538765,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5571650862693787,
"step": 265
},
{
"completion_length": 223.3973388671875,
"epoch": 0.7093333333333334,
"grad_norm": 1.0581452656511359,
"kl": 0.01336669921875,
"learning_rate": 2e-06,
"loss": 0.0641,
"reward": 0.6723743081092834,
"reward_std": 0.1159893348813057,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.6071956753730774,
"step": 266
},
{
"completion_length": 270.2098388671875,
"epoch": 0.712,
"grad_norm": 0.8609101277095984,
"kl": 0.0146484375,
"learning_rate": 2e-06,
"loss": 0.0419,
"reward": 0.7352553009986877,
"reward_std": 0.13158555328845978,
"rewards/length_reward": 0.07499997317790985,
"rewards/similarity_reward": 0.6602552533149719,
"step": 267
},
{
"completion_length": 286.65179443359375,
"epoch": 0.7146666666666667,
"grad_norm": 0.85187609520963,
"kl": 0.01470947265625,
"learning_rate": 2e-06,
"loss": 0.0801,
"reward": 0.5860309600830078,
"reward_std": 0.12361589819192886,
"rewards/length_reward": 0.06562498211860657,
"rewards/similarity_reward": 0.5204059481620789,
"step": 268
},
{
"completion_length": 258.5,
"epoch": 0.7173333333333334,
"grad_norm": 0.8462850933355499,
"kl": 0.01080322265625,
"learning_rate": 2e-06,
"loss": 0.1016,
"reward": 0.7648903131484985,
"reward_std": 0.08282845467329025,
"rewards/length_reward": 0.08348211646080017,
"rewards/similarity_reward": 0.6814082264900208,
"step": 269
},
{
"completion_length": 255.4598388671875,
"epoch": 0.72,
"grad_norm": 0.8789894818999019,
"kl": 0.01507568359375,
"learning_rate": 2e-06,
"loss": 0.1231,
"reward": 0.6465427875518799,
"reward_std": 0.12229768186807632,
"rewards/length_reward": 0.06607140600681305,
"rewards/similarity_reward": 0.5804713368415833,
"step": 270
},
{
"completion_length": 273.01788330078125,
"epoch": 0.7226666666666667,
"grad_norm": 0.8695715294946503,
"kl": 0.0184326171875,
"learning_rate": 2e-06,
"loss": 0.087,
"reward": 0.6258493661880493,
"reward_std": 0.13294367492198944,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5553135871887207,
"step": 271
},
{
"completion_length": 260.8169860839844,
"epoch": 0.7253333333333334,
"grad_norm": 1.2217664906535957,
"kl": 0.015380859375,
"learning_rate": 2e-06,
"loss": 0.0431,
"reward": 0.6779768466949463,
"reward_std": 0.11835993826389313,
"rewards/length_reward": 0.07678568363189697,
"rewards/similarity_reward": 0.6011910438537598,
"step": 272
},
{
"completion_length": 284.625,
"epoch": 0.728,
"grad_norm": 0.70076206752431,
"kl": 0.01031494140625,
"learning_rate": 2e-06,
"loss": 0.0623,
"reward": 0.6957324147224426,
"reward_std": 0.11651583760976791,
"rewards/length_reward": 0.08348210901021957,
"rewards/similarity_reward": 0.6122502088546753,
"step": 273
},
{
"completion_length": 255.76339721679688,
"epoch": 0.7306666666666667,
"grad_norm": 0.9105713339266117,
"kl": 0.012939453125,
"learning_rate": 2e-06,
"loss": 0.0998,
"reward": 0.6966086626052856,
"reward_std": 0.08685937523841858,
"rewards/length_reward": 0.07187496870756149,
"rewards/similarity_reward": 0.624733567237854,
"step": 274
},
{
"completion_length": 296.76788330078125,
"epoch": 0.7333333333333333,
"grad_norm": 0.7414942240643484,
"kl": 0.00921630859375,
"learning_rate": 2e-06,
"loss": 0.0451,
"reward": 0.7098910212516785,
"reward_std": 0.09432552009820938,
"rewards/length_reward": 0.08482139557600021,
"rewards/similarity_reward": 0.6250695586204529,
"step": 275
},
{
"completion_length": 270.0669860839844,
"epoch": 0.736,
"grad_norm": 0.898333119316704,
"kl": 0.0137939453125,
"learning_rate": 2e-06,
"loss": -0.0225,
"reward": 0.7013087868690491,
"reward_std": 0.11285625398159027,
"rewards/length_reward": 0.0741071105003357,
"rewards/similarity_reward": 0.6272015571594238,
"step": 276
},
{
"completion_length": 198.93751525878906,
"epoch": 0.7386666666666667,
"grad_norm": 1.1511982372559852,
"kl": 0.0205078125,
"learning_rate": 2e-06,
"loss": 0.0183,
"reward": 0.5090009570121765,
"reward_std": 0.13011598587036133,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.4469473958015442,
"step": 277
},
{
"completion_length": 271.58038330078125,
"epoch": 0.7413333333333333,
"grad_norm": 0.8195827963319392,
"kl": 0.01092529296875,
"learning_rate": 2e-06,
"loss": 0.0099,
"reward": 0.6257685422897339,
"reward_std": 0.1082058921456337,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.5503220558166504,
"step": 278
},
{
"completion_length": 273.2232360839844,
"epoch": 0.744,
"grad_norm": 0.9459393517121532,
"kl": 0.01422119140625,
"learning_rate": 2e-06,
"loss": 0.0902,
"reward": 0.6197928190231323,
"reward_std": 0.15125982463359833,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.551042914390564,
"step": 279
},
{
"completion_length": 279.65625,
"epoch": 0.7466666666666667,
"grad_norm": 0.9319904211339567,
"kl": 0.0126953125,
"learning_rate": 2e-06,
"loss": 0.0701,
"reward": 0.6224436163902283,
"reward_std": 0.11631693691015244,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5523543357849121,
"step": 280
},
{
"completion_length": 289.61163330078125,
"epoch": 0.7493333333333333,
"grad_norm": 1.071648613346095,
"kl": 0.010009765625,
"learning_rate": 2e-06,
"loss": 0.1723,
"reward": 0.6961318850517273,
"reward_std": 0.10897497087717056,
"rewards/length_reward": 0.07723211497068405,
"rewards/similarity_reward": 0.6188997030258179,
"step": 281
},
{
"completion_length": 243.8348388671875,
"epoch": 0.752,
"grad_norm": 0.994726741013797,
"kl": 0.01495361328125,
"learning_rate": 2e-06,
"loss": 0.0637,
"reward": 0.6346572637557983,
"reward_std": 0.10820147395133972,
"rewards/length_reward": 0.07008925825357437,
"rewards/similarity_reward": 0.5645679831504822,
"step": 282
},
{
"completion_length": 251.38394165039062,
"epoch": 0.7546666666666667,
"grad_norm": 0.976491323713793,
"kl": 0.015380859375,
"learning_rate": 2e-06,
"loss": 0.0921,
"reward": 0.5713584423065186,
"reward_std": 0.1460532546043396,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.5061798095703125,
"step": 283
},
{
"completion_length": 295.8348388671875,
"epoch": 0.7573333333333333,
"grad_norm": 0.8830201574560093,
"kl": 0.0140380859375,
"learning_rate": 2e-06,
"loss": -0.0096,
"reward": 0.5986955761909485,
"reward_std": 0.15517514944076538,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5286062955856323,
"step": 284
},
{
"completion_length": 247.72769165039062,
"epoch": 0.76,
"grad_norm": 0.8557479345922782,
"kl": 0.01385498046875,
"learning_rate": 2e-06,
"loss": -0.0017,
"reward": 0.6696428656578064,
"reward_std": 0.10514307767152786,
"rewards/length_reward": 0.07857140153646469,
"rewards/similarity_reward": 0.5910714268684387,
"step": 285
},
{
"completion_length": 264.9196472167969,
"epoch": 0.7626666666666667,
"grad_norm": 0.900029253879394,
"kl": 0.01031494140625,
"learning_rate": 2e-06,
"loss": 0.011,
"reward": 0.6437191367149353,
"reward_std": 0.12881356477737427,
"rewards/length_reward": 0.07991068065166473,
"rewards/similarity_reward": 0.5638083815574646,
"step": 286
},
{
"completion_length": 275.76788330078125,
"epoch": 0.7653333333333333,
"grad_norm": 0.8434547706339766,
"kl": 0.01519775390625,
"learning_rate": 2e-06,
"loss": 0.0667,
"reward": 0.6621875762939453,
"reward_std": 0.15463142096996307,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5925447344779968,
"step": 287
},
{
"completion_length": 251.4866180419922,
"epoch": 0.768,
"grad_norm": 0.9853184231195431,
"kl": 0.01507568359375,
"learning_rate": 2e-06,
"loss": 0.0183,
"reward": 0.5950483679771423,
"reward_std": 0.1278952956199646,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.5196019411087036,
"step": 288
},
{
"completion_length": 279.9419860839844,
"epoch": 0.7706666666666667,
"grad_norm": 0.8374958728984951,
"kl": 0.0126953125,
"learning_rate": 2e-06,
"loss": 0.1275,
"reward": 0.6563042402267456,
"reward_std": 0.14691661298274994,
"rewards/length_reward": 0.07098212093114853,
"rewards/similarity_reward": 0.5853220820426941,
"step": 289
},
{
"completion_length": 283.12054443359375,
"epoch": 0.7733333333333333,
"grad_norm": 0.9945411525052974,
"kl": 0.01019287109375,
"learning_rate": 2e-06,
"loss": 0.0685,
"reward": 0.7337676882743835,
"reward_std": 0.09462190419435501,
"rewards/length_reward": 0.07633925974369049,
"rewards/similarity_reward": 0.6574283838272095,
"step": 290
},
{
"completion_length": 286.1026916503906,
"epoch": 0.776,
"grad_norm": 0.7519820653879999,
"kl": 0.01275634765625,
"learning_rate": 2e-06,
"loss": 0.069,
"reward": 0.6660787463188171,
"reward_std": 0.13770896196365356,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5915251970291138,
"step": 291
},
{
"completion_length": 300.4419860839844,
"epoch": 0.7786666666666666,
"grad_norm": 0.9216655286630218,
"kl": 0.01190185546875,
"learning_rate": 2e-06,
"loss": 0.1068,
"reward": 0.7210602164268494,
"reward_std": 0.15427368879318237,
"rewards/length_reward": 0.08035711199045181,
"rewards/similarity_reward": 0.6407030820846558,
"step": 292
},
{
"completion_length": 252.0178680419922,
"epoch": 0.7813333333333333,
"grad_norm": 0.8654140719017164,
"kl": 0.022705078125,
"learning_rate": 2e-06,
"loss": 0.1224,
"reward": 0.6323553919792175,
"reward_std": 0.1670098751783371,
"rewards/length_reward": 0.06651784479618073,
"rewards/similarity_reward": 0.5658375024795532,
"step": 293
},
{
"completion_length": 274.8839416503906,
"epoch": 0.784,
"grad_norm": 0.7786721291031314,
"kl": 0.0142822265625,
"learning_rate": 2e-06,
"loss": 0.0166,
"reward": 0.7442488074302673,
"reward_std": 0.10799020528793335,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.6696951389312744,
"step": 294
},
{
"completion_length": 279.1294860839844,
"epoch": 0.7866666666666666,
"grad_norm": 0.8056776017830058,
"kl": 0.011962890625,
"learning_rate": 2e-06,
"loss": 0.0048,
"reward": 0.6635159850120544,
"reward_std": 0.11003357172012329,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5934267044067383,
"step": 295
},
{
"completion_length": 254.96429443359375,
"epoch": 0.7893333333333333,
"grad_norm": 1.0354511241505295,
"kl": 0.0137939453125,
"learning_rate": 2e-06,
"loss": 0.0919,
"reward": 0.6252850294113159,
"reward_std": 0.10695895552635193,
"rewards/length_reward": 0.07142854481935501,
"rewards/similarity_reward": 0.553856372833252,
"step": 296
},
{
"completion_length": 233.25894165039062,
"epoch": 0.792,
"grad_norm": 0.9468597047570717,
"kl": 0.01275634765625,
"learning_rate": 2e-06,
"loss": -0.0101,
"reward": 0.6699472665786743,
"reward_std": 0.16828653216362,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.6003044247627258,
"step": 297
},
{
"completion_length": 249.1428680419922,
"epoch": 0.7946666666666666,
"grad_norm": 0.9140690517111535,
"kl": 0.01141357421875,
"learning_rate": 2e-06,
"loss": 0.072,
"reward": 0.6824041604995728,
"reward_std": 0.12977474927902222,
"rewards/length_reward": 0.08035711199045181,
"rewards/similarity_reward": 0.6020469069480896,
"step": 298
},
{
"completion_length": 276.3571472167969,
"epoch": 0.7973333333333333,
"grad_norm": 0.8259435738042828,
"kl": 0.01141357421875,
"learning_rate": 2e-06,
"loss": 0.0105,
"reward": 0.7071071863174438,
"reward_std": 0.08347765356302261,
"rewards/length_reward": 0.08169639110565186,
"rewards/similarity_reward": 0.625410795211792,
"step": 299
},
{
"completion_length": 270.0758972167969,
"epoch": 0.8,
"grad_norm": 0.8981450111371676,
"kl": 0.0126953125,
"learning_rate": 2e-06,
"loss": 0.0594,
"reward": 0.6137918829917908,
"reward_std": 0.12631313502788544,
"rewards/length_reward": 0.064732126891613,
"rewards/similarity_reward": 0.5490598082542419,
"step": 300
},
{
"completion_length": 278.83038330078125,
"epoch": 0.8026666666666666,
"grad_norm": 0.8303352041330266,
"kl": 0.01397705078125,
"learning_rate": 2e-06,
"loss": 0.0166,
"reward": 0.6976829767227173,
"reward_std": 0.11335788667201996,
"rewards/length_reward": 0.08169639110565186,
"rewards/similarity_reward": 0.6159866452217102,
"step": 301
},
{
"completion_length": 257.5357360839844,
"epoch": 0.8053333333333333,
"grad_norm": 0.8867998574709848,
"kl": 0.0135498046875,
"learning_rate": 2e-06,
"loss": 0.0754,
"reward": 0.6026350855827332,
"reward_std": 0.0930032953619957,
"rewards/length_reward": 0.07589282840490341,
"rewards/similarity_reward": 0.526742160320282,
"step": 302
},
{
"completion_length": 283.5669860839844,
"epoch": 0.808,
"grad_norm": 0.8168218668358965,
"kl": 0.0120849609375,
"learning_rate": 2e-06,
"loss": 0.0773,
"reward": 0.5895494222640991,
"reward_std": 0.1272886097431183,
"rewards/length_reward": 0.06874997913837433,
"rewards/similarity_reward": 0.5207993984222412,
"step": 303
},
{
"completion_length": 303.6026916503906,
"epoch": 0.8106666666666666,
"grad_norm": 0.8899545222480755,
"kl": 0.012939453125,
"learning_rate": 2e-06,
"loss": 0.0372,
"reward": 0.6067291498184204,
"reward_std": 0.1214829757809639,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5366398692131042,
"step": 304
},
{
"completion_length": 294.3482360839844,
"epoch": 0.8133333333333334,
"grad_norm": 0.8194602682013028,
"kl": 0.01153564453125,
"learning_rate": 2e-06,
"loss": 0.0566,
"reward": 0.7332960963249207,
"reward_std": 0.08479318022727966,
"rewards/length_reward": 0.07767854630947113,
"rewards/similarity_reward": 0.6556174755096436,
"step": 305
},
{
"completion_length": 266.4375,
"epoch": 0.816,
"grad_norm": 0.987377079764631,
"kl": 0.0133056640625,
"learning_rate": 2e-06,
"loss": 0.0409,
"reward": 0.6382983326911926,
"reward_std": 0.1240207627415657,
"rewards/length_reward": 0.07366068661212921,
"rewards/similarity_reward": 0.5646375417709351,
"step": 306
},
{
"completion_length": 267.95538330078125,
"epoch": 0.8186666666666667,
"grad_norm": 0.9217292474743543,
"kl": 0.01263427734375,
"learning_rate": 2e-06,
"loss": 0.0267,
"reward": 0.6616266965866089,
"reward_std": 0.1070173904299736,
"rewards/length_reward": 0.06964283436536789,
"rewards/similarity_reward": 0.5919837355613708,
"step": 307
},
{
"completion_length": 279.0535888671875,
"epoch": 0.8213333333333334,
"grad_norm": 0.8072891907153936,
"kl": 0.01385498046875,
"learning_rate": 2e-06,
"loss": 0.0943,
"reward": 0.6182869672775269,
"reward_std": 0.1361446976661682,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5464120507240295,
"step": 308
},
{
"completion_length": 287.90625,
"epoch": 0.824,
"grad_norm": 0.9054980085837359,
"kl": 0.01153564453125,
"learning_rate": 2e-06,
"loss": 0.068,
"reward": 0.7372510433197021,
"reward_std": 0.0957195907831192,
"rewards/length_reward": 0.07499997317790985,
"rewards/similarity_reward": 0.6622509956359863,
"step": 309
},
{
"completion_length": 258.3973388671875,
"epoch": 0.8266666666666667,
"grad_norm": 0.9082585914171709,
"kl": 0.01202392578125,
"learning_rate": 2e-06,
"loss": 0.0061,
"reward": 0.647227942943573,
"reward_std": 0.1310262531042099,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5766921043395996,
"step": 310
},
{
"completion_length": 221.2991180419922,
"epoch": 0.8293333333333334,
"grad_norm": 0.9448119727952166,
"kl": 0.021728515625,
"learning_rate": 2e-06,
"loss": -0.0416,
"reward": 0.6708490252494812,
"reward_std": 0.1254904717206955,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.5927240252494812,
"step": 311
},
{
"completion_length": 249.3616180419922,
"epoch": 0.832,
"grad_norm": 0.8967865636957736,
"kl": 0.010009765625,
"learning_rate": 2e-06,
"loss": 0.0332,
"reward": 0.7173448801040649,
"reward_std": 0.10435692220926285,
"rewards/length_reward": 0.08214282244443893,
"rewards/similarity_reward": 0.6352020502090454,
"step": 312
},
{
"completion_length": 246.80804443359375,
"epoch": 0.8346666666666667,
"grad_norm": 0.9778150152016866,
"kl": 0.01373291015625,
"learning_rate": 2e-06,
"loss": 0.0158,
"reward": 0.6462720036506653,
"reward_std": 0.13069510459899902,
"rewards/length_reward": 0.07499997317790985,
"rewards/similarity_reward": 0.5712720155715942,
"step": 313
},
{
"completion_length": 244.13839721679688,
"epoch": 0.8373333333333334,
"grad_norm": 0.9354621189513169,
"kl": 0.01214599609375,
"learning_rate": 2e-06,
"loss": 0.264,
"reward": 0.6220008730888367,
"reward_std": 0.12123651802539825,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.5599472522735596,
"step": 314
},
{
"completion_length": 269.5446472167969,
"epoch": 0.84,
"grad_norm": 0.9611408485021674,
"kl": 0.01513671875,
"learning_rate": 2e-06,
"loss": 0.0553,
"reward": 0.725965678691864,
"reward_std": 0.09657153487205505,
"rewards/length_reward": 0.07723211497068405,
"rewards/similarity_reward": 0.6487335562705994,
"step": 315
},
{
"completion_length": 274.84375,
"epoch": 0.8426666666666667,
"grad_norm": 0.8797127493353065,
"kl": 0.01129150390625,
"learning_rate": 2e-06,
"loss": 0.014,
"reward": 0.6985806226730347,
"reward_std": 0.11010481417179108,
"rewards/length_reward": 0.0741071105003357,
"rewards/similarity_reward": 0.6244734525680542,
"step": 316
},
{
"completion_length": 272.8348388671875,
"epoch": 0.8453333333333334,
"grad_norm": 0.779139564448991,
"kl": 0.01055908203125,
"learning_rate": 2e-06,
"loss": -0.0161,
"reward": 0.7422655820846558,
"reward_std": 0.06264423578977585,
"rewards/length_reward": 0.08124996721744537,
"rewards/similarity_reward": 0.6610156297683716,
"step": 317
},
{
"completion_length": 229.7678680419922,
"epoch": 0.848,
"grad_norm": 4.278780421719415,
"kl": 0.0390625,
"learning_rate": 2e-06,
"loss": 0.0681,
"reward": 0.6605138778686523,
"reward_std": 0.14701789617538452,
"rewards/length_reward": 0.07187496870756149,
"rewards/similarity_reward": 0.5886389017105103,
"step": 318
},
{
"completion_length": 286.2723388671875,
"epoch": 0.8506666666666667,
"grad_norm": 1.0371097952967725,
"kl": 0.01611328125,
"learning_rate": 2e-06,
"loss": 0.1372,
"reward": 0.6110987663269043,
"reward_std": 0.18697677552700043,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.549045205116272,
"step": 319
},
{
"completion_length": 260.0625,
"epoch": 0.8533333333333334,
"grad_norm": 0.9056186985134533,
"kl": 0.009521484375,
"learning_rate": 2e-06,
"loss": 0.0351,
"reward": 0.6572511196136475,
"reward_std": 0.08565808087587357,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.5791261196136475,
"step": 320
},
{
"completion_length": 281.71875,
"epoch": 0.856,
"grad_norm": 0.71560536286136,
"kl": 0.00860595703125,
"learning_rate": 2e-06,
"loss": 0.0613,
"reward": 0.7018586993217468,
"reward_std": 0.14483648538589478,
"rewards/length_reward": 0.08169639110565186,
"rewards/similarity_reward": 0.620162308216095,
"step": 321
},
{
"completion_length": 258.2276916503906,
"epoch": 0.8586666666666667,
"grad_norm": 0.9896625298898443,
"kl": 0.0181884765625,
"learning_rate": 2e-06,
"loss": 0.2194,
"reward": 0.5979973077774048,
"reward_std": 0.13931064307689667,
"rewards/length_reward": 0.06249998137354851,
"rewards/similarity_reward": 0.5354973077774048,
"step": 322
},
{
"completion_length": 237.1741180419922,
"epoch": 0.8613333333333333,
"grad_norm": 0.9609452774135127,
"kl": 0.01513671875,
"learning_rate": 2e-06,
"loss": 0.0016,
"reward": 0.6683059334754944,
"reward_std": 0.10867080092430115,
"rewards/length_reward": 0.07991068810224533,
"rewards/similarity_reward": 0.5883952975273132,
"step": 323
},
{
"completion_length": 254.01339721679688,
"epoch": 0.864,
"grad_norm": 1.0568546085260409,
"kl": 0.01531982421875,
"learning_rate": 2e-06,
"loss": 0.0933,
"reward": 0.6546286940574646,
"reward_std": 0.11366698145866394,
"rewards/length_reward": 0.07008926570415497,
"rewards/similarity_reward": 0.5845393538475037,
"step": 324
},
{
"completion_length": 242.03126525878906,
"epoch": 0.8666666666666667,
"grad_norm": 0.926913110151395,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": 0.0865,
"reward": 0.5440469980239868,
"reward_std": 0.11594089865684509,
"rewards/length_reward": 0.06383926421403885,
"rewards/similarity_reward": 0.48020774126052856,
"step": 325
},
{
"completion_length": 228.21429443359375,
"epoch": 0.8693333333333333,
"grad_norm": 1.0043052790420461,
"kl": 0.01385498046875,
"learning_rate": 2e-06,
"loss": -0.0272,
"reward": 0.6953443884849548,
"reward_std": 0.1263352483510971,
"rewards/length_reward": 0.06785711646080017,
"rewards/similarity_reward": 0.6274873614311218,
"step": 326
},
{
"completion_length": 307.3125,
"epoch": 0.872,
"grad_norm": 0.7734278067274474,
"kl": 0.0089111328125,
"learning_rate": 2e-06,
"loss": 0.0474,
"reward": 0.689052402973175,
"reward_std": 0.10326018929481506,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.6144987940788269,
"step": 327
},
{
"completion_length": 256.7589416503906,
"epoch": 0.8746666666666667,
"grad_norm": 0.8349456447351374,
"kl": 0.01275634765625,
"learning_rate": 2e-06,
"loss": -0.0053,
"reward": 0.6755567193031311,
"reward_std": 0.12627391517162323,
"rewards/length_reward": 0.07321426272392273,
"rewards/similarity_reward": 0.6023423671722412,
"step": 328
},
{
"completion_length": 272.67413330078125,
"epoch": 0.8773333333333333,
"grad_norm": 0.9337813221087722,
"kl": 0.013671875,
"learning_rate": 2e-06,
"loss": 0.0863,
"reward": 0.6436842083930969,
"reward_std": 0.1272781938314438,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5802912712097168,
"step": 329
},
{
"completion_length": 268.5401916503906,
"epoch": 0.88,
"grad_norm": 0.7822000698940798,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": 0.0076,
"reward": 0.6488507986068726,
"reward_std": 0.1486339569091797,
"rewards/length_reward": 0.07812497019767761,
"rewards/similarity_reward": 0.5707257986068726,
"step": 330
},
{
"completion_length": 255.2991180419922,
"epoch": 0.8826666666666667,
"grad_norm": 0.9970416796611882,
"kl": 0.01348876953125,
"learning_rate": 2e-06,
"loss": -0.0091,
"reward": 0.623367190361023,
"reward_std": 0.11435237526893616,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.547920823097229,
"step": 331
},
{
"completion_length": 281.2544860839844,
"epoch": 0.8853333333333333,
"grad_norm": 0.9393864746480084,
"kl": 0.0177001953125,
"learning_rate": 2e-06,
"loss": -0.0269,
"reward": 0.6835038661956787,
"reward_std": 0.12119947373867035,
"rewards/length_reward": 0.07366069406270981,
"rewards/similarity_reward": 0.6098431348800659,
"step": 332
},
{
"completion_length": 277.3883972167969,
"epoch": 0.888,
"grad_norm": 0.8857739277618905,
"kl": 0.0133056640625,
"learning_rate": 2e-06,
"loss": 0.1155,
"reward": 0.6458525657653809,
"reward_std": 0.10718663036823273,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5739776492118835,
"step": 333
},
{
"completion_length": 222.0491180419922,
"epoch": 0.8906666666666667,
"grad_norm": 0.9211484451974372,
"kl": 0.0145263671875,
"learning_rate": 2e-06,
"loss": 0.0326,
"reward": 0.6779581308364868,
"reward_std": 0.09158685058355331,
"rewards/length_reward": 0.07098212093114853,
"rewards/similarity_reward": 0.6069758534431458,
"step": 334
},
{
"completion_length": 253.54464721679688,
"epoch": 0.8933333333333333,
"grad_norm": 1.0439727694024494,
"kl": 0.01513671875,
"learning_rate": 2e-06,
"loss": 0.0938,
"reward": 0.6808232069015503,
"reward_std": 0.11848772317171097,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.6053767204284668,
"step": 335
},
{
"completion_length": 233.00001525878906,
"epoch": 0.896,
"grad_norm": 0.9838382076638493,
"kl": 0.015869140625,
"learning_rate": 2e-06,
"loss": 0.0451,
"reward": 0.58315509557724,
"reward_std": 0.11559745669364929,
"rewards/length_reward": 0.06741069257259369,
"rewards/similarity_reward": 0.5157443881034851,
"step": 336
},
{
"completion_length": 269.0133972167969,
"epoch": 0.8986666666666666,
"grad_norm": 0.836096159627277,
"kl": 0.01361083984375,
"learning_rate": 2e-06,
"loss": 0.0062,
"reward": 0.6409623622894287,
"reward_std": 0.12886659801006317,
"rewards/length_reward": 0.07767854630947113,
"rewards/similarity_reward": 0.5632836818695068,
"step": 337
},
{
"completion_length": 251.40179443359375,
"epoch": 0.9013333333333333,
"grad_norm": 0.8535550782434568,
"kl": 0.01422119140625,
"learning_rate": 2e-06,
"loss": 0.0089,
"reward": 0.6296460032463074,
"reward_std": 0.16196994483470917,
"rewards/length_reward": 0.07187497615814209,
"rewards/similarity_reward": 0.5577709674835205,
"step": 338
},
{
"completion_length": 244.80804443359375,
"epoch": 0.904,
"grad_norm": 1.0854210987310389,
"kl": 0.0181884765625,
"learning_rate": 2e-06,
"loss": 0.0595,
"reward": 0.597773551940918,
"reward_std": 0.10373269766569138,
"rewards/length_reward": 0.06339284032583237,
"rewards/similarity_reward": 0.5343807935714722,
"step": 339
},
{
"completion_length": 262.6875,
"epoch": 0.9066666666666666,
"grad_norm": 0.8356624214145669,
"kl": 0.013916015625,
"learning_rate": 2e-06,
"loss": 0.0101,
"reward": 0.6685509085655212,
"reward_std": 0.0960090234875679,
"rewards/length_reward": 0.07946424931287766,
"rewards/similarity_reward": 0.5890867114067078,
"step": 340
},
{
"completion_length": 262.5535888671875,
"epoch": 0.9093333333333333,
"grad_norm": 1.0702777784285404,
"kl": 0.02783203125,
"learning_rate": 2e-06,
"loss": 0.1024,
"reward": 0.5271078944206238,
"reward_std": 0.14886566996574402,
"rewards/length_reward": 0.064732126891613,
"rewards/similarity_reward": 0.46237578988075256,
"step": 341
},
{
"completion_length": 251.03126525878906,
"epoch": 0.912,
"grad_norm": 0.9235089036400403,
"kl": 0.0172119140625,
"learning_rate": 2e-06,
"loss": 0.0324,
"reward": 0.6528847217559814,
"reward_std": 0.11811169981956482,
"rewards/length_reward": 0.06383927166461945,
"rewards/similarity_reward": 0.5890454053878784,
"step": 342
},
{
"completion_length": 236.2857208251953,
"epoch": 0.9146666666666666,
"grad_norm": 0.9202873381431551,
"kl": 0.01556396484375,
"learning_rate": 2e-06,
"loss": 0.0497,
"reward": 0.6048458814620972,
"reward_std": 0.13773028552532196,
"rewards/length_reward": 0.07544640451669693,
"rewards/similarity_reward": 0.5293995141983032,
"step": 343
},
{
"completion_length": 309.99554443359375,
"epoch": 0.9173333333333333,
"grad_norm": 0.9489271796425148,
"kl": 0.01226806640625,
"learning_rate": 2e-06,
"loss": 0.0812,
"reward": 0.6291395425796509,
"reward_std": 0.15959399938583374,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5621752738952637,
"step": 344
},
{
"completion_length": 245.21429443359375,
"epoch": 0.92,
"grad_norm": 1.1363976997302783,
"kl": 0.0213623046875,
"learning_rate": 2e-06,
"loss": 0.1781,
"reward": 0.5569170713424683,
"reward_std": 0.14195482432842255,
"rewards/length_reward": 0.06160712614655495,
"rewards/similarity_reward": 0.49530985951423645,
"step": 345
},
{
"completion_length": 272.2232360839844,
"epoch": 0.9226666666666666,
"grad_norm": 0.7565012868381632,
"kl": 0.0108642578125,
"learning_rate": 2e-06,
"loss": 0.0695,
"reward": 0.6887885928153992,
"reward_std": 0.11395367234945297,
"rewards/length_reward": 0.07857140153646469,
"rewards/similarity_reward": 0.6102170348167419,
"step": 346
},
{
"completion_length": 310.28125,
"epoch": 0.9253333333333333,
"grad_norm": 0.782594647397142,
"kl": 0.01202392578125,
"learning_rate": 2e-06,
"loss": 0.0752,
"reward": 0.6732801795005798,
"reward_std": 0.12288369983434677,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5987265706062317,
"step": 347
},
{
"completion_length": 307.86163330078125,
"epoch": 0.928,
"grad_norm": 0.7087010173295871,
"kl": 0.013671875,
"learning_rate": 2e-06,
"loss": 0.008,
"reward": 0.6750614047050476,
"reward_std": 0.09951343387365341,
"rewards/length_reward": 0.07410712540149689,
"rewards/similarity_reward": 0.6009542942047119,
"step": 348
},
{
"completion_length": 273.3883972167969,
"epoch": 0.9306666666666666,
"grad_norm": 1.0456544147832767,
"kl": 0.01446533203125,
"learning_rate": 2e-06,
"loss": 0.0404,
"reward": 0.6893116235733032,
"reward_std": 0.09679926186800003,
"rewards/length_reward": 0.06517855077981949,
"rewards/similarity_reward": 0.6241331100463867,
"step": 349
},
{
"completion_length": 264.1919860839844,
"epoch": 0.9333333333333333,
"grad_norm": 0.8991803002318822,
"kl": 0.01397705078125,
"learning_rate": 2e-06,
"loss": 0.0129,
"reward": 0.6719235181808472,
"reward_std": 0.13838014006614685,
"rewards/length_reward": 0.07321426272392273,
"rewards/similarity_reward": 0.598709225654602,
"step": 350
},
{
"completion_length": 274.9508972167969,
"epoch": 0.936,
"grad_norm": 0.8591029679110356,
"kl": 0.013916015625,
"learning_rate": 2e-06,
"loss": 0.0312,
"reward": 0.5572786331176758,
"reward_std": 0.14849816262722015,
"rewards/length_reward": 0.06785711646080017,
"rewards/similarity_reward": 0.489421546459198,
"step": 351
},
{
"completion_length": 277.9508972167969,
"epoch": 0.9386666666666666,
"grad_norm": 0.8388349903111052,
"kl": 0.01141357421875,
"learning_rate": 2e-06,
"loss": 0.0278,
"reward": 0.681722104549408,
"reward_std": 0.0899442657828331,
"rewards/length_reward": 0.07991068065166473,
"rewards/similarity_reward": 0.6018112897872925,
"step": 352
},
{
"completion_length": 240.3348388671875,
"epoch": 0.9413333333333334,
"grad_norm": 0.867198144640214,
"kl": 0.01336669921875,
"learning_rate": 2e-06,
"loss": -0.0137,
"reward": 0.6065589785575867,
"reward_std": 0.11837570369243622,
"rewards/length_reward": 0.07946424931287766,
"rewards/similarity_reward": 0.5270946025848389,
"step": 353
},
{
"completion_length": 287.4151916503906,
"epoch": 0.944,
"grad_norm": 1.2363858371428533,
"kl": 0.0157470703125,
"learning_rate": 2e-06,
"loss": 0.0649,
"reward": 0.6748880743980408,
"reward_std": 0.13465073704719543,
"rewards/length_reward": 0.07187496870756149,
"rewards/similarity_reward": 0.6030132174491882,
"step": 354
},
{
"completion_length": 309.6919860839844,
"epoch": 0.9466666666666667,
"grad_norm": 0.6824545679415536,
"kl": 0.00946044921875,
"learning_rate": 2e-06,
"loss": 0.0881,
"reward": 0.6755697727203369,
"reward_std": 0.13920390605926514,
"rewards/length_reward": 0.07901783287525177,
"rewards/similarity_reward": 0.5965518355369568,
"step": 355
},
{
"completion_length": 304.0089416503906,
"epoch": 0.9493333333333334,
"grad_norm": 0.7612207527990814,
"kl": 0.00860595703125,
"learning_rate": 2e-06,
"loss": 0.0573,
"reward": 0.6815410852432251,
"reward_std": 0.11536341905593872,
"rewards/length_reward": 0.07455354928970337,
"rewards/similarity_reward": 0.6069875955581665,
"step": 356
},
{
"completion_length": 280.4151916503906,
"epoch": 0.952,
"grad_norm": 0.9197442279455559,
"kl": 0.014892578125,
"learning_rate": 2e-06,
"loss": 0.0976,
"reward": 0.6213651895523071,
"reward_std": 0.1452549546957016,
"rewards/length_reward": 0.059374988079071045,
"rewards/similarity_reward": 0.5619902014732361,
"step": 357
},
{
"completion_length": 311.9375,
"epoch": 0.9546666666666667,
"grad_norm": 0.7667231406265689,
"kl": 0.0123291015625,
"learning_rate": 2e-06,
"loss": 0.0949,
"reward": 0.671852707862854,
"reward_std": 0.12415429949760437,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.5972990989685059,
"step": 358
},
{
"completion_length": 296.4598388671875,
"epoch": 0.9573333333333334,
"grad_norm": 0.784755991417782,
"kl": 0.0093994140625,
"learning_rate": 2e-06,
"loss": 0.0284,
"reward": 0.7474254965782166,
"reward_std": 0.10959716141223907,
"rewards/length_reward": 0.07901783287525177,
"rewards/similarity_reward": 0.6684076189994812,
"step": 359
},
{
"completion_length": 253.15626525878906,
"epoch": 0.96,
"grad_norm": 0.8190677358549971,
"kl": 0.0125732421875,
"learning_rate": 2e-06,
"loss": -0.0109,
"reward": 0.6444076299667358,
"reward_std": 0.1199827641248703,
"rewards/length_reward": 0.07142855226993561,
"rewards/similarity_reward": 0.5729790925979614,
"step": 360
},
{
"completion_length": 306.5848388671875,
"epoch": 0.9626666666666667,
"grad_norm": 0.8327047575723723,
"kl": 0.0101318359375,
"learning_rate": 2e-06,
"loss": 0.044,
"reward": 0.6085981726646423,
"reward_std": 0.15067243576049805,
"rewards/length_reward": 0.07366069406270981,
"rewards/similarity_reward": 0.5349374413490295,
"step": 361
},
{
"completion_length": 254.7723388671875,
"epoch": 0.9653333333333334,
"grad_norm": 0.8753840404117226,
"kl": 0.0155029296875,
"learning_rate": 2e-06,
"loss": 0.0219,
"reward": 0.6461009383201599,
"reward_std": 0.11166159808635712,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5755651593208313,
"step": 362
},
{
"completion_length": 260.33929443359375,
"epoch": 0.968,
"grad_norm": 0.8487935107258318,
"kl": 0.0135498046875,
"learning_rate": 2e-06,
"loss": 0.0758,
"reward": 0.6731547713279724,
"reward_std": 0.0943944975733757,
"rewards/length_reward": 0.07946424931287766,
"rewards/similarity_reward": 0.5936905145645142,
"step": 363
},
{
"completion_length": 284.9107360839844,
"epoch": 0.9706666666666667,
"grad_norm": 0.9104348928736092,
"kl": 0.0146484375,
"learning_rate": 2e-06,
"loss": 0.0438,
"reward": 0.6029422879219055,
"reward_std": 0.13879723846912384,
"rewards/length_reward": 0.07053568959236145,
"rewards/similarity_reward": 0.5324065685272217,
"step": 364
},
{
"completion_length": 291.1339416503906,
"epoch": 0.9733333333333334,
"grad_norm": 0.8351250207880698,
"kl": 0.01312255859375,
"learning_rate": 2e-06,
"loss": 0.0447,
"reward": 0.6295793056488037,
"reward_std": 0.11455141007900238,
"rewards/length_reward": 0.07321426272392273,
"rewards/similarity_reward": 0.556364893913269,
"step": 365
},
{
"completion_length": 313.6160888671875,
"epoch": 0.976,
"grad_norm": 0.8331869500678173,
"kl": 0.0137939453125,
"learning_rate": 2e-06,
"loss": 0.1377,
"reward": 0.6497610807418823,
"reward_std": 0.13638634979724884,
"rewards/length_reward": 0.064732126891613,
"rewards/similarity_reward": 0.5850289463996887,
"step": 366
},
{
"completion_length": 260.46875,
"epoch": 0.9786666666666667,
"grad_norm": 0.9519334592833407,
"kl": 0.0184326171875,
"learning_rate": 2e-06,
"loss": 0.108,
"reward": 0.6215986609458923,
"reward_std": 0.13745638728141785,
"rewards/length_reward": 0.06696426123380661,
"rewards/similarity_reward": 0.5546343326568604,
"step": 367
},
{
"completion_length": 261.9196472167969,
"epoch": 0.9813333333333333,
"grad_norm": 0.9206057200583376,
"kl": 0.01361083984375,
"learning_rate": 2e-06,
"loss": 0.1112,
"reward": 0.5996190905570984,
"reward_std": 0.13816344738006592,
"rewards/length_reward": 0.0741071105003357,
"rewards/similarity_reward": 0.5255119204521179,
"step": 368
},
{
"completion_length": 187.60269165039062,
"epoch": 0.984,
"grad_norm": 34.2971123341626,
"kl": 0.0152587890625,
"learning_rate": 2e-06,
"loss": -0.0025,
"reward": 0.6047165989875793,
"reward_std": 0.11468542367219925,
"rewards/length_reward": 0.07276783138513565,
"rewards/similarity_reward": 0.5319487452507019,
"step": 369
},
{
"completion_length": 273.24554443359375,
"epoch": 0.9866666666666667,
"grad_norm": 1.0225592676611843,
"kl": 0.0157470703125,
"learning_rate": 2e-06,
"loss": 0.0974,
"reward": 0.5848848819732666,
"reward_std": 0.13747373223304749,
"rewards/length_reward": 0.06205355003476143,
"rewards/similarity_reward": 0.5228313207626343,
"step": 370
},
{
"completion_length": 242.43304443359375,
"epoch": 0.9893333333333333,
"grad_norm": 0.9519953414264258,
"kl": 0.01556396484375,
"learning_rate": 2e-06,
"loss": 0.1648,
"reward": 0.6343554854393005,
"reward_std": 0.14080199599266052,
"rewards/length_reward": 0.07142855226993561,
"rewards/similarity_reward": 0.5629268884658813,
"step": 371
},
{
"completion_length": 295.7633972167969,
"epoch": 0.992,
"grad_norm": 0.7534581065002547,
"kl": 0.0140380859375,
"learning_rate": 2e-06,
"loss": 0.0601,
"reward": 0.6254644393920898,
"reward_std": 0.14738810062408447,
"rewards/length_reward": 0.07232140004634857,
"rewards/similarity_reward": 0.5531430244445801,
"step": 372
},
{
"completion_length": 240.38394165039062,
"epoch": 0.9946666666666667,
"grad_norm": 0.9672618888481953,
"kl": 0.01239013671875,
"learning_rate": 2e-06,
"loss": 0.0627,
"reward": 0.6286079287528992,
"reward_std": 0.1355430781841278,
"rewards/length_reward": 0.07232140004634857,
"rewards/similarity_reward": 0.5562865734100342,
"step": 373
},
{
"completion_length": 296.3482360839844,
"epoch": 0.9973333333333333,
"grad_norm": 0.8090409603684708,
"kl": 0.0118408203125,
"learning_rate": 2e-06,
"loss": 0.0202,
"reward": 0.7064945697784424,
"reward_std": 0.08492975682020187,
"rewards/length_reward": 0.07455354183912277,
"rewards/similarity_reward": 0.631941020488739,
"step": 374
},
{
"completion_length": 207.4114227294922,
"epoch": 1.0,
"grad_norm": 0.9985931023470964,
"kl": 0.01458740234375,
"learning_rate": 2e-06,
"loss": 0.0328,
"reward": 0.6288642883300781,
"reward_std": 0.13037118315696716,
"rewards/length_reward": 0.07276783138513565,
"rewards/similarity_reward": 0.5560964941978455,
"step": 375
}
],
"logging_steps": 1,
"max_steps": 375,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}