{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 376.9687805175781, "epoch": 0.0026666666666666666, "grad_norm": 0.8290089342545105, "kl": 0.00012969970703125, "learning_rate": 2e-06, "loss": 0.1243, "reward": 0.26896461844444275, "reward_std": 0.2736624479293823, "rewards/length_reward": 0.026339290663599968, "rewards/similarity_reward": 0.24262532591819763, "step": 1 }, { "completion_length": 363.27679443359375, "epoch": 0.005333333333333333, "grad_norm": 0.8474061525539559, "kl": 0.00015354156494140625, "learning_rate": 2e-06, "loss": 0.1524, "reward": 0.1628378927707672, "reward_std": 0.21853798627853394, "rewards/length_reward": 0.0178571455180645, "rewards/similarity_reward": 0.14498072862625122, "step": 2 }, { "completion_length": 378.4732360839844, "epoch": 0.008, "grad_norm": 0.8192733588218007, "kl": 0.0001430511474609375, "learning_rate": 2e-06, "loss": 0.1852, "reward": 0.27797675132751465, "reward_std": 0.23766961693763733, "rewards/length_reward": 0.02633928880095482, "rewards/similarity_reward": 0.25163745880126953, "step": 3 }, { "completion_length": 359.21429443359375, "epoch": 0.010666666666666666, "grad_norm": 0.845054880446133, "kl": 0.00015544891357421875, "learning_rate": 2e-06, "loss": 0.0616, "reward": 0.1538199633359909, "reward_std": 0.17373259365558624, "rewards/length_reward": 0.013839286752045155, "rewards/similarity_reward": 0.13998067378997803, "step": 4 }, { "completion_length": 336.15625, "epoch": 0.013333333333333334, "grad_norm": 0.8717553512231005, "kl": 0.00014972686767578125, "learning_rate": 2e-06, "loss": 0.1209, "reward": 0.21149994432926178, "reward_std": 0.2120143473148346, "rewards/length_reward": 0.01830357313156128, "rewards/similarity_reward": 0.1931963711977005, "step": 5 }, { "completion_length": 373.6026916503906, "epoch": 0.016, "grad_norm": 0.7610025457969676, "kl": 0.00013446807861328125, "learning_rate": 2e-06, "loss": 0.1513, "reward": 0.1827303022146225, "reward_std": 0.24866001307964325, "rewards/length_reward": 0.02187500335276127, "rewards/similarity_reward": 0.16085529327392578, "step": 6 }, { "completion_length": 341.7589416503906, "epoch": 0.018666666666666668, "grad_norm": 0.8626611704865026, "kl": 0.0001583099365234375, "learning_rate": 2e-06, "loss": 0.1271, "reward": 0.19529196619987488, "reward_std": 0.2814559042453766, "rewards/length_reward": 0.021428575739264488, "rewards/similarity_reward": 0.17386338114738464, "step": 7 }, { "completion_length": 424.98663330078125, "epoch": 0.021333333333333333, "grad_norm": 0.7146043340468313, "kl": 0.00017452239990234375, "learning_rate": 2e-06, "loss": 0.1881, "reward": 0.21610300242900848, "reward_std": 0.2689198851585388, "rewards/length_reward": 0.01830357313156128, "rewards/similarity_reward": 0.197799414396286, "step": 8 }, { "completion_length": 348.2321472167969, "epoch": 0.024, "grad_norm": 0.7606250954270842, "kl": 0.000186920166015625, "learning_rate": 2e-06, "loss": 0.1276, "reward": 0.20473892986774445, "reward_std": 0.23727914690971375, "rewards/length_reward": 0.02008928917348385, "rewards/similarity_reward": 0.18464964628219604, "step": 9 }, { "completion_length": 355.54913330078125, "epoch": 0.02666666666666667, "grad_norm": 0.8173370703071066, "kl": 0.00018310546875, "learning_rate": 2e-06, "loss": 0.0787, "reward": 0.227640300989151, "reward_std": 0.2539962828159332, "rewards/length_reward": 0.01741071790456772, "rewards/similarity_reward": 0.21022957563400269, "step": 10 }, { "completion_length": 343.7232360839844, "epoch": 0.029333333333333333, "grad_norm": 0.9104553890156574, "kl": 0.00019550323486328125, "learning_rate": 2e-06, "loss": 0.0575, "reward": 0.25083568692207336, "reward_std": 0.2815442383289337, "rewards/length_reward": 0.02767857536673546, "rewards/similarity_reward": 0.22315707802772522, "step": 11 }, { "completion_length": 355.0535888671875, "epoch": 0.032, "grad_norm": 0.8007928014475878, "kl": 0.0003528594970703125, "learning_rate": 2e-06, "loss": 0.1448, "reward": 0.2787685990333557, "reward_std": 0.25941261649131775, "rewards/length_reward": 0.025892863050103188, "rewards/similarity_reward": 0.25287577509880066, "step": 12 }, { "completion_length": 395.4107360839844, "epoch": 0.034666666666666665, "grad_norm": 0.7050603406845205, "kl": 0.000255584716796875, "learning_rate": 2e-06, "loss": 0.1127, "reward": 0.31717172265052795, "reward_std": 0.2762907147407532, "rewards/length_reward": 0.02946428954601288, "rewards/similarity_reward": 0.28770744800567627, "step": 13 }, { "completion_length": 361.05804443359375, "epoch": 0.037333333333333336, "grad_norm": 0.9360978406768153, "kl": 0.0003376007080078125, "learning_rate": 2e-06, "loss": 0.142, "reward": 0.24003884196281433, "reward_std": 0.27974435687065125, "rewards/length_reward": 0.021428575739264488, "rewards/similarity_reward": 0.2186102569103241, "step": 14 }, { "completion_length": 398.0044860839844, "epoch": 0.04, "grad_norm": 0.7389563411116621, "kl": 0.0003604888916015625, "learning_rate": 2e-06, "loss": 0.1137, "reward": 0.23077349364757538, "reward_std": 0.24957218766212463, "rewards/length_reward": 0.0178571455180645, "rewards/similarity_reward": 0.21291638910770416, "step": 15 }, { "completion_length": 366.4196472167969, "epoch": 0.042666666666666665, "grad_norm": 0.7079515986093292, "kl": 0.0004177093505859375, "learning_rate": 2e-06, "loss": 0.1596, "reward": 0.16758890450000763, "reward_std": 0.1997506469488144, "rewards/length_reward": 0.0178571455180645, "rewards/similarity_reward": 0.14973175525665283, "step": 16 }, { "completion_length": 385.87054443359375, "epoch": 0.04533333333333334, "grad_norm": 0.7793857856999354, "kl": 0.000385284423828125, "learning_rate": 2e-06, "loss": 0.1863, "reward": 0.2408275306224823, "reward_std": 0.28883570432662964, "rewards/length_reward": 0.022321434691548347, "rewards/similarity_reward": 0.2185060679912567, "step": 17 }, { "completion_length": 373.0446472167969, "epoch": 0.048, "grad_norm": 0.8181367418694138, "kl": 0.0005035400390625, "learning_rate": 2e-06, "loss": 0.0998, "reward": 0.2590915262699127, "reward_std": 0.26667794585227966, "rewards/length_reward": 0.02678571827709675, "rewards/similarity_reward": 0.23230580985546112, "step": 18 }, { "completion_length": 314.1026916503906, "epoch": 0.050666666666666665, "grad_norm": 0.8473993736940718, "kl": 0.000438690185546875, "learning_rate": 2e-06, "loss": 0.0861, "reward": 0.34537845849990845, "reward_std": 0.26645439863204956, "rewards/length_reward": 0.03705357387661934, "rewards/similarity_reward": 0.3083249032497406, "step": 19 }, { "completion_length": 354.93304443359375, "epoch": 0.05333333333333334, "grad_norm": 0.8882148635006984, "kl": 0.00057220458984375, "learning_rate": 2e-06, "loss": 0.1732, "reward": 0.3256897032260895, "reward_std": 0.25436195731163025, "rewards/length_reward": 0.025446433573961258, "rewards/similarity_reward": 0.3002432584762573, "step": 20 }, { "completion_length": 354.1250305175781, "epoch": 0.056, "grad_norm": 0.8096248306365297, "kl": 0.000713348388671875, "learning_rate": 2e-06, "loss": 0.1809, "reward": 0.42166247963905334, "reward_std": 0.2462671995162964, "rewards/length_reward": 0.0401785746216774, "rewards/similarity_reward": 0.38148391246795654, "step": 21 }, { "completion_length": 379.8258972167969, "epoch": 0.058666666666666666, "grad_norm": 0.8324528992208251, "kl": 0.000713348388671875, "learning_rate": 2e-06, "loss": 0.1888, "reward": 0.4674707353115082, "reward_std": 0.28602704405784607, "rewards/length_reward": 0.03883929178118706, "rewards/similarity_reward": 0.42863139510154724, "step": 22 }, { "completion_length": 336.1964416503906, "epoch": 0.06133333333333333, "grad_norm": 0.883659092626158, "kl": 0.001129150390625, "learning_rate": 2e-06, "loss": 0.1224, "reward": 0.3572904169559479, "reward_std": 0.2817726135253906, "rewards/length_reward": 0.030803577974438667, "rewards/similarity_reward": 0.32648688554763794, "step": 23 }, { "completion_length": 379.8750305175781, "epoch": 0.064, "grad_norm": 0.7759947978320788, "kl": 0.00135040283203125, "learning_rate": 2e-06, "loss": 0.163, "reward": 0.34559932351112366, "reward_std": 0.26974016427993774, "rewards/length_reward": 0.03705357387661934, "rewards/similarity_reward": 0.3085457384586334, "step": 24 }, { "completion_length": 334.71875, "epoch": 0.06666666666666667, "grad_norm": 0.8358157724868338, "kl": 0.001068115234375, "learning_rate": 2e-06, "loss": 0.1326, "reward": 0.3908008337020874, "reward_std": 0.3024666905403137, "rewards/length_reward": 0.03928571566939354, "rewards/similarity_reward": 0.35151511430740356, "step": 25 }, { "completion_length": 372.55804443359375, "epoch": 0.06933333333333333, "grad_norm": 0.7610834907565935, "kl": 0.0027008056640625, "learning_rate": 2e-06, "loss": 0.1688, "reward": 0.2864897847175598, "reward_std": 0.2402629852294922, "rewards/length_reward": 0.02500000409781933, "rewards/similarity_reward": 0.2614898085594177, "step": 26 }, { "completion_length": 332.1651916503906, "epoch": 0.072, "grad_norm": 0.9340372327621089, "kl": 0.00152587890625, "learning_rate": 2e-06, "loss": 0.1936, "reward": 0.32003700733184814, "reward_std": 0.28589487075805664, "rewards/length_reward": 0.0334821492433548, "rewards/similarity_reward": 0.28655487298965454, "step": 27 }, { "completion_length": 421.08929443359375, "epoch": 0.07466666666666667, "grad_norm": 0.7506727526601732, "kl": 0.00148773193359375, "learning_rate": 2e-06, "loss": 0.1452, "reward": 0.39091238379478455, "reward_std": 0.21896174550056458, "rewards/length_reward": 0.03482143208384514, "rewards/similarity_reward": 0.3560909032821655, "step": 28 }, { "completion_length": 381.3348388671875, "epoch": 0.07733333333333334, "grad_norm": 0.6952200929063435, "kl": 0.0017242431640625, "learning_rate": 2e-06, "loss": 0.1113, "reward": 0.36936715245246887, "reward_std": 0.23492401838302612, "rewards/length_reward": 0.030357148498296738, "rewards/similarity_reward": 0.33900997042655945, "step": 29 }, { "completion_length": 334.6607360839844, "epoch": 0.08, "grad_norm": 0.9106612624614471, "kl": 0.00170135498046875, "learning_rate": 2e-06, "loss": 0.0756, "reward": 0.4293070435523987, "reward_std": 0.2823811173439026, "rewards/length_reward": 0.0401785746216774, "rewards/similarity_reward": 0.3891284763813019, "step": 30 }, { "completion_length": 461.99554443359375, "epoch": 0.08266666666666667, "grad_norm": 0.661013366993289, "kl": 0.0020751953125, "learning_rate": 2e-06, "loss": 0.2489, "reward": 0.3686121702194214, "reward_std": 0.2422313094139099, "rewards/length_reward": 0.028571434319019318, "rewards/similarity_reward": 0.3400407135486603, "step": 31 }, { "completion_length": 354.83038330078125, "epoch": 0.08533333333333333, "grad_norm": 0.9261837111640477, "kl": 0.0028533935546875, "learning_rate": 2e-06, "loss": 0.3225, "reward": 0.3788739740848541, "reward_std": 0.26962369680404663, "rewards/length_reward": 0.03482143208384514, "rewards/similarity_reward": 0.34405258297920227, "step": 32 }, { "completion_length": 355.5357360839844, "epoch": 0.088, "grad_norm": 0.8643961614615921, "kl": 0.0026397705078125, "learning_rate": 2e-06, "loss": 0.1893, "reward": 0.3799653649330139, "reward_std": 0.2525205910205841, "rewards/length_reward": 0.03303571790456772, "rewards/similarity_reward": 0.346929669380188, "step": 33 }, { "completion_length": 303.2901916503906, "epoch": 0.09066666666666667, "grad_norm": 0.9531070627714385, "kl": 0.002532958984375, "learning_rate": 2e-06, "loss": 0.0718, "reward": 0.435234397649765, "reward_std": 0.2490427941083908, "rewards/length_reward": 0.03883928805589676, "rewards/similarity_reward": 0.39639511704444885, "step": 34 }, { "completion_length": 391.89288330078125, "epoch": 0.09333333333333334, "grad_norm": 0.8111465127450542, "kl": 0.0019683837890625, "learning_rate": 2e-06, "loss": 0.1647, "reward": 0.4038808047771454, "reward_std": 0.31206637620925903, "rewards/length_reward": 0.03883928805589676, "rewards/similarity_reward": 0.3650415241718292, "step": 35 }, { "completion_length": 359.9196472167969, "epoch": 0.096, "grad_norm": 0.8281785424636492, "kl": 0.00335693359375, "learning_rate": 2e-06, "loss": 0.1875, "reward": 0.4340634346008301, "reward_std": 0.27002009749412537, "rewards/length_reward": 0.04151785373687744, "rewards/similarity_reward": 0.39254552125930786, "step": 36 }, { "completion_length": 287.6875, "epoch": 0.09866666666666667, "grad_norm": 1.053364246590671, "kl": 0.00518798828125, "learning_rate": 2e-06, "loss": 0.085, "reward": 0.47383809089660645, "reward_std": 0.22637499868869781, "rewards/length_reward": 0.04374999925494194, "rewards/similarity_reward": 0.430088073015213, "step": 37 }, { "completion_length": 398.1160888671875, "epoch": 0.10133333333333333, "grad_norm": 0.7805356469167566, "kl": 0.00341796875, "learning_rate": 2e-06, "loss": 0.2197, "reward": 0.4928036332130432, "reward_std": 0.24267539381980896, "rewards/length_reward": 0.050446417182683945, "rewards/similarity_reward": 0.4423570930957794, "step": 38 }, { "completion_length": 312.7276916503906, "epoch": 0.104, "grad_norm": 0.819085900048414, "kl": 0.00177001953125, "learning_rate": 2e-06, "loss": 0.1426, "reward": 0.5058891177177429, "reward_std": 0.24202971160411835, "rewards/length_reward": 0.04776785522699356, "rewards/similarity_reward": 0.4581212103366852, "step": 39 }, { "completion_length": 386.8214416503906, "epoch": 0.10666666666666667, "grad_norm": 0.7650271991129984, "kl": 0.0030364990234375, "learning_rate": 2e-06, "loss": 0.1882, "reward": 0.4022373855113983, "reward_std": 0.24781934916973114, "rewards/length_reward": 0.0401785746216774, "rewards/similarity_reward": 0.3620587885379791, "step": 40 }, { "completion_length": 250.4107208251953, "epoch": 0.10933333333333334, "grad_norm": 0.9803970599540968, "kl": 0.003875732421875, "learning_rate": 2e-06, "loss": -0.0008, "reward": 0.5008234977722168, "reward_std": 0.22121772170066833, "rewards/length_reward": 0.050446417182683945, "rewards/similarity_reward": 0.45037704706192017, "step": 41 }, { "completion_length": 317.1160888671875, "epoch": 0.112, "grad_norm": 0.8986374178737812, "kl": 0.00341796875, "learning_rate": 2e-06, "loss": 0.2065, "reward": 0.45162686705589294, "reward_std": 0.27914097905158997, "rewards/length_reward": 0.04598214104771614, "rewards/similarity_reward": 0.4056447148323059, "step": 42 }, { "completion_length": 313.4821472167969, "epoch": 0.11466666666666667, "grad_norm": 0.9439169733302692, "kl": 0.0040283203125, "learning_rate": 2e-06, "loss": 0.1189, "reward": 0.5015469789505005, "reward_std": 0.2071218341588974, "rewards/length_reward": 0.050892848521471024, "rewards/similarity_reward": 0.4506540596485138, "step": 43 }, { "completion_length": 308.23663330078125, "epoch": 0.11733333333333333, "grad_norm": 0.843848181524653, "kl": 0.004150390625, "learning_rate": 2e-06, "loss": 0.1423, "reward": 0.49464964866638184, "reward_std": 0.19760312139987946, "rewards/length_reward": 0.05223213508725166, "rewards/similarity_reward": 0.4424174726009369, "step": 44 }, { "completion_length": 322.51788330078125, "epoch": 0.12, "grad_norm": 0.808670899867239, "kl": 0.004730224609375, "learning_rate": 2e-06, "loss": 0.2162, "reward": 0.45207658410072327, "reward_std": 0.22255302965641022, "rewards/length_reward": 0.03883929178118706, "rewards/similarity_reward": 0.4132373034954071, "step": 45 }, { "completion_length": 306.9732360839844, "epoch": 0.12266666666666666, "grad_norm": 0.9215129819151354, "kl": 0.0038604736328125, "learning_rate": 2e-06, "loss": 0.1492, "reward": 0.5217949151992798, "reward_std": 0.24197062849998474, "rewards/length_reward": 0.04598213732242584, "rewards/similarity_reward": 0.47581273317337036, "step": 46 }, { "completion_length": 305.45538330078125, "epoch": 0.12533333333333332, "grad_norm": 0.8829840145792894, "kl": 0.00518798828125, "learning_rate": 2e-06, "loss": 0.2397, "reward": 0.471711665391922, "reward_std": 0.15981332957744598, "rewards/length_reward": 0.049553561955690384, "rewards/similarity_reward": 0.42215806245803833, "step": 47 }, { "completion_length": 294.4419860839844, "epoch": 0.128, "grad_norm": 0.8246638858466566, "kl": 0.004241943359375, "learning_rate": 2e-06, "loss": 0.0913, "reward": 0.4828924238681793, "reward_std": 0.19750112295150757, "rewards/length_reward": 0.053124990314245224, "rewards/similarity_reward": 0.429767370223999, "step": 48 }, { "completion_length": 332.45538330078125, "epoch": 0.13066666666666665, "grad_norm": 0.7904388485187617, "kl": 0.004913330078125, "learning_rate": 2e-06, "loss": 0.1529, "reward": 0.45801258087158203, "reward_std": 0.2542867660522461, "rewards/length_reward": 0.04196428507566452, "rewards/similarity_reward": 0.4160482585430145, "step": 49 }, { "completion_length": 281.77679443359375, "epoch": 0.13333333333333333, "grad_norm": 0.9845645428183626, "kl": 0.00433349609375, "learning_rate": 2e-06, "loss": 0.0801, "reward": 0.5682670474052429, "reward_std": 0.23296673595905304, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5030884742736816, "step": 50 }, { "completion_length": 316.7410888671875, "epoch": 0.136, "grad_norm": 0.9300429591791828, "kl": 0.00592041015625, "learning_rate": 2e-06, "loss": 0.103, "reward": 0.44630467891693115, "reward_std": 0.12811601161956787, "rewards/length_reward": 0.04821427911520004, "rewards/similarity_reward": 0.3980904519557953, "step": 51 }, { "completion_length": 296.8973388671875, "epoch": 0.13866666666666666, "grad_norm": 0.8592422567531082, "kl": 0.005218505859375, "learning_rate": 2e-06, "loss": 0.1159, "reward": 0.5130535960197449, "reward_std": 0.1873682290315628, "rewards/length_reward": 0.0491071380674839, "rewards/similarity_reward": 0.4639464318752289, "step": 52 }, { "completion_length": 284.0401916503906, "epoch": 0.14133333333333334, "grad_norm": 0.8593724590061699, "kl": 0.0048828125, "learning_rate": 2e-06, "loss": 0.0087, "reward": 0.5250208973884583, "reward_std": 0.21563619375228882, "rewards/length_reward": 0.057142842561006546, "rewards/similarity_reward": 0.46787798404693604, "step": 53 }, { "completion_length": 268.21429443359375, "epoch": 0.144, "grad_norm": 0.9220995505083402, "kl": 0.005645751953125, "learning_rate": 2e-06, "loss": 0.0234, "reward": 0.6078009605407715, "reward_std": 0.18404294550418854, "rewards/length_reward": 0.04508928582072258, "rewards/similarity_reward": 0.5627117156982422, "step": 54 }, { "completion_length": 332.2232360839844, "epoch": 0.14666666666666667, "grad_norm": 0.8153244083746986, "kl": 0.004852294921875, "learning_rate": 2e-06, "loss": 0.1089, "reward": 0.5709711313247681, "reward_std": 0.18112631142139435, "rewards/length_reward": 0.058035701513290405, "rewards/similarity_reward": 0.5129354596138, "step": 55 }, { "completion_length": 298.62054443359375, "epoch": 0.14933333333333335, "grad_norm": 0.861900790561817, "kl": 0.00567626953125, "learning_rate": 2e-06, "loss": 0.1037, "reward": 0.5129757523536682, "reward_std": 0.21154648065567017, "rewards/length_reward": 0.05401784926652908, "rewards/similarity_reward": 0.45895785093307495, "step": 56 }, { "completion_length": 243.4241180419922, "epoch": 0.152, "grad_norm": 0.928994891862699, "kl": 0.004241943359375, "learning_rate": 2e-06, "loss": 0.0328, "reward": 0.6340307593345642, "reward_std": 0.16285859048366547, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.56840580701828, "step": 57 }, { "completion_length": 293.0669860839844, "epoch": 0.15466666666666667, "grad_norm": 0.89777989101008, "kl": 0.00579833984375, "learning_rate": 2e-06, "loss": 0.0313, "reward": 0.5502158999443054, "reward_std": 0.1914074867963791, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.48503735661506653, "step": 58 }, { "completion_length": 348.3973388671875, "epoch": 0.15733333333333333, "grad_norm": 0.830581459672137, "kl": 0.005706787109375, "learning_rate": 2e-06, "loss": 0.1262, "reward": 0.5427281260490417, "reward_std": 0.18273915350437164, "rewards/length_reward": 0.04508928582072258, "rewards/similarity_reward": 0.4976387917995453, "step": 59 }, { "completion_length": 300.43304443359375, "epoch": 0.16, "grad_norm": 0.7841145484798535, "kl": 0.007080078125, "learning_rate": 2e-06, "loss": 0.0356, "reward": 0.6150097846984863, "reward_std": 0.15837538242340088, "rewards/length_reward": 0.06383926421403885, "rewards/similarity_reward": 0.5511705279350281, "step": 60 }, { "completion_length": 291.64288330078125, "epoch": 0.16266666666666665, "grad_norm": 0.9759946887460155, "kl": 0.005645751953125, "learning_rate": 2e-06, "loss": 0.1347, "reward": 0.6720048785209656, "reward_std": 0.16562286019325256, "rewards/length_reward": 0.06428569555282593, "rewards/similarity_reward": 0.6077191233634949, "step": 61 }, { "completion_length": 300.3973388671875, "epoch": 0.16533333333333333, "grad_norm": 0.8353754395282778, "kl": 0.00592041015625, "learning_rate": 2e-06, "loss": 0.1508, "reward": 0.6174642443656921, "reward_std": 0.1775916963815689, "rewards/length_reward": 0.05848212540149689, "rewards/similarity_reward": 0.5589820742607117, "step": 62 }, { "completion_length": 299.0, "epoch": 0.168, "grad_norm": 0.8434412806636016, "kl": 0.0068359375, "learning_rate": 2e-06, "loss": 0.0787, "reward": 0.5795109272003174, "reward_std": 0.18212977051734924, "rewards/length_reward": 0.056249987334012985, "rewards/similarity_reward": 0.5232609510421753, "step": 63 }, { "completion_length": 314.3883972167969, "epoch": 0.17066666666666666, "grad_norm": 1.1818234014256608, "kl": 0.005706787109375, "learning_rate": 2e-06, "loss": 0.2756, "reward": 0.5499185919761658, "reward_std": 0.22555634379386902, "rewards/length_reward": 0.04776785522699356, "rewards/similarity_reward": 0.5021507143974304, "step": 64 }, { "completion_length": 259.1785888671875, "epoch": 0.17333333333333334, "grad_norm": 0.9529921486665629, "kl": 0.006439208984375, "learning_rate": 2e-06, "loss": 0.0704, "reward": 0.5430376529693604, "reward_std": 0.2042228877544403, "rewards/length_reward": 0.052232131361961365, "rewards/similarity_reward": 0.4908054769039154, "step": 65 }, { "completion_length": 258.0714416503906, "epoch": 0.176, "grad_norm": 1.1488507934693044, "kl": 0.006805419921875, "learning_rate": 2e-06, "loss": 0.1057, "reward": 0.570214033126831, "reward_std": 0.160283625125885, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.505035400390625, "step": 66 }, { "completion_length": 330.90179443359375, "epoch": 0.17866666666666667, "grad_norm": 0.912379625363708, "kl": 0.0062255859375, "learning_rate": 2e-06, "loss": 0.1291, "reward": 0.49484553933143616, "reward_std": 0.21234968304634094, "rewards/length_reward": 0.04553570970892906, "rewards/similarity_reward": 0.4493098556995392, "step": 67 }, { "completion_length": 266.46429443359375, "epoch": 0.18133333333333335, "grad_norm": 0.9382639131370187, "kl": 0.00909423828125, "learning_rate": 2e-06, "loss": 0.0685, "reward": 0.5787621736526489, "reward_std": 0.17865508794784546, "rewards/length_reward": 0.054464273154735565, "rewards/similarity_reward": 0.5242978930473328, "step": 68 }, { "completion_length": 285.25, "epoch": 0.184, "grad_norm": 0.8385679542137942, "kl": 0.00555419921875, "learning_rate": 2e-06, "loss": 0.046, "reward": 0.6689252257347107, "reward_std": 0.16466915607452393, "rewards/length_reward": 0.06651783734560013, "rewards/similarity_reward": 0.6024073958396912, "step": 69 }, { "completion_length": 247.68751525878906, "epoch": 0.18666666666666668, "grad_norm": 1.01200025847724, "kl": 0.00860595703125, "learning_rate": 2e-06, "loss": 0.1382, "reward": 0.4780524969100952, "reward_std": 0.19645950198173523, "rewards/length_reward": 0.04196428507566452, "rewards/similarity_reward": 0.4360882043838501, "step": 70 }, { "completion_length": 307.24554443359375, "epoch": 0.18933333333333333, "grad_norm": 0.8185082695628789, "kl": 0.00787353515625, "learning_rate": 2e-06, "loss": 0.0749, "reward": 0.5303549766540527, "reward_std": 0.1896388977766037, "rewards/length_reward": 0.056696414947509766, "rewards/similarity_reward": 0.4736584722995758, "step": 71 }, { "completion_length": 329.15179443359375, "epoch": 0.192, "grad_norm": 0.8562549539520792, "kl": 0.00823974609375, "learning_rate": 2e-06, "loss": 0.1642, "reward": 0.5008990168571472, "reward_std": 0.17187656462192535, "rewards/length_reward": 0.050892848521471024, "rewards/similarity_reward": 0.4500061273574829, "step": 72 }, { "completion_length": 253.59376525878906, "epoch": 0.19466666666666665, "grad_norm": 0.8806238339574037, "kl": 0.006591796875, "learning_rate": 2e-06, "loss": 0.1082, "reward": 0.7047773003578186, "reward_std": 0.12662379443645477, "rewards/length_reward": 0.07232140004634857, "rewards/similarity_reward": 0.6324558258056641, "step": 73 }, { "completion_length": 302.7321472167969, "epoch": 0.19733333333333333, "grad_norm": 0.888373625390179, "kl": 0.01300048828125, "learning_rate": 2e-06, "loss": 0.0384, "reward": 0.5046581625938416, "reward_std": 0.18071489036083221, "rewards/length_reward": 0.0491071380674839, "rewards/similarity_reward": 0.45555105805397034, "step": 74 }, { "completion_length": 309.0982360839844, "epoch": 0.2, "grad_norm": 0.8352994571315709, "kl": 0.0081787109375, "learning_rate": 2e-06, "loss": 0.1895, "reward": 0.6111252903938293, "reward_std": 0.19863885641098022, "rewards/length_reward": 0.054464273154735565, "rewards/similarity_reward": 0.5566610097885132, "step": 75 }, { "completion_length": 255.25001525878906, "epoch": 0.20266666666666666, "grad_norm": 1.0786021298964794, "kl": 0.0089111328125, "learning_rate": 2e-06, "loss": 0.0384, "reward": 0.5118966102600098, "reward_std": 0.1661101132631302, "rewards/length_reward": 0.05223213508725166, "rewards/similarity_reward": 0.4596644341945648, "step": 76 }, { "completion_length": 299.92413330078125, "epoch": 0.20533333333333334, "grad_norm": 0.887828324089484, "kl": 0.0128173828125, "learning_rate": 2e-06, "loss": 0.1341, "reward": 0.5058793425559998, "reward_std": 0.2038315385580063, "rewards/length_reward": 0.05044642463326454, "rewards/similarity_reward": 0.45543283224105835, "step": 77 }, { "completion_length": 300.26788330078125, "epoch": 0.208, "grad_norm": 0.960422578229874, "kl": 0.01080322265625, "learning_rate": 2e-06, "loss": 0.2173, "reward": 0.5535677075386047, "reward_std": 0.16259299218654633, "rewards/length_reward": 0.06205355376005173, "rewards/similarity_reward": 0.49151411652565, "step": 78 }, { "completion_length": 256.21875, "epoch": 0.21066666666666667, "grad_norm": 0.9394611442130687, "kl": 0.01171875, "learning_rate": 2e-06, "loss": 0.031, "reward": 0.6236703991889954, "reward_std": 0.16783180832862854, "rewards/length_reward": 0.05982141196727753, "rewards/similarity_reward": 0.5638489127159119, "step": 79 }, { "completion_length": 279.9419860839844, "epoch": 0.21333333333333335, "grad_norm": 1.0860808591863038, "kl": 0.0096435546875, "learning_rate": 2e-06, "loss": 0.2206, "reward": 0.5311146974563599, "reward_std": 0.20672693848609924, "rewards/length_reward": 0.052678562700748444, "rewards/similarity_reward": 0.47843608260154724, "step": 80 }, { "completion_length": 302.7008972167969, "epoch": 0.216, "grad_norm": 0.7695067843560371, "kl": 0.00653076171875, "learning_rate": 2e-06, "loss": 0.0387, "reward": 0.5848192572593689, "reward_std": 0.20639710128307343, "rewards/length_reward": 0.060267843306064606, "rewards/similarity_reward": 0.5245514512062073, "step": 81 }, { "completion_length": 247.18751525878906, "epoch": 0.21866666666666668, "grad_norm": 1.0172061111487714, "kl": 0.009765625, "learning_rate": 2e-06, "loss": 0.0528, "reward": 0.6127398610115051, "reward_std": 0.13182979822158813, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5471147894859314, "step": 82 }, { "completion_length": 303.40625, "epoch": 0.22133333333333333, "grad_norm": 0.7610369219853271, "kl": 0.00921630859375, "learning_rate": 2e-06, "loss": 0.1034, "reward": 0.5650977492332458, "reward_std": 0.16646917164325714, "rewards/length_reward": 0.056696418672800064, "rewards/similarity_reward": 0.5084013342857361, "step": 83 }, { "completion_length": 296.625, "epoch": 0.224, "grad_norm": 0.8819878638905205, "kl": 0.00653076171875, "learning_rate": 2e-06, "loss": 0.0509, "reward": 0.6824392676353455, "reward_std": 0.13198219239711761, "rewards/length_reward": 0.07678568363189697, "rewards/similarity_reward": 0.6056535243988037, "step": 84 }, { "completion_length": 264.4375, "epoch": 0.22666666666666666, "grad_norm": 0.9860703009968039, "kl": 0.0103759765625, "learning_rate": 2e-06, "loss": 0.1082, "reward": 0.5452346205711365, "reward_std": 0.18002980947494507, "rewards/length_reward": 0.055357132107019424, "rewards/similarity_reward": 0.48987752199172974, "step": 85 }, { "completion_length": 275.1696472167969, "epoch": 0.22933333333333333, "grad_norm": 0.876007599982239, "kl": 0.007720947265625, "learning_rate": 2e-06, "loss": 0.1075, "reward": 0.5654360055923462, "reward_std": 0.15497317910194397, "rewards/length_reward": 0.050892848521471024, "rewards/similarity_reward": 0.5145430564880371, "step": 86 }, { "completion_length": 268.3883972167969, "epoch": 0.232, "grad_norm": 0.9877196366166759, "kl": 0.00872802734375, "learning_rate": 2e-06, "loss": 0.0664, "reward": 0.6144102811813354, "reward_std": 0.1374298632144928, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5469995141029358, "step": 87 }, { "completion_length": 339.67413330078125, "epoch": 0.23466666666666666, "grad_norm": 0.6868950012188707, "kl": 0.006072998046875, "learning_rate": 2e-06, "loss": 0.0478, "reward": 0.6562062501907349, "reward_std": 0.1523490995168686, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5843312740325928, "step": 88 }, { "completion_length": 262.4598388671875, "epoch": 0.23733333333333334, "grad_norm": 0.8224596138062096, "kl": 0.006378173828125, "learning_rate": 2e-06, "loss": 0.101, "reward": 0.6333271265029907, "reward_std": 0.170791357755661, "rewards/length_reward": 0.06785711646080017, "rewards/similarity_reward": 0.5654700398445129, "step": 89 }, { "completion_length": 256.42413330078125, "epoch": 0.24, "grad_norm": 0.9173507116779652, "kl": 0.00946044921875, "learning_rate": 2e-06, "loss": 0.0437, "reward": 0.5871028900146484, "reward_std": 0.16378919780254364, "rewards/length_reward": 0.04732142388820648, "rewards/similarity_reward": 0.5397815108299255, "step": 90 }, { "completion_length": 350.6562805175781, "epoch": 0.24266666666666667, "grad_norm": 0.7989839039096065, "kl": 0.01251220703125, "learning_rate": 2e-06, "loss": 0.0614, "reward": 0.5199065208435059, "reward_std": 0.20764127373695374, "rewards/length_reward": 0.05044642463326454, "rewards/similarity_reward": 0.4694600999355316, "step": 91 }, { "completion_length": 249.0044708251953, "epoch": 0.24533333333333332, "grad_norm": 0.9605121844841826, "kl": 0.01123046875, "learning_rate": 2e-06, "loss": 0.036, "reward": 0.6044343113899231, "reward_std": 0.164906844496727, "rewards/length_reward": 0.057589270174503326, "rewards/similarity_reward": 0.5468449592590332, "step": 92 }, { "completion_length": 312.2232360839844, "epoch": 0.248, "grad_norm": 0.836379585280954, "kl": 0.007781982421875, "learning_rate": 2e-06, "loss": 0.0948, "reward": 0.5776776075363159, "reward_std": 0.15271225571632385, "rewards/length_reward": 0.055803555995225906, "rewards/similarity_reward": 0.5218740701675415, "step": 93 }, { "completion_length": 311.3482360839844, "epoch": 0.25066666666666665, "grad_norm": 0.7945534599494852, "kl": 0.00897216796875, "learning_rate": 2e-06, "loss": 0.1441, "reward": 0.6111860275268555, "reward_std": 0.13805179297924042, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5442216992378235, "step": 94 }, { "completion_length": 234.93751525878906, "epoch": 0.25333333333333335, "grad_norm": 1.2028189171950667, "kl": 0.00909423828125, "learning_rate": 2e-06, "loss": 0.1766, "reward": 0.5961614847183228, "reward_std": 0.16394107043743134, "rewards/length_reward": 0.05982141196727753, "rewards/similarity_reward": 0.5363399982452393, "step": 95 }, { "completion_length": 305.6964416503906, "epoch": 0.256, "grad_norm": 0.8067577376172387, "kl": 0.00982666015625, "learning_rate": 2e-06, "loss": 0.0671, "reward": 0.656367301940918, "reward_std": 0.12278923392295837, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.5876173377037048, "step": 96 }, { "completion_length": 327.71429443359375, "epoch": 0.25866666666666666, "grad_norm": 0.770922327161602, "kl": 0.0064697265625, "learning_rate": 2e-06, "loss": 0.0279, "reward": 0.5429174304008484, "reward_std": 0.16670171916484833, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.47729235887527466, "step": 97 }, { "completion_length": 256.62054443359375, "epoch": 0.2613333333333333, "grad_norm": 1.051628532134925, "kl": 0.0072021484375, "learning_rate": 2e-06, "loss": 0.0872, "reward": 0.5515283346176147, "reward_std": 0.17894278466701508, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.48947471380233765, "step": 98 }, { "completion_length": 268.59375, "epoch": 0.264, "grad_norm": 0.8240697442290599, "kl": 0.008544921875, "learning_rate": 2e-06, "loss": 0.0672, "reward": 0.6131307482719421, "reward_std": 0.17423538863658905, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.5510770678520203, "step": 99 }, { "completion_length": 257.37054443359375, "epoch": 0.26666666666666666, "grad_norm": 131.12733526048441, "kl": 0.1923828125, "learning_rate": 2e-06, "loss": 0.0817, "reward": 0.6513006687164307, "reward_std": 0.14835356175899506, "rewards/length_reward": 0.06830354779958725, "rewards/similarity_reward": 0.5829971432685852, "step": 100 }, { "completion_length": 307.1473388671875, "epoch": 0.2693333333333333, "grad_norm": 0.837570841896231, "kl": 0.0054931640625, "learning_rate": 2e-06, "loss": 0.0194, "reward": 0.6764991283416748, "reward_std": 0.1323472112417221, "rewards/length_reward": 0.07410712540149689, "rewards/similarity_reward": 0.6023918986320496, "step": 101 }, { "completion_length": 265.5044860839844, "epoch": 0.272, "grad_norm": 0.7713890189466205, "kl": 0.00830078125, "learning_rate": 2e-06, "loss": 0.0212, "reward": 0.6779460310935974, "reward_std": 0.12496771663427353, "rewards/length_reward": 0.07276783138513565, "rewards/similarity_reward": 0.6051782369613647, "step": 102 }, { "completion_length": 280.46429443359375, "epoch": 0.27466666666666667, "grad_norm": 0.8825358886169125, "kl": 0.008056640625, "learning_rate": 2e-06, "loss": 0.1086, "reward": 0.5848525166511536, "reward_std": 0.11382713168859482, "rewards/length_reward": 0.06071426719427109, "rewards/similarity_reward": 0.5241381525993347, "step": 103 }, { "completion_length": 320.3571472167969, "epoch": 0.2773333333333333, "grad_norm": 0.8471202564701443, "kl": 0.0069580078125, "learning_rate": 2e-06, "loss": 0.0388, "reward": 0.6058804988861084, "reward_std": 0.15757833421230316, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5407018661499023, "step": 104 }, { "completion_length": 287.0848388671875, "epoch": 0.28, "grad_norm": 0.885756988877436, "kl": 0.00860595703125, "learning_rate": 2e-06, "loss": 0.089, "reward": 0.6150888204574585, "reward_std": 0.1344638168811798, "rewards/length_reward": 0.057589273899793625, "rewards/similarity_reward": 0.5574995875358582, "step": 105 }, { "completion_length": 293.1339416503906, "epoch": 0.2826666666666667, "grad_norm": 0.9299759085944364, "kl": 0.01336669921875, "learning_rate": 2e-06, "loss": 0.0428, "reward": 0.560218334197998, "reward_std": 0.2031860500574112, "rewards/length_reward": 0.058035701513290405, "rewards/similarity_reward": 0.50218266248703, "step": 106 }, { "completion_length": 292.75, "epoch": 0.2853333333333333, "grad_norm": 0.8374882655316597, "kl": 0.00848388671875, "learning_rate": 2e-06, "loss": 0.0683, "reward": 0.6374148726463318, "reward_std": 0.15000107884407043, "rewards/length_reward": 0.06919640302658081, "rewards/similarity_reward": 0.5682184100151062, "step": 107 }, { "completion_length": 263.3571472167969, "epoch": 0.288, "grad_norm": 1.0433586800088648, "kl": 0.0078125, "learning_rate": 2e-06, "loss": 0.0913, "reward": 0.5456939935684204, "reward_std": 0.1411367952823639, "rewards/length_reward": 0.056696414947509766, "rewards/similarity_reward": 0.48899757862091064, "step": 108 }, { "completion_length": 263.9107360839844, "epoch": 0.2906666666666667, "grad_norm": 0.9650468316923807, "kl": 0.01129150390625, "learning_rate": 2e-06, "loss": 0.119, "reward": 0.6117041110992432, "reward_std": 0.13907021284103394, "rewards/length_reward": 0.06205355376005173, "rewards/similarity_reward": 0.5496505498886108, "step": 109 }, { "completion_length": 235.0848388671875, "epoch": 0.29333333333333333, "grad_norm": 0.9205848620805003, "kl": 0.009521484375, "learning_rate": 2e-06, "loss": 0.006, "reward": 0.5724084377288818, "reward_std": 0.12264589220285416, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5027655959129333, "step": 110 }, { "completion_length": 280.9419860839844, "epoch": 0.296, "grad_norm": 0.8242814043162366, "kl": 0.00836181640625, "learning_rate": 2e-06, "loss": 0.1358, "reward": 0.6025325059890747, "reward_std": 0.13276302814483643, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5351218581199646, "step": 111 }, { "completion_length": 261.55804443359375, "epoch": 0.2986666666666667, "grad_norm": 0.8979430693793525, "kl": 0.0145263671875, "learning_rate": 2e-06, "loss": 0.0898, "reward": 0.5723416805267334, "reward_std": 0.11434419453144073, "rewards/length_reward": 0.06428569555282593, "rewards/similarity_reward": 0.5080559253692627, "step": 112 }, { "completion_length": 273.8482360839844, "epoch": 0.30133333333333334, "grad_norm": 0.8994640436743108, "kl": 0.0084228515625, "learning_rate": 2e-06, "loss": 0.0884, "reward": 0.6239952445030212, "reward_std": 0.15253794193267822, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5588168501853943, "step": 113 }, { "completion_length": 265.29913330078125, "epoch": 0.304, "grad_norm": 0.8511084352415984, "kl": 0.015625, "learning_rate": 2e-06, "loss": 0.0217, "reward": 0.5796217322349548, "reward_std": 0.16319997608661652, "rewards/length_reward": 0.06785712391138077, "rewards/similarity_reward": 0.511764645576477, "step": 114 }, { "completion_length": 227.94644165039062, "epoch": 0.30666666666666664, "grad_norm": 1.0504493112285522, "kl": 0.013427734375, "learning_rate": 2e-06, "loss": 0.092, "reward": 0.611219584941864, "reward_std": 0.1473054587841034, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5455944538116455, "step": 115 }, { "completion_length": 312.2008972167969, "epoch": 0.30933333333333335, "grad_norm": 0.8491847097599164, "kl": 0.009033203125, "learning_rate": 2e-06, "loss": 0.0818, "reward": 0.5205245018005371, "reward_std": 0.18279042840003967, "rewards/length_reward": 0.055803555995225906, "rewards/similarity_reward": 0.46472102403640747, "step": 116 }, { "completion_length": 285.4151916503906, "epoch": 0.312, "grad_norm": 0.948136046714223, "kl": 0.01251220703125, "learning_rate": 2e-06, "loss": 0.1402, "reward": 0.5244685411453247, "reward_std": 0.1221655011177063, "rewards/length_reward": 0.058482129126787186, "rewards/similarity_reward": 0.4659864008426666, "step": 117 }, { "completion_length": 291.8973388671875, "epoch": 0.31466666666666665, "grad_norm": 0.8327937599541795, "kl": 0.00823974609375, "learning_rate": 2e-06, "loss": 0.0416, "reward": 0.6440633535385132, "reward_std": 0.14113157987594604, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5770990252494812, "step": 118 }, { "completion_length": 276.3035888671875, "epoch": 0.31733333333333336, "grad_norm": 1.0522569506493296, "kl": 0.01007080078125, "learning_rate": 2e-06, "loss": 0.2508, "reward": 0.501847505569458, "reward_std": 0.16830717027187347, "rewards/length_reward": 0.051785703748464584, "rewards/similarity_reward": 0.4500618278980255, "step": 119 }, { "completion_length": 231.46876525878906, "epoch": 0.32, "grad_norm": 1.0564887037389263, "kl": 0.01544189453125, "learning_rate": 2e-06, "loss": 0.1258, "reward": 0.5212039351463318, "reward_std": 0.1660899519920349, "rewards/length_reward": 0.053124986588954926, "rewards/similarity_reward": 0.46807900071144104, "step": 120 }, { "completion_length": 274.55804443359375, "epoch": 0.32266666666666666, "grad_norm": 0.892927807825851, "kl": 0.006988525390625, "learning_rate": 2e-06, "loss": 0.0887, "reward": 0.5594373941421509, "reward_std": 0.13949331641197205, "rewards/length_reward": 0.06651782989501953, "rewards/similarity_reward": 0.49291953444480896, "step": 121 }, { "completion_length": 309.36163330078125, "epoch": 0.3253333333333333, "grad_norm": 0.8555023561165935, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.1512, "reward": 0.5939301252365112, "reward_std": 0.16705819964408875, "rewards/length_reward": 0.061160698533058167, "rewards/similarity_reward": 0.5327693819999695, "step": 122 }, { "completion_length": 264.3973388671875, "epoch": 0.328, "grad_norm": 0.9083757893001095, "kl": 0.00775146484375, "learning_rate": 2e-06, "loss": 0.1449, "reward": 0.6276513934135437, "reward_std": 0.14763577282428741, "rewards/length_reward": 0.06160712614655495, "rewards/similarity_reward": 0.5660442113876343, "step": 123 }, { "completion_length": 267.96875, "epoch": 0.33066666666666666, "grad_norm": 0.9115344595637944, "kl": 0.01336669921875, "learning_rate": 2e-06, "loss": 0.0998, "reward": 0.6213053464889526, "reward_std": 0.16126255691051483, "rewards/length_reward": 0.05848212540149689, "rewards/similarity_reward": 0.5628232359886169, "step": 124 }, { "completion_length": 275.2276916503906, "epoch": 0.3333333333333333, "grad_norm": 0.8933631069209625, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.0232, "reward": 0.6394702792167664, "reward_std": 0.17729975283145905, "rewards/length_reward": 0.061160698533058167, "rewards/similarity_reward": 0.5783094763755798, "step": 125 }, { "completion_length": 280.0848388671875, "epoch": 0.336, "grad_norm": 0.9959640208447441, "kl": 0.0203857421875, "learning_rate": 2e-06, "loss": 0.066, "reward": 0.5415524840354919, "reward_std": 0.18598264455795288, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.47414183616638184, "step": 126 }, { "completion_length": 294.37054443359375, "epoch": 0.33866666666666667, "grad_norm": 0.944172883238238, "kl": 0.0078125, "learning_rate": 2e-06, "loss": 0.0825, "reward": 0.6250823736190796, "reward_std": 0.1783696711063385, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5581181049346924, "step": 127 }, { "completion_length": 252.44644165039062, "epoch": 0.3413333333333333, "grad_norm": 0.8622018142523461, "kl": 0.01190185546875, "learning_rate": 2e-06, "loss": 0.0136, "reward": 0.5941780209541321, "reward_std": 0.1297590583562851, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5289995074272156, "step": 128 }, { "completion_length": 275.1696472167969, "epoch": 0.344, "grad_norm": 0.997627840820869, "kl": 0.00933837890625, "learning_rate": 2e-06, "loss": 0.1634, "reward": 0.5641010999679565, "reward_std": 0.13691328465938568, "rewards/length_reward": 0.057142842561006546, "rewards/similarity_reward": 0.5069582462310791, "step": 129 }, { "completion_length": 291.78125, "epoch": 0.3466666666666667, "grad_norm": 0.9141566771741596, "kl": 0.00885009765625, "learning_rate": 2e-06, "loss": 0.233, "reward": 0.5903910398483276, "reward_std": 0.14815300703048706, "rewards/length_reward": 0.06249998137354851, "rewards/similarity_reward": 0.5278909802436829, "step": 130 }, { "completion_length": 294.58929443359375, "epoch": 0.34933333333333333, "grad_norm": 0.9307314460988763, "kl": 0.01153564453125, "learning_rate": 2e-06, "loss": 0.0979, "reward": 0.5972681045532227, "reward_std": 0.16272346675395966, "rewards/length_reward": 0.061160698533058167, "rewards/similarity_reward": 0.5361074805259705, "step": 131 }, { "completion_length": 240.2232208251953, "epoch": 0.352, "grad_norm": 0.9959808951952684, "kl": 0.01092529296875, "learning_rate": 2e-06, "loss": 0.0428, "reward": 0.6472880244255066, "reward_std": 0.15316687524318695, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5816629528999329, "step": 132 }, { "completion_length": 237.1607208251953, "epoch": 0.3546666666666667, "grad_norm": 0.8515521500324365, "kl": 0.01251220703125, "learning_rate": 2e-06, "loss": 0.0683, "reward": 0.631507396697998, "reward_std": 0.15118519961833954, "rewards/length_reward": 0.06428569555282593, "rewards/similarity_reward": 0.5672216415405273, "step": 133 }, { "completion_length": 307.5, "epoch": 0.35733333333333334, "grad_norm": 0.719487956498844, "kl": 0.0059814453125, "learning_rate": 2e-06, "loss": 0.0352, "reward": 0.6587818264961243, "reward_std": 0.14100806415081024, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5869067907333374, "step": 134 }, { "completion_length": 305.2589416503906, "epoch": 0.36, "grad_norm": 1.0877799003245066, "kl": 0.0169677734375, "learning_rate": 2e-06, "loss": 0.0955, "reward": 0.5947835445404053, "reward_std": 0.12429028749465942, "rewards/length_reward": 0.06071426719427109, "rewards/similarity_reward": 0.5340692400932312, "step": 135 }, { "completion_length": 223.81251525878906, "epoch": 0.3626666666666667, "grad_norm": 1.056046449389469, "kl": 0.00665283203125, "learning_rate": 2e-06, "loss": 0.0631, "reward": 0.6106573343276978, "reward_std": 0.13982126116752625, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5454786419868469, "step": 136 }, { "completion_length": 302.52679443359375, "epoch": 0.36533333333333334, "grad_norm": 0.8473080240759754, "kl": 0.01116943359375, "learning_rate": 2e-06, "loss": 0.121, "reward": 0.5767890810966492, "reward_std": 0.1566361039876938, "rewards/length_reward": 0.06071426719427109, "rewards/similarity_reward": 0.5160747766494751, "step": 137 }, { "completion_length": 294.9285888671875, "epoch": 0.368, "grad_norm": 0.8165111113975745, "kl": 0.00677490234375, "learning_rate": 2e-06, "loss": -0.0013, "reward": 0.6466237902641296, "reward_std": 0.11831733584403992, "rewards/length_reward": 0.07142855226993561, "rewards/similarity_reward": 0.5751951336860657, "step": 138 }, { "completion_length": 297.4107360839844, "epoch": 0.37066666666666664, "grad_norm": 0.8905760527062927, "kl": 0.00653076171875, "learning_rate": 2e-06, "loss": 0.0894, "reward": 0.6628533601760864, "reward_std": 0.10040118545293808, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5994604229927063, "step": 139 }, { "completion_length": 282.9508972167969, "epoch": 0.37333333333333335, "grad_norm": 0.9489224311946435, "kl": 0.01007080078125, "learning_rate": 2e-06, "loss": 0.0747, "reward": 0.5422684550285339, "reward_std": 0.18701300024986267, "rewards/length_reward": 0.055357132107019424, "rewards/similarity_reward": 0.4869112968444824, "step": 140 }, { "completion_length": 286.9419860839844, "epoch": 0.376, "grad_norm": 1.5325112007084152, "kl": 0.0205078125, "learning_rate": 2e-06, "loss": 0.0749, "reward": 0.6462909579277039, "reward_std": 0.1564369648694992, "rewards/length_reward": 0.06919640302658081, "rewards/similarity_reward": 0.5770944356918335, "step": 141 }, { "completion_length": 233.50001525878906, "epoch": 0.37866666666666665, "grad_norm": 1.1124358172264561, "kl": 0.01336669921875, "learning_rate": 2e-06, "loss": 0.1038, "reward": 0.6390895247459412, "reward_std": 0.11253345012664795, "rewards/length_reward": 0.05982141196727753, "rewards/similarity_reward": 0.5792680978775024, "step": 142 }, { "completion_length": 308.0401916503906, "epoch": 0.38133333333333336, "grad_norm": 0.8437782349764958, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.0439, "reward": 0.6860373616218567, "reward_std": 0.08081385493278503, "rewards/length_reward": 0.07901783287525177, "rewards/similarity_reward": 0.6070196032524109, "step": 143 }, { "completion_length": 274.54913330078125, "epoch": 0.384, "grad_norm": 0.9174096594145076, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.0938, "reward": 0.6485283970832825, "reward_std": 0.1347315013408661, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5739747881889343, "step": 144 }, { "completion_length": 257.5535888671875, "epoch": 0.38666666666666666, "grad_norm": 0.9244068415305253, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.056, "reward": 0.6137626767158508, "reward_std": 0.13303914666175842, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5481376647949219, "step": 145 }, { "completion_length": 255.62501525878906, "epoch": 0.3893333333333333, "grad_norm": 0.8596867360926773, "kl": 0.011474609375, "learning_rate": 2e-06, "loss": 0.0491, "reward": 0.6294366717338562, "reward_std": 0.13696229457855225, "rewards/length_reward": 0.0741071105003357, "rewards/similarity_reward": 0.5553295016288757, "step": 146 }, { "completion_length": 321.3258972167969, "epoch": 0.392, "grad_norm": 0.7611409673177786, "kl": 0.01171875, "learning_rate": 2e-06, "loss": 0.0851, "reward": 0.6082260012626648, "reward_std": 0.11479248106479645, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5385831594467163, "step": 147 }, { "completion_length": 256.5223388671875, "epoch": 0.39466666666666667, "grad_norm": 1.017158083005092, "kl": 0.01165771484375, "learning_rate": 2e-06, "loss": 0.0583, "reward": 0.5373588800430298, "reward_std": 0.1524331271648407, "rewards/length_reward": 0.060267843306064606, "rewards/similarity_reward": 0.477090984582901, "step": 148 }, { "completion_length": 247.34376525878906, "epoch": 0.3973333333333333, "grad_norm": 1.100826516879252, "kl": 0.011474609375, "learning_rate": 2e-06, "loss": 0.1543, "reward": 0.6250883340835571, "reward_std": 0.1562027931213379, "rewards/length_reward": 0.06785711646080017, "rewards/similarity_reward": 0.5572311878204346, "step": 149 }, { "completion_length": 271.61163330078125, "epoch": 0.4, "grad_norm": 0.85368826964619, "kl": 0.00897216796875, "learning_rate": 2e-06, "loss": 0.0525, "reward": 0.5830017924308777, "reward_std": 0.1454438865184784, "rewards/length_reward": 0.06160712614655495, "rewards/similarity_reward": 0.5213946104049683, "step": 150 }, { "completion_length": 274.40179443359375, "epoch": 0.4026666666666667, "grad_norm": 0.9117887687890662, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": -0.0237, "reward": 0.6028919219970703, "reward_std": 0.15602950751781464, "rewards/length_reward": 0.06651782989501953, "rewards/similarity_reward": 0.536374032497406, "step": 151 }, { "completion_length": 269.2008972167969, "epoch": 0.4053333333333333, "grad_norm": 0.8208276830838094, "kl": 0.010009765625, "learning_rate": 2e-06, "loss": 0.014, "reward": 0.665276825428009, "reward_std": 0.1257169246673584, "rewards/length_reward": 0.08035711199045181, "rewards/similarity_reward": 0.5849196910858154, "step": 152 }, { "completion_length": 264.4732360839844, "epoch": 0.408, "grad_norm": 0.9062154210012625, "kl": 0.013427734375, "learning_rate": 2e-06, "loss": 0.0701, "reward": 0.6374659538269043, "reward_std": 0.1712835431098938, "rewards/length_reward": 0.06830354779958725, "rewards/similarity_reward": 0.5691623091697693, "step": 153 }, { "completion_length": 325.1071472167969, "epoch": 0.4106666666666667, "grad_norm": 0.8808738957904011, "kl": 0.0089111328125, "learning_rate": 2e-06, "loss": 0.1136, "reward": 0.6423187255859375, "reward_std": 0.1033661887049675, "rewards/length_reward": 0.06383926421403885, "rewards/similarity_reward": 0.578479528427124, "step": 154 }, { "completion_length": 281.3035888671875, "epoch": 0.41333333333333333, "grad_norm": 0.8449149570191646, "kl": 0.012451171875, "learning_rate": 2e-06, "loss": 0.0893, "reward": 0.6530374884605408, "reward_std": 0.12996266782283783, "rewards/length_reward": 0.06116069480776787, "rewards/similarity_reward": 0.5918766856193542, "step": 155 }, { "completion_length": 297.02679443359375, "epoch": 0.416, "grad_norm": 0.8274002453741087, "kl": 0.008056640625, "learning_rate": 2e-06, "loss": 0.0593, "reward": 0.7200801372528076, "reward_std": 0.12102329730987549, "rewards/length_reward": 0.07901783287525177, "rewards/similarity_reward": 0.6410622596740723, "step": 156 }, { "completion_length": 245.7991180419922, "epoch": 0.4186666666666667, "grad_norm": 1.0463728826517769, "kl": 0.0145263671875, "learning_rate": 2e-06, "loss": 0.1192, "reward": 0.6804168820381165, "reward_std": 0.1330643892288208, "rewards/length_reward": 0.0741071105003357, "rewards/similarity_reward": 0.6063097715377808, "step": 157 }, { "completion_length": 272.2410888671875, "epoch": 0.42133333333333334, "grad_norm": 0.8424445256337731, "kl": 0.0152587890625, "learning_rate": 2e-06, "loss": 0.0411, "reward": 0.6152999401092529, "reward_std": 0.18344512581825256, "rewards/length_reward": 0.06428569555282593, "rewards/similarity_reward": 0.5510141849517822, "step": 158 }, { "completion_length": 283.90625, "epoch": 0.424, "grad_norm": 0.944378171141832, "kl": 0.0128173828125, "learning_rate": 2e-06, "loss": 0.0973, "reward": 0.6098131537437439, "reward_std": 0.14866778254508972, "rewards/length_reward": 0.05982141196727753, "rewards/similarity_reward": 0.5499916672706604, "step": 159 }, { "completion_length": 251.37054443359375, "epoch": 0.4266666666666667, "grad_norm": 0.867614538281579, "kl": 0.01190185546875, "learning_rate": 2e-06, "loss": 0.0069, "reward": 0.6304399371147156, "reward_std": 0.12713229656219482, "rewards/length_reward": 0.07276783138513565, "rewards/similarity_reward": 0.5576720237731934, "step": 160 }, { "completion_length": 254.6607208251953, "epoch": 0.42933333333333334, "grad_norm": 1.0006767726840313, "kl": 0.01226806640625, "learning_rate": 2e-06, "loss": 0.0167, "reward": 0.6183627843856812, "reward_std": 0.12064019590616226, "rewards/length_reward": 0.057589273899793625, "rewards/similarity_reward": 0.560773491859436, "step": 161 }, { "completion_length": 278.9821472167969, "epoch": 0.432, "grad_norm": 0.7754115998151179, "kl": 0.0108642578125, "learning_rate": 2e-06, "loss": 0.0624, "reward": 0.6279152035713196, "reward_std": 0.1739441603422165, "rewards/length_reward": 0.06651783734560013, "rewards/similarity_reward": 0.5613973140716553, "step": 162 }, { "completion_length": 261.51788330078125, "epoch": 0.43466666666666665, "grad_norm": 0.9381033539462706, "kl": 0.011962890625, "learning_rate": 2e-06, "loss": 0.1291, "reward": 0.6449581384658813, "reward_std": 0.13920167088508606, "rewards/length_reward": 0.07589282840490341, "rewards/similarity_reward": 0.569065272808075, "step": 163 }, { "completion_length": 311.8169860839844, "epoch": 0.43733333333333335, "grad_norm": 0.959860639301872, "kl": 0.0084228515625, "learning_rate": 2e-06, "loss": 0.0207, "reward": 0.6583375930786133, "reward_std": 0.1428201049566269, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5882483720779419, "step": 164 }, { "completion_length": 261.5758972167969, "epoch": 0.44, "grad_norm": 0.8574273981386299, "kl": 0.009033203125, "learning_rate": 2e-06, "loss": 0.1177, "reward": 0.6945616006851196, "reward_std": 0.12529133260250092, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.6164366006851196, "step": 165 }, { "completion_length": 281.45538330078125, "epoch": 0.44266666666666665, "grad_norm": 0.8588941157426009, "kl": 0.01220703125, "learning_rate": 2e-06, "loss": 0.0615, "reward": 0.7305233478546143, "reward_std": 0.12011624127626419, "rewards/length_reward": 0.0808035358786583, "rewards/similarity_reward": 0.6497198343276978, "step": 166 }, { "completion_length": 259.64288330078125, "epoch": 0.44533333333333336, "grad_norm": 1.137909715907424, "kl": 0.01031494140625, "learning_rate": 2e-06, "loss": 0.2205, "reward": 0.5699202418327332, "reward_std": 0.1761079728603363, "rewards/length_reward": 0.06294640898704529, "rewards/similarity_reward": 0.5069737434387207, "step": 167 }, { "completion_length": 294.875, "epoch": 0.448, "grad_norm": 1.0016080727138688, "kl": 0.01171875, "learning_rate": 2e-06, "loss": 0.1368, "reward": 0.6165792942047119, "reward_std": 0.12841306626796722, "rewards/length_reward": 0.064732126891613, "rewards/similarity_reward": 0.5518471002578735, "step": 168 }, { "completion_length": 308.37054443359375, "epoch": 0.45066666666666666, "grad_norm": 1.2557098703938632, "kl": 0.0162353515625, "learning_rate": 2e-06, "loss": 0.0777, "reward": 0.6842705607414246, "reward_std": 0.09632124751806259, "rewards/length_reward": 0.08124997466802597, "rewards/similarity_reward": 0.6030204892158508, "step": 169 }, { "completion_length": 281.4821472167969, "epoch": 0.4533333333333333, "grad_norm": 0.8773655821391068, "kl": 0.009033203125, "learning_rate": 2e-06, "loss": 0.0602, "reward": 0.6346014738082886, "reward_std": 0.14319205284118652, "rewards/length_reward": 0.06116069480776787, "rewards/similarity_reward": 0.5734407901763916, "step": 170 }, { "completion_length": 281.8125, "epoch": 0.456, "grad_norm": 0.9145529663215465, "kl": 0.01275634765625, "learning_rate": 2e-06, "loss": 0.1688, "reward": 0.5963006019592285, "reward_std": 0.15331213176250458, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5293362736701965, "step": 171 }, { "completion_length": 280.8571472167969, "epoch": 0.45866666666666667, "grad_norm": 3.878703302716922, "kl": 0.0269775390625, "learning_rate": 2e-06, "loss": 0.0219, "reward": 0.6730906963348389, "reward_std": 0.11424030363559723, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.604340672492981, "step": 172 }, { "completion_length": 277.9910888671875, "epoch": 0.4613333333333333, "grad_norm": 1.0003205251640386, "kl": 0.01141357421875, "learning_rate": 2e-06, "loss": 0.1181, "reward": 0.6080780029296875, "reward_std": 0.14715011417865753, "rewards/length_reward": 0.06651782989501953, "rewards/similarity_reward": 0.541560173034668, "step": 173 }, { "completion_length": 257.0133972167969, "epoch": 0.464, "grad_norm": 0.8626427313272481, "kl": 0.00921630859375, "learning_rate": 2e-06, "loss": 0.0332, "reward": 0.6258962154388428, "reward_std": 0.13628825545310974, "rewards/length_reward": 0.07098211348056793, "rewards/similarity_reward": 0.554914116859436, "step": 174 }, { "completion_length": 267.1294860839844, "epoch": 0.4666666666666667, "grad_norm": 0.9448046232693003, "kl": 0.0128173828125, "learning_rate": 2e-06, "loss": 0.0686, "reward": 0.582805871963501, "reward_std": 0.13543623685836792, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5082523226737976, "step": 175 }, { "completion_length": 230.3169708251953, "epoch": 0.4693333333333333, "grad_norm": 1.0336066582105279, "kl": 0.01611328125, "learning_rate": 2e-06, "loss": 0.0188, "reward": 0.5923266410827637, "reward_std": 0.15992802381515503, "rewards/length_reward": 0.06607140600681305, "rewards/similarity_reward": 0.5262552499771118, "step": 176 }, { "completion_length": 259.2857360839844, "epoch": 0.472, "grad_norm": 0.8515404437990851, "kl": 0.01422119140625, "learning_rate": 2e-06, "loss": -0.0031, "reward": 0.6366464495658875, "reward_std": 0.14244325459003448, "rewards/length_reward": 0.06785711646080017, "rewards/similarity_reward": 0.5687893033027649, "step": 177 }, { "completion_length": 269.0535888671875, "epoch": 0.4746666666666667, "grad_norm": 1.0508009846238586, "kl": 0.01202392578125, "learning_rate": 2e-06, "loss": 0.1414, "reward": 0.6338518857955933, "reward_std": 0.13359463214874268, "rewards/length_reward": 0.06651783734560013, "rewards/similarity_reward": 0.5673341155052185, "step": 178 }, { "completion_length": 254.2053680419922, "epoch": 0.47733333333333333, "grad_norm": 1.0742338846656552, "kl": 0.00799560546875, "learning_rate": 2e-06, "loss": 0.0226, "reward": 0.6352322697639465, "reward_std": 0.16355818510055542, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5633572936058044, "step": 179 }, { "completion_length": 258.8482360839844, "epoch": 0.48, "grad_norm": 1.1021168870169997, "kl": 0.0213623046875, "learning_rate": 2e-06, "loss": 0.0656, "reward": 0.594104528427124, "reward_std": 0.1770821362733841, "rewards/length_reward": 0.060714274644851685, "rewards/similarity_reward": 0.5333902835845947, "step": 180 }, { "completion_length": 256.8125, "epoch": 0.4826666666666667, "grad_norm": 0.8501219854036921, "kl": 0.009033203125, "learning_rate": 2e-06, "loss": -0.0019, "reward": 0.6487245559692383, "reward_std": 0.13405689597129822, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5741709470748901, "step": 181 }, { "completion_length": 256.02679443359375, "epoch": 0.48533333333333334, "grad_norm": 1.0385629776489995, "kl": 0.012939453125, "learning_rate": 2e-06, "loss": 0.0721, "reward": 0.6005666255950928, "reward_std": 0.18559977412223816, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5331559181213379, "step": 182 }, { "completion_length": 245.2723388671875, "epoch": 0.488, "grad_norm": 0.9856929099072189, "kl": 0.0140380859375, "learning_rate": 2e-06, "loss": 0.0559, "reward": 0.6080025434494019, "reward_std": 0.12059400230646133, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5428239703178406, "step": 183 }, { "completion_length": 275.40179443359375, "epoch": 0.49066666666666664, "grad_norm": 1.0764117312018395, "kl": 0.0147705078125, "learning_rate": 2e-06, "loss": 0.1746, "reward": 0.5906988382339478, "reward_std": 0.13717274367809296, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5273059606552124, "step": 184 }, { "completion_length": 235.33929443359375, "epoch": 0.49333333333333335, "grad_norm": 1.216465109274426, "kl": 0.015869140625, "learning_rate": 2e-06, "loss": 0.0441, "reward": 0.6832688450813293, "reward_std": 0.12071473151445389, "rewards/length_reward": 0.07767853885889053, "rewards/similarity_reward": 0.6055901646614075, "step": 185 }, { "completion_length": 271.0401916503906, "epoch": 0.496, "grad_norm": 0.9417708264398014, "kl": 0.0113525390625, "learning_rate": 2e-06, "loss": 0.0103, "reward": 0.7055503726005554, "reward_std": 0.09871623665094376, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.6274253129959106, "step": 186 }, { "completion_length": 288.40179443359375, "epoch": 0.49866666666666665, "grad_norm": 0.7904664413572577, "kl": 0.0113525390625, "learning_rate": 2e-06, "loss": 0.0943, "reward": 0.6588479280471802, "reward_std": 0.1498415172100067, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5883120894432068, "step": 187 }, { "completion_length": 262.2857360839844, "epoch": 0.5013333333333333, "grad_norm": 1.0107893927701763, "kl": 0.0113525390625, "learning_rate": 2e-06, "loss": 0.0639, "reward": 0.5783969759941101, "reward_std": 0.1660866141319275, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5109862685203552, "step": 188 }, { "completion_length": 243.23214721679688, "epoch": 0.504, "grad_norm": 0.9914068826603122, "kl": 0.0263671875, "learning_rate": 2e-06, "loss": 0.0183, "reward": 0.5762468576431274, "reward_std": 0.1855197250843048, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5088360905647278, "step": 189 }, { "completion_length": 279.8973388671875, "epoch": 0.5066666666666667, "grad_norm": 0.8186202175206256, "kl": 0.01031494140625, "learning_rate": 2e-06, "loss": 0.0181, "reward": 0.6954742074012756, "reward_std": 0.08623984456062317, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.624938428401947, "step": 190 }, { "completion_length": 257.21429443359375, "epoch": 0.5093333333333333, "grad_norm": 0.877876828642467, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": 0.0328, "reward": 0.6462003588676453, "reward_std": 0.11538383364677429, "rewards/length_reward": 0.06830354779958725, "rewards/similarity_reward": 0.577896773815155, "step": 191 }, { "completion_length": 257.7946472167969, "epoch": 0.512, "grad_norm": 0.8857490639900779, "kl": 0.01214599609375, "learning_rate": 2e-06, "loss": 0.0477, "reward": 0.6250221133232117, "reward_std": 0.15633754432201385, "rewards/length_reward": 0.06830354779958725, "rewards/similarity_reward": 0.5567185282707214, "step": 192 }, { "completion_length": 303.4464416503906, "epoch": 0.5146666666666667, "grad_norm": 0.8375649728004798, "kl": 0.00897216796875, "learning_rate": 2e-06, "loss": 0.0444, "reward": 0.6938925981521606, "reward_std": 0.13664484024047852, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.6184461116790771, "step": 193 }, { "completion_length": 239.02679443359375, "epoch": 0.5173333333333333, "grad_norm": 0.9796244769392795, "kl": 0.0169677734375, "learning_rate": 2e-06, "loss": 0.0277, "reward": 0.6308580636978149, "reward_std": 0.09844722598791122, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.5554116368293762, "step": 194 }, { "completion_length": 220.04019165039062, "epoch": 0.52, "grad_norm": 1.1138163852092198, "kl": 0.0211181640625, "learning_rate": 2e-06, "loss": 0.101, "reward": 0.5808507204055786, "reward_std": 0.14026090502738953, "rewards/length_reward": 0.056696418672800064, "rewards/similarity_reward": 0.5241542458534241, "step": 195 }, { "completion_length": 310.6294860839844, "epoch": 0.5226666666666666, "grad_norm": 0.7596161810526226, "kl": 0.012451171875, "learning_rate": 2e-06, "loss": 0.0671, "reward": 0.6398200988769531, "reward_std": 0.16089944541454315, "rewards/length_reward": 0.07633925974369049, "rewards/similarity_reward": 0.5634807348251343, "step": 196 }, { "completion_length": 293.65179443359375, "epoch": 0.5253333333333333, "grad_norm": 1.3057397068251875, "kl": 0.0137939453125, "learning_rate": 2e-06, "loss": 0.0229, "reward": 0.69722580909729, "reward_std": 0.10665407031774521, "rewards/length_reward": 0.07901783287525177, "rewards/similarity_reward": 0.6182078719139099, "step": 197 }, { "completion_length": 271.8794860839844, "epoch": 0.528, "grad_norm": 0.9454287770215252, "kl": 0.01190185546875, "learning_rate": 2e-06, "loss": 0.0697, "reward": 0.6203178763389587, "reward_std": 0.1754215508699417, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5569249987602234, "step": 198 }, { "completion_length": 259.3571472167969, "epoch": 0.5306666666666666, "grad_norm": 0.8381899247069013, "kl": 0.011474609375, "learning_rate": 2e-06, "loss": 0.0283, "reward": 0.6383811831474304, "reward_std": 0.11189709603786469, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.5629347562789917, "step": 199 }, { "completion_length": 258.0401916503906, "epoch": 0.5333333333333333, "grad_norm": 1.0128439289515407, "kl": 0.012451171875, "learning_rate": 2e-06, "loss": 0.1465, "reward": 0.5912656784057617, "reward_std": 0.14550404250621796, "rewards/length_reward": 0.058482129126787186, "rewards/similarity_reward": 0.5327835083007812, "step": 200 }, { "completion_length": 311.27679443359375, "epoch": 0.536, "grad_norm": 0.86779018830801, "kl": 0.00909423828125, "learning_rate": 2e-06, "loss": 0.0985, "reward": 0.588790237903595, "reward_std": 0.140910342335701, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5169152021408081, "step": 201 }, { "completion_length": 223.9598388671875, "epoch": 0.5386666666666666, "grad_norm": 0.9896401672605407, "kl": 0.01263427734375, "learning_rate": 2e-06, "loss": 0.0605, "reward": 0.5413497686386108, "reward_std": 0.13121715188026428, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.4761711657047272, "step": 202 }, { "completion_length": 245.66964721679688, "epoch": 0.5413333333333333, "grad_norm": 0.8825950131925253, "kl": 0.01300048828125, "learning_rate": 2e-06, "loss": 0.0168, "reward": 0.632973849773407, "reward_std": 0.15790660679340363, "rewards/length_reward": 0.07678568363189697, "rewards/similarity_reward": 0.5561880469322205, "step": 203 }, { "completion_length": 214.50894165039062, "epoch": 0.544, "grad_norm": 0.9691668766051184, "kl": 0.0133056640625, "learning_rate": 2e-06, "loss": 0.0135, "reward": 0.6077868938446045, "reward_std": 0.12028573453426361, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5381439328193665, "step": 204 }, { "completion_length": 292.73663330078125, "epoch": 0.5466666666666666, "grad_norm": 0.7967760450327859, "kl": 0.00982666015625, "learning_rate": 2e-06, "loss": 0.0266, "reward": 0.6232799887657166, "reward_std": 0.10140591114759445, "rewards/length_reward": 0.07410712540149689, "rewards/similarity_reward": 0.5491728186607361, "step": 205 }, { "completion_length": 221.25001525878906, "epoch": 0.5493333333333333, "grad_norm": 1.1638296703356164, "kl": 0.01385498046875, "learning_rate": 2e-06, "loss": 0.1653, "reward": 0.5745998620986938, "reward_std": 0.1350637972354889, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5071890950202942, "step": 206 }, { "completion_length": 265.58929443359375, "epoch": 0.552, "grad_norm": 0.845270302637572, "kl": 0.01031494140625, "learning_rate": 2e-06, "loss": 0.0111, "reward": 0.6836676001548767, "reward_std": 0.10602893680334091, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.6131318211555481, "step": 207 }, { "completion_length": 295.58929443359375, "epoch": 0.5546666666666666, "grad_norm": 0.9323286808595849, "kl": 0.00921630859375, "learning_rate": 2e-06, "loss": 0.0749, "reward": 0.5889706611633301, "reward_std": 0.1236046850681305, "rewards/length_reward": 0.06517855823040009, "rewards/similarity_reward": 0.5237920880317688, "step": 208 }, { "completion_length": 293.58038330078125, "epoch": 0.5573333333333333, "grad_norm": 0.9233381319586115, "kl": 0.01055908203125, "learning_rate": 2e-06, "loss": 0.2, "reward": 0.6363462805747986, "reward_std": 0.12041884660720825, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.5675963759422302, "step": 209 }, { "completion_length": 266.3660888671875, "epoch": 0.56, "grad_norm": 0.9921663239986533, "kl": 0.01214599609375, "learning_rate": 2e-06, "loss": 0.0664, "reward": 0.5890473127365112, "reward_std": 0.14096976816654205, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5256544351577759, "step": 210 }, { "completion_length": 257.46875, "epoch": 0.5626666666666666, "grad_norm": 0.9774355514560761, "kl": 0.01080322265625, "learning_rate": 2e-06, "loss": 0.0824, "reward": 0.5761434435844421, "reward_std": 0.18852439522743225, "rewards/length_reward": 0.061160698533058167, "rewards/similarity_reward": 0.5149827003479004, "step": 211 }, { "completion_length": 214.66964721679688, "epoch": 0.5653333333333334, "grad_norm": 1.0333338701683394, "kl": 0.00994873046875, "learning_rate": 2e-06, "loss": 0.1128, "reward": 0.6288223266601562, "reward_std": 0.1170286163687706, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5591794848442078, "step": 212 }, { "completion_length": 284.0089416503906, "epoch": 0.568, "grad_norm": 0.9552749101564338, "kl": 0.01422119140625, "learning_rate": 2e-06, "loss": 0.1971, "reward": 0.5316947102546692, "reward_std": 0.14774499833583832, "rewards/length_reward": 0.06607140600681305, "rewards/similarity_reward": 0.46562325954437256, "step": 213 }, { "completion_length": 264.84375, "epoch": 0.5706666666666667, "grad_norm": 0.9328609206359839, "kl": 0.0128173828125, "learning_rate": 2e-06, "loss": 0.0611, "reward": 0.61468505859375, "reward_std": 0.0943475142121315, "rewards/length_reward": 0.08214282244443893, "rewards/similarity_reward": 0.5325421690940857, "step": 214 }, { "completion_length": 288.65625, "epoch": 0.5733333333333334, "grad_norm": 0.9888552258575887, "kl": 0.0147705078125, "learning_rate": 2e-06, "loss": 0.1398, "reward": 0.5841876864433289, "reward_std": 0.10097295790910721, "rewards/length_reward": 0.06294640898704529, "rewards/similarity_reward": 0.5212411880493164, "step": 215 }, { "completion_length": 270.28125, "epoch": 0.576, "grad_norm": 0.8803342156226522, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": 0.0359, "reward": 0.6344039440155029, "reward_std": 0.17091530561447144, "rewards/length_reward": 0.06607141345739365, "rewards/similarity_reward": 0.5683325529098511, "step": 216 }, { "completion_length": 255.12501525878906, "epoch": 0.5786666666666667, "grad_norm": 0.8979196392383272, "kl": 0.027099609375, "learning_rate": 2e-06, "loss": -0.0078, "reward": 0.661945641040802, "reward_std": 0.1631477326154709, "rewards/length_reward": 0.07767854630947113, "rewards/similarity_reward": 0.5842669606208801, "step": 217 }, { "completion_length": 263.58038330078125, "epoch": 0.5813333333333334, "grad_norm": 1.180022786404114, "kl": 0.0191650390625, "learning_rate": 2e-06, "loss": 0.1111, "reward": 0.5408477187156677, "reward_std": 0.11287137866020203, "rewards/length_reward": 0.060267843306064606, "rewards/similarity_reward": 0.4805798828601837, "step": 218 }, { "completion_length": 293.24554443359375, "epoch": 0.584, "grad_norm": 1.082130352994329, "kl": 0.0113525390625, "learning_rate": 2e-06, "loss": 0.166, "reward": 0.6229541301727295, "reward_std": 0.18498755991458893, "rewards/length_reward": 0.06294640898704529, "rewards/similarity_reward": 0.5600076913833618, "step": 219 }, { "completion_length": 255.9553680419922, "epoch": 0.5866666666666667, "grad_norm": 0.8518142779942337, "kl": 0.0216064453125, "learning_rate": 2e-06, "loss": -0.0418, "reward": 0.6351791620254517, "reward_std": 0.1440075933933258, "rewards/length_reward": 0.06919640302658081, "rewards/similarity_reward": 0.5659827589988708, "step": 220 }, { "completion_length": 268.2589416503906, "epoch": 0.5893333333333334, "grad_norm": 0.8005051959777295, "kl": 0.00799560546875, "learning_rate": 2e-06, "loss": 0.029, "reward": 0.6807352304458618, "reward_std": 0.11082387715578079, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.6061817407608032, "step": 221 }, { "completion_length": 267.5089416503906, "epoch": 0.592, "grad_norm": 0.8727360410582777, "kl": 0.00927734375, "learning_rate": 2e-06, "loss": 0.0333, "reward": 0.6831346154212952, "reward_std": 0.09732881933450699, "rewards/length_reward": 0.07633925974369049, "rewards/similarity_reward": 0.6067953109741211, "step": 222 }, { "completion_length": 271.2008972167969, "epoch": 0.5946666666666667, "grad_norm": 0.7543972270797626, "kl": 0.00921630859375, "learning_rate": 2e-06, "loss": 0.0251, "reward": 0.7254729866981506, "reward_std": 0.13280263543128967, "rewards/length_reward": 0.07723211497068405, "rewards/similarity_reward": 0.6482407450675964, "step": 223 }, { "completion_length": 276.33038330078125, "epoch": 0.5973333333333334, "grad_norm": 0.8477057601765857, "kl": 0.01470947265625, "learning_rate": 2e-06, "loss": 0.093, "reward": 0.6127163171768188, "reward_std": 0.14752325415611267, "rewards/length_reward": 0.06160712614655495, "rewards/similarity_reward": 0.5511091351509094, "step": 224 }, { "completion_length": 313.02679443359375, "epoch": 0.6, "grad_norm": 0.9444843501933834, "kl": 0.01953125, "learning_rate": 2e-06, "loss": 0.1058, "reward": 0.6097243428230286, "reward_std": 0.1704142987728119, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5427600741386414, "step": 225 }, { "completion_length": 288.37054443359375, "epoch": 0.6026666666666667, "grad_norm": 0.8579103953599808, "kl": 0.01373291015625, "learning_rate": 2e-06, "loss": 0.1122, "reward": 0.6366080641746521, "reward_std": 0.12147609889507294, "rewards/length_reward": 0.07232140004634857, "rewards/similarity_reward": 0.5642866492271423, "step": 226 }, { "completion_length": 310.1607360839844, "epoch": 0.6053333333333333, "grad_norm": 0.759533963504491, "kl": 0.01312255859375, "learning_rate": 2e-06, "loss": 0.0699, "reward": 0.6739456057548523, "reward_std": 0.10951042920351028, "rewards/length_reward": 0.07276783138513565, "rewards/similarity_reward": 0.6011778116226196, "step": 227 }, { "completion_length": 239.0848388671875, "epoch": 0.608, "grad_norm": 0.9025930213219101, "kl": 0.01104736328125, "learning_rate": 2e-06, "loss": 0.0215, "reward": 0.6142429709434509, "reward_std": 0.08829416334629059, "rewards/length_reward": 0.07633925974369049, "rewards/similarity_reward": 0.5379037261009216, "step": 228 }, { "completion_length": 278.3125, "epoch": 0.6106666666666667, "grad_norm": 0.886301576249163, "kl": 0.01104736328125, "learning_rate": 2e-06, "loss": 0.0334, "reward": 0.7060741186141968, "reward_std": 0.11311851441860199, "rewards/length_reward": 0.07723211497068405, "rewards/similarity_reward": 0.6288419961929321, "step": 229 }, { "completion_length": 251.18751525878906, "epoch": 0.6133333333333333, "grad_norm": 2.476356671041086, "kl": 0.0244140625, "learning_rate": 2e-06, "loss": 0.0096, "reward": 0.5848217010498047, "reward_std": 0.11717528849840164, "rewards/length_reward": 0.061160698533058167, "rewards/similarity_reward": 0.5236610770225525, "step": 230 }, { "completion_length": 236.29019165039062, "epoch": 0.616, "grad_norm": 0.9163834681525471, "kl": 0.0108642578125, "learning_rate": 2e-06, "loss": 0.0201, "reward": 0.6646043658256531, "reward_std": 0.11276809126138687, "rewards/length_reward": 0.07991068810224533, "rewards/similarity_reward": 0.5846936702728271, "step": 231 }, { "completion_length": 317.0089416503906, "epoch": 0.6186666666666667, "grad_norm": 0.8636877609886525, "kl": 0.013427734375, "learning_rate": 2e-06, "loss": 0.036, "reward": 0.5721753835678101, "reward_std": 0.16120396554470062, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.506996750831604, "step": 232 }, { "completion_length": 307.7008972167969, "epoch": 0.6213333333333333, "grad_norm": 1.3729033520790577, "kl": 0.0166015625, "learning_rate": 2e-06, "loss": 0.0383, "reward": 0.6260521411895752, "reward_std": 0.11853621900081635, "rewards/length_reward": 0.07767853885889053, "rewards/similarity_reward": 0.5483735799789429, "step": 233 }, { "completion_length": 283.5714416503906, "epoch": 0.624, "grad_norm": 0.8530397710423918, "kl": 0.016357421875, "learning_rate": 2e-06, "loss": 0.0206, "reward": 0.6967118382453918, "reward_std": 0.1562497317790985, "rewards/length_reward": 0.07857140153646469, "rewards/similarity_reward": 0.618140459060669, "step": 234 }, { "completion_length": 272.6160888671875, "epoch": 0.6266666666666667, "grad_norm": 0.8799568084373302, "kl": 0.01153564453125, "learning_rate": 2e-06, "loss": 0.0181, "reward": 0.6851814985275269, "reward_std": 0.10234292596578598, "rewards/length_reward": 0.07142854481935501, "rewards/similarity_reward": 0.6137529611587524, "step": 235 }, { "completion_length": 316.4776916503906, "epoch": 0.6293333333333333, "grad_norm": 0.8224273598135922, "kl": 0.01416015625, "learning_rate": 2e-06, "loss": 0.1293, "reward": 0.5818712115287781, "reward_std": 0.1467462033033371, "rewards/length_reward": 0.059374988079071045, "rewards/similarity_reward": 0.5224961638450623, "step": 236 }, { "completion_length": 276.1651916503906, "epoch": 0.632, "grad_norm": 0.8214244182848573, "kl": 0.01373291015625, "learning_rate": 2e-06, "loss": 0.0324, "reward": 0.6201799511909485, "reward_std": 0.14638349413871765, "rewards/length_reward": 0.07098212093114853, "rewards/similarity_reward": 0.5491978526115417, "step": 237 }, { "completion_length": 314.15179443359375, "epoch": 0.6346666666666667, "grad_norm": 0.8751337279602847, "kl": 0.011474609375, "learning_rate": 2e-06, "loss": 0.1279, "reward": 0.6131877899169922, "reward_std": 0.15269529819488525, "rewards/length_reward": 0.06383927166461945, "rewards/similarity_reward": 0.5493485331535339, "step": 238 }, { "completion_length": 241.5982208251953, "epoch": 0.6373333333333333, "grad_norm": 0.9065686563133485, "kl": 0.01300048828125, "learning_rate": 2e-06, "loss": 0.053, "reward": 0.6322412490844727, "reward_std": 0.13913773000240326, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5648305416107178, "step": 239 }, { "completion_length": 282.3973388671875, "epoch": 0.64, "grad_norm": 0.8759378979761268, "kl": 0.0130615234375, "learning_rate": 2e-06, "loss": 0.0351, "reward": 0.6563798785209656, "reward_std": 0.11333189904689789, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5858440399169922, "step": 240 }, { "completion_length": 258.2946472167969, "epoch": 0.6426666666666667, "grad_norm": 0.8083399774508907, "kl": 0.0115966796875, "learning_rate": 2e-06, "loss": -0.0089, "reward": 0.7068819403648376, "reward_std": 0.10833070427179337, "rewards/length_reward": 0.08392854034900665, "rewards/similarity_reward": 0.6229532957077026, "step": 241 }, { "completion_length": 226.43751525878906, "epoch": 0.6453333333333333, "grad_norm": 1.113129598732782, "kl": 0.01708984375, "learning_rate": 2e-06, "loss": 0.0985, "reward": 0.47489413619041443, "reward_std": 0.14593513309955597, "rewards/length_reward": 0.053124986588954926, "rewards/similarity_reward": 0.4217691719532013, "step": 242 }, { "completion_length": 254.5982208251953, "epoch": 0.648, "grad_norm": 1.0574663727866278, "kl": 0.01171875, "learning_rate": 2e-06, "loss": 0.0655, "reward": 0.585192084312439, "reward_std": 0.1674540489912033, "rewards/length_reward": 0.05937498062849045, "rewards/similarity_reward": 0.5258170962333679, "step": 243 }, { "completion_length": 245.9866180419922, "epoch": 0.6506666666666666, "grad_norm": 0.9866678905813414, "kl": 0.01385498046875, "learning_rate": 2e-06, "loss": 0.0992, "reward": 0.6301730871200562, "reward_std": 0.11110112071037292, "rewards/length_reward": 0.07142854481935501, "rewards/similarity_reward": 0.5587445497512817, "step": 244 }, { "completion_length": 276.9419860839844, "epoch": 0.6533333333333333, "grad_norm": 0.8334751875894263, "kl": 0.01226806640625, "learning_rate": 2e-06, "loss": 0.0199, "reward": 0.734417736530304, "reward_std": 0.13278159499168396, "rewards/length_reward": 0.07767854630947113, "rewards/similarity_reward": 0.6567391157150269, "step": 245 }, { "completion_length": 224.4241180419922, "epoch": 0.656, "grad_norm": 1.039042606617133, "kl": 0.0211181640625, "learning_rate": 2e-06, "loss": 0.1146, "reward": 0.6315779089927673, "reward_std": 0.1330062597990036, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5610421895980835, "step": 246 }, { "completion_length": 252.9732208251953, "epoch": 0.6586666666666666, "grad_norm": 0.8538008305966633, "kl": 0.01275634765625, "learning_rate": 2e-06, "loss": 0.0224, "reward": 0.6646360754966736, "reward_std": 0.12239360809326172, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5900824666023254, "step": 247 }, { "completion_length": 218.7366180419922, "epoch": 0.6613333333333333, "grad_norm": 0.963162350651896, "kl": 0.01507568359375, "learning_rate": 2e-06, "loss": 0.0349, "reward": 0.6808683276176453, "reward_std": 0.13527972996234894, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.6089933514595032, "step": 248 }, { "completion_length": 254.5982208251953, "epoch": 0.664, "grad_norm": 0.9385464548992294, "kl": 0.0179443359375, "learning_rate": 2e-06, "loss": 0.0288, "reward": 0.631544828414917, "reward_std": 0.13531894981861115, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.5627948045730591, "step": 249 }, { "completion_length": 291.37054443359375, "epoch": 0.6666666666666666, "grad_norm": 0.9140004673035673, "kl": 0.01708984375, "learning_rate": 2e-06, "loss": 0.0837, "reward": 0.5439311861991882, "reward_std": 0.13576674461364746, "rewards/length_reward": 0.05848212540149689, "rewards/similarity_reward": 0.48544901609420776, "step": 250 }, { "completion_length": 302.15179443359375, "epoch": 0.6693333333333333, "grad_norm": 0.8827607842873424, "kl": 0.0137939453125, "learning_rate": 2e-06, "loss": 0.1511, "reward": 0.521554172039032, "reward_std": 0.13812950253486633, "rewards/length_reward": 0.06249998137354851, "rewards/similarity_reward": 0.459054172039032, "step": 251 }, { "completion_length": 256.67413330078125, "epoch": 0.672, "grad_norm": 0.9030494234496588, "kl": 0.015869140625, "learning_rate": 2e-06, "loss": 0.0098, "reward": 0.6373765468597412, "reward_std": 0.16011908650398254, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5721979141235352, "step": 252 }, { "completion_length": 263.4464416503906, "epoch": 0.6746666666666666, "grad_norm": 0.9917515330394924, "kl": 0.0157470703125, "learning_rate": 2e-06, "loss": 0.1071, "reward": 0.5835117697715759, "reward_std": 0.15427549183368683, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.5214581489562988, "step": 253 }, { "completion_length": 263.1875, "epoch": 0.6773333333333333, "grad_norm": 0.8527537949516601, "kl": 0.01007080078125, "learning_rate": 2e-06, "loss": 0.0704, "reward": 0.659292459487915, "reward_std": 0.11287476867437363, "rewards/length_reward": 0.07723211497068405, "rewards/similarity_reward": 0.5820602774620056, "step": 254 }, { "completion_length": 247.35714721679688, "epoch": 0.68, "grad_norm": 0.9306878660258886, "kl": 0.018798828125, "learning_rate": 2e-06, "loss": 0.1071, "reward": 0.5428202748298645, "reward_std": 0.14576061069965363, "rewards/length_reward": 0.055357132107019424, "rewards/similarity_reward": 0.4874631464481354, "step": 255 }, { "completion_length": 304.77679443359375, "epoch": 0.6826666666666666, "grad_norm": 0.8122651765327112, "kl": 0.01123046875, "learning_rate": 2e-06, "loss": 0.0722, "reward": 0.6534665822982788, "reward_std": 0.17661263048648834, "rewards/length_reward": 0.06651783734560013, "rewards/similarity_reward": 0.5869486927986145, "step": 256 }, { "completion_length": 316.0, "epoch": 0.6853333333333333, "grad_norm": 0.7637865876113388, "kl": 0.01153564453125, "learning_rate": 2e-06, "loss": 0.0805, "reward": 0.6809090375900269, "reward_std": 0.12792253494262695, "rewards/length_reward": 0.07499997317790985, "rewards/similarity_reward": 0.6059090495109558, "step": 257 }, { "completion_length": 267.7410888671875, "epoch": 0.688, "grad_norm": 0.908951786987476, "kl": 0.0123291015625, "learning_rate": 2e-06, "loss": 0.0919, "reward": 0.6498162150382996, "reward_std": 0.12118736654520035, "rewards/length_reward": 0.057589273899793625, "rewards/similarity_reward": 0.5922268629074097, "step": 258 }, { "completion_length": 255.13394165039062, "epoch": 0.6906666666666667, "grad_norm": 0.9868527698980504, "kl": 0.01116943359375, "learning_rate": 2e-06, "loss": 0.0518, "reward": 0.6177918910980225, "reward_std": 0.1076013594865799, "rewards/length_reward": 0.07232140004634857, "rewards/similarity_reward": 0.5454704165458679, "step": 259 }, { "completion_length": 211.96429443359375, "epoch": 0.6933333333333334, "grad_norm": 1.1390879691759828, "kl": 0.01251220703125, "learning_rate": 2e-06, "loss": 0.0376, "reward": 0.6133698225021362, "reward_std": 0.1394728273153305, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5414947867393494, "step": 260 }, { "completion_length": 226.0491180419922, "epoch": 0.696, "grad_norm": 0.9181631341556423, "kl": 0.01458740234375, "learning_rate": 2e-06, "loss": 0.0146, "reward": 0.6710724234580994, "reward_std": 0.12615807354450226, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.6041080951690674, "step": 261 }, { "completion_length": 290.71875, "epoch": 0.6986666666666667, "grad_norm": 0.9242817884998483, "kl": 0.010986328125, "learning_rate": 2e-06, "loss": 0.0492, "reward": 0.758361279964447, "reward_std": 0.0939282700419426, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.680236279964447, "step": 262 }, { "completion_length": 295.2276916503906, "epoch": 0.7013333333333334, "grad_norm": 0.9268240432998979, "kl": 0.01434326171875, "learning_rate": 2e-06, "loss": 0.0777, "reward": 0.5887910723686218, "reward_std": 0.15629605948925018, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5236124992370605, "step": 263 }, { "completion_length": 197.97769165039062, "epoch": 0.704, "grad_norm": 0.9100124092555839, "kl": 0.01190185546875, "learning_rate": 2e-06, "loss": 0.0176, "reward": 0.5667382478713989, "reward_std": 0.1061021164059639, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.4993274211883545, "step": 264 }, { "completion_length": 242.96429443359375, "epoch": 0.7066666666666667, "grad_norm": 0.9638821889698813, "kl": 0.0205078125, "learning_rate": 2e-06, "loss": 0.0458, "reward": 0.6227900981903076, "reward_std": 0.1025083139538765, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5571650862693787, "step": 265 }, { "completion_length": 223.3973388671875, "epoch": 0.7093333333333334, "grad_norm": 1.0581452656511359, "kl": 0.01336669921875, "learning_rate": 2e-06, "loss": 0.0641, "reward": 0.6723743081092834, "reward_std": 0.1159893348813057, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.6071956753730774, "step": 266 }, { "completion_length": 270.2098388671875, "epoch": 0.712, "grad_norm": 0.8609101277095984, "kl": 0.0146484375, "learning_rate": 2e-06, "loss": 0.0419, "reward": 0.7352553009986877, "reward_std": 0.13158555328845978, "rewards/length_reward": 0.07499997317790985, "rewards/similarity_reward": 0.6602552533149719, "step": 267 }, { "completion_length": 286.65179443359375, "epoch": 0.7146666666666667, "grad_norm": 0.85187609520963, "kl": 0.01470947265625, "learning_rate": 2e-06, "loss": 0.0801, "reward": 0.5860309600830078, "reward_std": 0.12361589819192886, "rewards/length_reward": 0.06562498211860657, "rewards/similarity_reward": 0.5204059481620789, "step": 268 }, { "completion_length": 258.5, "epoch": 0.7173333333333334, "grad_norm": 0.8462850933355499, "kl": 0.01080322265625, "learning_rate": 2e-06, "loss": 0.1016, "reward": 0.7648903131484985, "reward_std": 0.08282845467329025, "rewards/length_reward": 0.08348211646080017, "rewards/similarity_reward": 0.6814082264900208, "step": 269 }, { "completion_length": 255.4598388671875, "epoch": 0.72, "grad_norm": 0.8789894818999019, "kl": 0.01507568359375, "learning_rate": 2e-06, "loss": 0.1231, "reward": 0.6465427875518799, "reward_std": 0.12229768186807632, "rewards/length_reward": 0.06607140600681305, "rewards/similarity_reward": 0.5804713368415833, "step": 270 }, { "completion_length": 273.01788330078125, "epoch": 0.7226666666666667, "grad_norm": 0.8695715294946503, "kl": 0.0184326171875, "learning_rate": 2e-06, "loss": 0.087, "reward": 0.6258493661880493, "reward_std": 0.13294367492198944, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5553135871887207, "step": 271 }, { "completion_length": 260.8169860839844, "epoch": 0.7253333333333334, "grad_norm": 1.2217664906535957, "kl": 0.015380859375, "learning_rate": 2e-06, "loss": 0.0431, "reward": 0.6779768466949463, "reward_std": 0.11835993826389313, "rewards/length_reward": 0.07678568363189697, "rewards/similarity_reward": 0.6011910438537598, "step": 272 }, { "completion_length": 284.625, "epoch": 0.728, "grad_norm": 0.70076206752431, "kl": 0.01031494140625, "learning_rate": 2e-06, "loss": 0.0623, "reward": 0.6957324147224426, "reward_std": 0.11651583760976791, "rewards/length_reward": 0.08348210901021957, "rewards/similarity_reward": 0.6122502088546753, "step": 273 }, { "completion_length": 255.76339721679688, "epoch": 0.7306666666666667, "grad_norm": 0.9105713339266117, "kl": 0.012939453125, "learning_rate": 2e-06, "loss": 0.0998, "reward": 0.6966086626052856, "reward_std": 0.08685937523841858, "rewards/length_reward": 0.07187496870756149, "rewards/similarity_reward": 0.624733567237854, "step": 274 }, { "completion_length": 296.76788330078125, "epoch": 0.7333333333333333, "grad_norm": 0.7414942240643484, "kl": 0.00921630859375, "learning_rate": 2e-06, "loss": 0.0451, "reward": 0.7098910212516785, "reward_std": 0.09432552009820938, "rewards/length_reward": 0.08482139557600021, "rewards/similarity_reward": 0.6250695586204529, "step": 275 }, { "completion_length": 270.0669860839844, "epoch": 0.736, "grad_norm": 0.898333119316704, "kl": 0.0137939453125, "learning_rate": 2e-06, "loss": -0.0225, "reward": 0.7013087868690491, "reward_std": 0.11285625398159027, "rewards/length_reward": 0.0741071105003357, "rewards/similarity_reward": 0.6272015571594238, "step": 276 }, { "completion_length": 198.93751525878906, "epoch": 0.7386666666666667, "grad_norm": 1.1511982372559852, "kl": 0.0205078125, "learning_rate": 2e-06, "loss": 0.0183, "reward": 0.5090009570121765, "reward_std": 0.13011598587036133, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.4469473958015442, "step": 277 }, { "completion_length": 271.58038330078125, "epoch": 0.7413333333333333, "grad_norm": 0.8195827963319392, "kl": 0.01092529296875, "learning_rate": 2e-06, "loss": 0.0099, "reward": 0.6257685422897339, "reward_std": 0.1082058921456337, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.5503220558166504, "step": 278 }, { "completion_length": 273.2232360839844, "epoch": 0.744, "grad_norm": 0.9459393517121532, "kl": 0.01422119140625, "learning_rate": 2e-06, "loss": 0.0902, "reward": 0.6197928190231323, "reward_std": 0.15125982463359833, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.551042914390564, "step": 279 }, { "completion_length": 279.65625, "epoch": 0.7466666666666667, "grad_norm": 0.9319904211339567, "kl": 0.0126953125, "learning_rate": 2e-06, "loss": 0.0701, "reward": 0.6224436163902283, "reward_std": 0.11631693691015244, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5523543357849121, "step": 280 }, { "completion_length": 289.61163330078125, "epoch": 0.7493333333333333, "grad_norm": 1.071648613346095, "kl": 0.010009765625, "learning_rate": 2e-06, "loss": 0.1723, "reward": 0.6961318850517273, "reward_std": 0.10897497087717056, "rewards/length_reward": 0.07723211497068405, "rewards/similarity_reward": 0.6188997030258179, "step": 281 }, { "completion_length": 243.8348388671875, "epoch": 0.752, "grad_norm": 0.994726741013797, "kl": 0.01495361328125, "learning_rate": 2e-06, "loss": 0.0637, "reward": 0.6346572637557983, "reward_std": 0.10820147395133972, "rewards/length_reward": 0.07008925825357437, "rewards/similarity_reward": 0.5645679831504822, "step": 282 }, { "completion_length": 251.38394165039062, "epoch": 0.7546666666666667, "grad_norm": 0.976491323713793, "kl": 0.015380859375, "learning_rate": 2e-06, "loss": 0.0921, "reward": 0.5713584423065186, "reward_std": 0.1460532546043396, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.5061798095703125, "step": 283 }, { "completion_length": 295.8348388671875, "epoch": 0.7573333333333333, "grad_norm": 0.8830201574560093, "kl": 0.0140380859375, "learning_rate": 2e-06, "loss": -0.0096, "reward": 0.5986955761909485, "reward_std": 0.15517514944076538, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5286062955856323, "step": 284 }, { "completion_length": 247.72769165039062, "epoch": 0.76, "grad_norm": 0.8557479345922782, "kl": 0.01385498046875, "learning_rate": 2e-06, "loss": -0.0017, "reward": 0.6696428656578064, "reward_std": 0.10514307767152786, "rewards/length_reward": 0.07857140153646469, "rewards/similarity_reward": 0.5910714268684387, "step": 285 }, { "completion_length": 264.9196472167969, "epoch": 0.7626666666666667, "grad_norm": 0.900029253879394, "kl": 0.01031494140625, "learning_rate": 2e-06, "loss": 0.011, "reward": 0.6437191367149353, "reward_std": 0.12881356477737427, "rewards/length_reward": 0.07991068065166473, "rewards/similarity_reward": 0.5638083815574646, "step": 286 }, { "completion_length": 275.76788330078125, "epoch": 0.7653333333333333, "grad_norm": 0.8434547706339766, "kl": 0.01519775390625, "learning_rate": 2e-06, "loss": 0.0667, "reward": 0.6621875762939453, "reward_std": 0.15463142096996307, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5925447344779968, "step": 287 }, { "completion_length": 251.4866180419922, "epoch": 0.768, "grad_norm": 0.9853184231195431, "kl": 0.01507568359375, "learning_rate": 2e-06, "loss": 0.0183, "reward": 0.5950483679771423, "reward_std": 0.1278952956199646, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.5196019411087036, "step": 288 }, { "completion_length": 279.9419860839844, "epoch": 0.7706666666666667, "grad_norm": 0.8374958728984951, "kl": 0.0126953125, "learning_rate": 2e-06, "loss": 0.1275, "reward": 0.6563042402267456, "reward_std": 0.14691661298274994, "rewards/length_reward": 0.07098212093114853, "rewards/similarity_reward": 0.5853220820426941, "step": 289 }, { "completion_length": 283.12054443359375, "epoch": 0.7733333333333333, "grad_norm": 0.9945411525052974, "kl": 0.01019287109375, "learning_rate": 2e-06, "loss": 0.0685, "reward": 0.7337676882743835, "reward_std": 0.09462190419435501, "rewards/length_reward": 0.07633925974369049, "rewards/similarity_reward": 0.6574283838272095, "step": 290 }, { "completion_length": 286.1026916503906, "epoch": 0.776, "grad_norm": 0.7519820653879999, "kl": 0.01275634765625, "learning_rate": 2e-06, "loss": 0.069, "reward": 0.6660787463188171, "reward_std": 0.13770896196365356, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5915251970291138, "step": 291 }, { "completion_length": 300.4419860839844, "epoch": 0.7786666666666666, "grad_norm": 0.9216655286630218, "kl": 0.01190185546875, "learning_rate": 2e-06, "loss": 0.1068, "reward": 0.7210602164268494, "reward_std": 0.15427368879318237, "rewards/length_reward": 0.08035711199045181, "rewards/similarity_reward": 0.6407030820846558, "step": 292 }, { "completion_length": 252.0178680419922, "epoch": 0.7813333333333333, "grad_norm": 0.8654140719017164, "kl": 0.022705078125, "learning_rate": 2e-06, "loss": 0.1224, "reward": 0.6323553919792175, "reward_std": 0.1670098751783371, "rewards/length_reward": 0.06651784479618073, "rewards/similarity_reward": 0.5658375024795532, "step": 293 }, { "completion_length": 274.8839416503906, "epoch": 0.784, "grad_norm": 0.7786721291031314, "kl": 0.0142822265625, "learning_rate": 2e-06, "loss": 0.0166, "reward": 0.7442488074302673, "reward_std": 0.10799020528793335, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.6696951389312744, "step": 294 }, { "completion_length": 279.1294860839844, "epoch": 0.7866666666666666, "grad_norm": 0.8056776017830058, "kl": 0.011962890625, "learning_rate": 2e-06, "loss": 0.0048, "reward": 0.6635159850120544, "reward_std": 0.11003357172012329, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5934267044067383, "step": 295 }, { "completion_length": 254.96429443359375, "epoch": 0.7893333333333333, "grad_norm": 1.0354511241505295, "kl": 0.0137939453125, "learning_rate": 2e-06, "loss": 0.0919, "reward": 0.6252850294113159, "reward_std": 0.10695895552635193, "rewards/length_reward": 0.07142854481935501, "rewards/similarity_reward": 0.553856372833252, "step": 296 }, { "completion_length": 233.25894165039062, "epoch": 0.792, "grad_norm": 0.9468597047570717, "kl": 0.01275634765625, "learning_rate": 2e-06, "loss": -0.0101, "reward": 0.6699472665786743, "reward_std": 0.16828653216362, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.6003044247627258, "step": 297 }, { "completion_length": 249.1428680419922, "epoch": 0.7946666666666666, "grad_norm": 0.9140690517111535, "kl": 0.01141357421875, "learning_rate": 2e-06, "loss": 0.072, "reward": 0.6824041604995728, "reward_std": 0.12977474927902222, "rewards/length_reward": 0.08035711199045181, "rewards/similarity_reward": 0.6020469069480896, "step": 298 }, { "completion_length": 276.3571472167969, "epoch": 0.7973333333333333, "grad_norm": 0.8259435738042828, "kl": 0.01141357421875, "learning_rate": 2e-06, "loss": 0.0105, "reward": 0.7071071863174438, "reward_std": 0.08347765356302261, "rewards/length_reward": 0.08169639110565186, "rewards/similarity_reward": 0.625410795211792, "step": 299 }, { "completion_length": 270.0758972167969, "epoch": 0.8, "grad_norm": 0.8981450111371676, "kl": 0.0126953125, "learning_rate": 2e-06, "loss": 0.0594, "reward": 0.6137918829917908, "reward_std": 0.12631313502788544, "rewards/length_reward": 0.064732126891613, "rewards/similarity_reward": 0.5490598082542419, "step": 300 }, { "completion_length": 278.83038330078125, "epoch": 0.8026666666666666, "grad_norm": 0.8303352041330266, "kl": 0.01397705078125, "learning_rate": 2e-06, "loss": 0.0166, "reward": 0.6976829767227173, "reward_std": 0.11335788667201996, "rewards/length_reward": 0.08169639110565186, "rewards/similarity_reward": 0.6159866452217102, "step": 301 }, { "completion_length": 257.5357360839844, "epoch": 0.8053333333333333, "grad_norm": 0.8867998574709848, "kl": 0.0135498046875, "learning_rate": 2e-06, "loss": 0.0754, "reward": 0.6026350855827332, "reward_std": 0.0930032953619957, "rewards/length_reward": 0.07589282840490341, "rewards/similarity_reward": 0.526742160320282, "step": 302 }, { "completion_length": 283.5669860839844, "epoch": 0.808, "grad_norm": 0.8168218668358965, "kl": 0.0120849609375, "learning_rate": 2e-06, "loss": 0.0773, "reward": 0.5895494222640991, "reward_std": 0.1272886097431183, "rewards/length_reward": 0.06874997913837433, "rewards/similarity_reward": 0.5207993984222412, "step": 303 }, { "completion_length": 303.6026916503906, "epoch": 0.8106666666666666, "grad_norm": 0.8899545222480755, "kl": 0.012939453125, "learning_rate": 2e-06, "loss": 0.0372, "reward": 0.6067291498184204, "reward_std": 0.1214829757809639, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5366398692131042, "step": 304 }, { "completion_length": 294.3482360839844, "epoch": 0.8133333333333334, "grad_norm": 0.8194602682013028, "kl": 0.01153564453125, "learning_rate": 2e-06, "loss": 0.0566, "reward": 0.7332960963249207, "reward_std": 0.08479318022727966, "rewards/length_reward": 0.07767854630947113, "rewards/similarity_reward": 0.6556174755096436, "step": 305 }, { "completion_length": 266.4375, "epoch": 0.816, "grad_norm": 0.987377079764631, "kl": 0.0133056640625, "learning_rate": 2e-06, "loss": 0.0409, "reward": 0.6382983326911926, "reward_std": 0.1240207627415657, "rewards/length_reward": 0.07366068661212921, "rewards/similarity_reward": 0.5646375417709351, "step": 306 }, { "completion_length": 267.95538330078125, "epoch": 0.8186666666666667, "grad_norm": 0.9217292474743543, "kl": 0.01263427734375, "learning_rate": 2e-06, "loss": 0.0267, "reward": 0.6616266965866089, "reward_std": 0.1070173904299736, "rewards/length_reward": 0.06964283436536789, "rewards/similarity_reward": 0.5919837355613708, "step": 307 }, { "completion_length": 279.0535888671875, "epoch": 0.8213333333333334, "grad_norm": 0.8072891907153936, "kl": 0.01385498046875, "learning_rate": 2e-06, "loss": 0.0943, "reward": 0.6182869672775269, "reward_std": 0.1361446976661682, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5464120507240295, "step": 308 }, { "completion_length": 287.90625, "epoch": 0.824, "grad_norm": 0.9054980085837359, "kl": 0.01153564453125, "learning_rate": 2e-06, "loss": 0.068, "reward": 0.7372510433197021, "reward_std": 0.0957195907831192, "rewards/length_reward": 0.07499997317790985, "rewards/similarity_reward": 0.6622509956359863, "step": 309 }, { "completion_length": 258.3973388671875, "epoch": 0.8266666666666667, "grad_norm": 0.9082585914171709, "kl": 0.01202392578125, "learning_rate": 2e-06, "loss": 0.0061, "reward": 0.647227942943573, "reward_std": 0.1310262531042099, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5766921043395996, "step": 310 }, { "completion_length": 221.2991180419922, "epoch": 0.8293333333333334, "grad_norm": 0.9448119727952166, "kl": 0.021728515625, "learning_rate": 2e-06, "loss": -0.0416, "reward": 0.6708490252494812, "reward_std": 0.1254904717206955, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.5927240252494812, "step": 311 }, { "completion_length": 249.3616180419922, "epoch": 0.832, "grad_norm": 0.8967865636957736, "kl": 0.010009765625, "learning_rate": 2e-06, "loss": 0.0332, "reward": 0.7173448801040649, "reward_std": 0.10435692220926285, "rewards/length_reward": 0.08214282244443893, "rewards/similarity_reward": 0.6352020502090454, "step": 312 }, { "completion_length": 246.80804443359375, "epoch": 0.8346666666666667, "grad_norm": 0.9778150152016866, "kl": 0.01373291015625, "learning_rate": 2e-06, "loss": 0.0158, "reward": 0.6462720036506653, "reward_std": 0.13069510459899902, "rewards/length_reward": 0.07499997317790985, "rewards/similarity_reward": 0.5712720155715942, "step": 313 }, { "completion_length": 244.13839721679688, "epoch": 0.8373333333333334, "grad_norm": 0.9354621189513169, "kl": 0.01214599609375, "learning_rate": 2e-06, "loss": 0.264, "reward": 0.6220008730888367, "reward_std": 0.12123651802539825, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.5599472522735596, "step": 314 }, { "completion_length": 269.5446472167969, "epoch": 0.84, "grad_norm": 0.9611408485021674, "kl": 0.01513671875, "learning_rate": 2e-06, "loss": 0.0553, "reward": 0.725965678691864, "reward_std": 0.09657153487205505, "rewards/length_reward": 0.07723211497068405, "rewards/similarity_reward": 0.6487335562705994, "step": 315 }, { "completion_length": 274.84375, "epoch": 0.8426666666666667, "grad_norm": 0.8797127493353065, "kl": 0.01129150390625, "learning_rate": 2e-06, "loss": 0.014, "reward": 0.6985806226730347, "reward_std": 0.11010481417179108, "rewards/length_reward": 0.0741071105003357, "rewards/similarity_reward": 0.6244734525680542, "step": 316 }, { "completion_length": 272.8348388671875, "epoch": 0.8453333333333334, "grad_norm": 0.779139564448991, "kl": 0.01055908203125, "learning_rate": 2e-06, "loss": -0.0161, "reward": 0.7422655820846558, "reward_std": 0.06264423578977585, "rewards/length_reward": 0.08124996721744537, "rewards/similarity_reward": 0.6610156297683716, "step": 317 }, { "completion_length": 229.7678680419922, "epoch": 0.848, "grad_norm": 4.278780421719415, "kl": 0.0390625, "learning_rate": 2e-06, "loss": 0.0681, "reward": 0.6605138778686523, "reward_std": 0.14701789617538452, "rewards/length_reward": 0.07187496870756149, "rewards/similarity_reward": 0.5886389017105103, "step": 318 }, { "completion_length": 286.2723388671875, "epoch": 0.8506666666666667, "grad_norm": 1.0371097952967725, "kl": 0.01611328125, "learning_rate": 2e-06, "loss": 0.1372, "reward": 0.6110987663269043, "reward_std": 0.18697677552700043, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.549045205116272, "step": 319 }, { "completion_length": 260.0625, "epoch": 0.8533333333333334, "grad_norm": 0.9056186985134533, "kl": 0.009521484375, "learning_rate": 2e-06, "loss": 0.0351, "reward": 0.6572511196136475, "reward_std": 0.08565808087587357, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.5791261196136475, "step": 320 }, { "completion_length": 281.71875, "epoch": 0.856, "grad_norm": 0.71560536286136, "kl": 0.00860595703125, "learning_rate": 2e-06, "loss": 0.0613, "reward": 0.7018586993217468, "reward_std": 0.14483648538589478, "rewards/length_reward": 0.08169639110565186, "rewards/similarity_reward": 0.620162308216095, "step": 321 }, { "completion_length": 258.2276916503906, "epoch": 0.8586666666666667, "grad_norm": 0.9896625298898443, "kl": 0.0181884765625, "learning_rate": 2e-06, "loss": 0.2194, "reward": 0.5979973077774048, "reward_std": 0.13931064307689667, "rewards/length_reward": 0.06249998137354851, "rewards/similarity_reward": 0.5354973077774048, "step": 322 }, { "completion_length": 237.1741180419922, "epoch": 0.8613333333333333, "grad_norm": 0.9609452774135127, "kl": 0.01513671875, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.6683059334754944, "reward_std": 0.10867080092430115, "rewards/length_reward": 0.07991068810224533, "rewards/similarity_reward": 0.5883952975273132, "step": 323 }, { "completion_length": 254.01339721679688, "epoch": 0.864, "grad_norm": 1.0568546085260409, "kl": 0.01531982421875, "learning_rate": 2e-06, "loss": 0.0933, "reward": 0.6546286940574646, "reward_std": 0.11366698145866394, "rewards/length_reward": 0.07008926570415497, "rewards/similarity_reward": 0.5845393538475037, "step": 324 }, { "completion_length": 242.03126525878906, "epoch": 0.8666666666666667, "grad_norm": 0.926913110151395, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": 0.0865, "reward": 0.5440469980239868, "reward_std": 0.11594089865684509, "rewards/length_reward": 0.06383926421403885, "rewards/similarity_reward": 0.48020774126052856, "step": 325 }, { "completion_length": 228.21429443359375, "epoch": 0.8693333333333333, "grad_norm": 1.0043052790420461, "kl": 0.01385498046875, "learning_rate": 2e-06, "loss": -0.0272, "reward": 0.6953443884849548, "reward_std": 0.1263352483510971, "rewards/length_reward": 0.06785711646080017, "rewards/similarity_reward": 0.6274873614311218, "step": 326 }, { "completion_length": 307.3125, "epoch": 0.872, "grad_norm": 0.7734278067274474, "kl": 0.0089111328125, "learning_rate": 2e-06, "loss": 0.0474, "reward": 0.689052402973175, "reward_std": 0.10326018929481506, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.6144987940788269, "step": 327 }, { "completion_length": 256.7589416503906, "epoch": 0.8746666666666667, "grad_norm": 0.8349456447351374, "kl": 0.01275634765625, "learning_rate": 2e-06, "loss": -0.0053, "reward": 0.6755567193031311, "reward_std": 0.12627391517162323, "rewards/length_reward": 0.07321426272392273, "rewards/similarity_reward": 0.6023423671722412, "step": 328 }, { "completion_length": 272.67413330078125, "epoch": 0.8773333333333333, "grad_norm": 0.9337813221087722, "kl": 0.013671875, "learning_rate": 2e-06, "loss": 0.0863, "reward": 0.6436842083930969, "reward_std": 0.1272781938314438, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5802912712097168, "step": 329 }, { "completion_length": 268.5401916503906, "epoch": 0.88, "grad_norm": 0.7822000698940798, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": 0.0076, "reward": 0.6488507986068726, "reward_std": 0.1486339569091797, "rewards/length_reward": 0.07812497019767761, "rewards/similarity_reward": 0.5707257986068726, "step": 330 }, { "completion_length": 255.2991180419922, "epoch": 0.8826666666666667, "grad_norm": 0.9970416796611882, "kl": 0.01348876953125, "learning_rate": 2e-06, "loss": -0.0091, "reward": 0.623367190361023, "reward_std": 0.11435237526893616, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.547920823097229, "step": 331 }, { "completion_length": 281.2544860839844, "epoch": 0.8853333333333333, "grad_norm": 0.9393864746480084, "kl": 0.0177001953125, "learning_rate": 2e-06, "loss": -0.0269, "reward": 0.6835038661956787, "reward_std": 0.12119947373867035, "rewards/length_reward": 0.07366069406270981, "rewards/similarity_reward": 0.6098431348800659, "step": 332 }, { "completion_length": 277.3883972167969, "epoch": 0.888, "grad_norm": 0.8857739277618905, "kl": 0.0133056640625, "learning_rate": 2e-06, "loss": 0.1155, "reward": 0.6458525657653809, "reward_std": 0.10718663036823273, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5739776492118835, "step": 333 }, { "completion_length": 222.0491180419922, "epoch": 0.8906666666666667, "grad_norm": 0.9211484451974372, "kl": 0.0145263671875, "learning_rate": 2e-06, "loss": 0.0326, "reward": 0.6779581308364868, "reward_std": 0.09158685058355331, "rewards/length_reward": 0.07098212093114853, "rewards/similarity_reward": 0.6069758534431458, "step": 334 }, { "completion_length": 253.54464721679688, "epoch": 0.8933333333333333, "grad_norm": 1.0439727694024494, "kl": 0.01513671875, "learning_rate": 2e-06, "loss": 0.0938, "reward": 0.6808232069015503, "reward_std": 0.11848772317171097, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.6053767204284668, "step": 335 }, { "completion_length": 233.00001525878906, "epoch": 0.896, "grad_norm": 0.9838382076638493, "kl": 0.015869140625, "learning_rate": 2e-06, "loss": 0.0451, "reward": 0.58315509557724, "reward_std": 0.11559745669364929, "rewards/length_reward": 0.06741069257259369, "rewards/similarity_reward": 0.5157443881034851, "step": 336 }, { "completion_length": 269.0133972167969, "epoch": 0.8986666666666666, "grad_norm": 0.836096159627277, "kl": 0.01361083984375, "learning_rate": 2e-06, "loss": 0.0062, "reward": 0.6409623622894287, "reward_std": 0.12886659801006317, "rewards/length_reward": 0.07767854630947113, "rewards/similarity_reward": 0.5632836818695068, "step": 337 }, { "completion_length": 251.40179443359375, "epoch": 0.9013333333333333, "grad_norm": 0.8535550782434568, "kl": 0.01422119140625, "learning_rate": 2e-06, "loss": 0.0089, "reward": 0.6296460032463074, "reward_std": 0.16196994483470917, "rewards/length_reward": 0.07187497615814209, "rewards/similarity_reward": 0.5577709674835205, "step": 338 }, { "completion_length": 244.80804443359375, "epoch": 0.904, "grad_norm": 1.0854210987310389, "kl": 0.0181884765625, "learning_rate": 2e-06, "loss": 0.0595, "reward": 0.597773551940918, "reward_std": 0.10373269766569138, "rewards/length_reward": 0.06339284032583237, "rewards/similarity_reward": 0.5343807935714722, "step": 339 }, { "completion_length": 262.6875, "epoch": 0.9066666666666666, "grad_norm": 0.8356624214145669, "kl": 0.013916015625, "learning_rate": 2e-06, "loss": 0.0101, "reward": 0.6685509085655212, "reward_std": 0.0960090234875679, "rewards/length_reward": 0.07946424931287766, "rewards/similarity_reward": 0.5890867114067078, "step": 340 }, { "completion_length": 262.5535888671875, "epoch": 0.9093333333333333, "grad_norm": 1.0702777784285404, "kl": 0.02783203125, "learning_rate": 2e-06, "loss": 0.1024, "reward": 0.5271078944206238, "reward_std": 0.14886566996574402, "rewards/length_reward": 0.064732126891613, "rewards/similarity_reward": 0.46237578988075256, "step": 341 }, { "completion_length": 251.03126525878906, "epoch": 0.912, "grad_norm": 0.9235089036400403, "kl": 0.0172119140625, "learning_rate": 2e-06, "loss": 0.0324, "reward": 0.6528847217559814, "reward_std": 0.11811169981956482, "rewards/length_reward": 0.06383927166461945, "rewards/similarity_reward": 0.5890454053878784, "step": 342 }, { "completion_length": 236.2857208251953, "epoch": 0.9146666666666666, "grad_norm": 0.9202873381431551, "kl": 0.01556396484375, "learning_rate": 2e-06, "loss": 0.0497, "reward": 0.6048458814620972, "reward_std": 0.13773028552532196, "rewards/length_reward": 0.07544640451669693, "rewards/similarity_reward": 0.5293995141983032, "step": 343 }, { "completion_length": 309.99554443359375, "epoch": 0.9173333333333333, "grad_norm": 0.9489271796425148, "kl": 0.01226806640625, "learning_rate": 2e-06, "loss": 0.0812, "reward": 0.6291395425796509, "reward_std": 0.15959399938583374, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5621752738952637, "step": 344 }, { "completion_length": 245.21429443359375, "epoch": 0.92, "grad_norm": 1.1363976997302783, "kl": 0.0213623046875, "learning_rate": 2e-06, "loss": 0.1781, "reward": 0.5569170713424683, "reward_std": 0.14195482432842255, "rewards/length_reward": 0.06160712614655495, "rewards/similarity_reward": 0.49530985951423645, "step": 345 }, { "completion_length": 272.2232360839844, "epoch": 0.9226666666666666, "grad_norm": 0.7565012868381632, "kl": 0.0108642578125, "learning_rate": 2e-06, "loss": 0.0695, "reward": 0.6887885928153992, "reward_std": 0.11395367234945297, "rewards/length_reward": 0.07857140153646469, "rewards/similarity_reward": 0.6102170348167419, "step": 346 }, { "completion_length": 310.28125, "epoch": 0.9253333333333333, "grad_norm": 0.782594647397142, "kl": 0.01202392578125, "learning_rate": 2e-06, "loss": 0.0752, "reward": 0.6732801795005798, "reward_std": 0.12288369983434677, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5987265706062317, "step": 347 }, { "completion_length": 307.86163330078125, "epoch": 0.928, "grad_norm": 0.7087010173295871, "kl": 0.013671875, "learning_rate": 2e-06, "loss": 0.008, "reward": 0.6750614047050476, "reward_std": 0.09951343387365341, "rewards/length_reward": 0.07410712540149689, "rewards/similarity_reward": 0.6009542942047119, "step": 348 }, { "completion_length": 273.3883972167969, "epoch": 0.9306666666666666, "grad_norm": 1.0456544147832767, "kl": 0.01446533203125, "learning_rate": 2e-06, "loss": 0.0404, "reward": 0.6893116235733032, "reward_std": 0.09679926186800003, "rewards/length_reward": 0.06517855077981949, "rewards/similarity_reward": 0.6241331100463867, "step": 349 }, { "completion_length": 264.1919860839844, "epoch": 0.9333333333333333, "grad_norm": 0.8991803002318822, "kl": 0.01397705078125, "learning_rate": 2e-06, "loss": 0.0129, "reward": 0.6719235181808472, "reward_std": 0.13838014006614685, "rewards/length_reward": 0.07321426272392273, "rewards/similarity_reward": 0.598709225654602, "step": 350 }, { "completion_length": 274.9508972167969, "epoch": 0.936, "grad_norm": 0.8591029679110356, "kl": 0.013916015625, "learning_rate": 2e-06, "loss": 0.0312, "reward": 0.5572786331176758, "reward_std": 0.14849816262722015, "rewards/length_reward": 0.06785711646080017, "rewards/similarity_reward": 0.489421546459198, "step": 351 }, { "completion_length": 277.9508972167969, "epoch": 0.9386666666666666, "grad_norm": 0.8388349903111052, "kl": 0.01141357421875, "learning_rate": 2e-06, "loss": 0.0278, "reward": 0.681722104549408, "reward_std": 0.0899442657828331, "rewards/length_reward": 0.07991068065166473, "rewards/similarity_reward": 0.6018112897872925, "step": 352 }, { "completion_length": 240.3348388671875, "epoch": 0.9413333333333334, "grad_norm": 0.867198144640214, "kl": 0.01336669921875, "learning_rate": 2e-06, "loss": -0.0137, "reward": 0.6065589785575867, "reward_std": 0.11837570369243622, "rewards/length_reward": 0.07946424931287766, "rewards/similarity_reward": 0.5270946025848389, "step": 353 }, { "completion_length": 287.4151916503906, "epoch": 0.944, "grad_norm": 1.2363858371428533, "kl": 0.0157470703125, "learning_rate": 2e-06, "loss": 0.0649, "reward": 0.6748880743980408, "reward_std": 0.13465073704719543, "rewards/length_reward": 0.07187496870756149, "rewards/similarity_reward": 0.6030132174491882, "step": 354 }, { "completion_length": 309.6919860839844, "epoch": 0.9466666666666667, "grad_norm": 0.6824545679415536, "kl": 0.00946044921875, "learning_rate": 2e-06, "loss": 0.0881, "reward": 0.6755697727203369, "reward_std": 0.13920390605926514, "rewards/length_reward": 0.07901783287525177, "rewards/similarity_reward": 0.5965518355369568, "step": 355 }, { "completion_length": 304.0089416503906, "epoch": 0.9493333333333334, "grad_norm": 0.7612207527990814, "kl": 0.00860595703125, "learning_rate": 2e-06, "loss": 0.0573, "reward": 0.6815410852432251, "reward_std": 0.11536341905593872, "rewards/length_reward": 0.07455354928970337, "rewards/similarity_reward": 0.6069875955581665, "step": 356 }, { "completion_length": 280.4151916503906, "epoch": 0.952, "grad_norm": 0.9197442279455559, "kl": 0.014892578125, "learning_rate": 2e-06, "loss": 0.0976, "reward": 0.6213651895523071, "reward_std": 0.1452549546957016, "rewards/length_reward": 0.059374988079071045, "rewards/similarity_reward": 0.5619902014732361, "step": 357 }, { "completion_length": 311.9375, "epoch": 0.9546666666666667, "grad_norm": 0.7667231406265689, "kl": 0.0123291015625, "learning_rate": 2e-06, "loss": 0.0949, "reward": 0.671852707862854, "reward_std": 0.12415429949760437, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.5972990989685059, "step": 358 }, { "completion_length": 296.4598388671875, "epoch": 0.9573333333333334, "grad_norm": 0.784755991417782, "kl": 0.0093994140625, "learning_rate": 2e-06, "loss": 0.0284, "reward": 0.7474254965782166, "reward_std": 0.10959716141223907, "rewards/length_reward": 0.07901783287525177, "rewards/similarity_reward": 0.6684076189994812, "step": 359 }, { "completion_length": 253.15626525878906, "epoch": 0.96, "grad_norm": 0.8190677358549971, "kl": 0.0125732421875, "learning_rate": 2e-06, "loss": -0.0109, "reward": 0.6444076299667358, "reward_std": 0.1199827641248703, "rewards/length_reward": 0.07142855226993561, "rewards/similarity_reward": 0.5729790925979614, "step": 360 }, { "completion_length": 306.5848388671875, "epoch": 0.9626666666666667, "grad_norm": 0.8327047575723723, "kl": 0.0101318359375, "learning_rate": 2e-06, "loss": 0.044, "reward": 0.6085981726646423, "reward_std": 0.15067243576049805, "rewards/length_reward": 0.07366069406270981, "rewards/similarity_reward": 0.5349374413490295, "step": 361 }, { "completion_length": 254.7723388671875, "epoch": 0.9653333333333334, "grad_norm": 0.8753840404117226, "kl": 0.0155029296875, "learning_rate": 2e-06, "loss": 0.0219, "reward": 0.6461009383201599, "reward_std": 0.11166159808635712, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5755651593208313, "step": 362 }, { "completion_length": 260.33929443359375, "epoch": 0.968, "grad_norm": 0.8487935107258318, "kl": 0.0135498046875, "learning_rate": 2e-06, "loss": 0.0758, "reward": 0.6731547713279724, "reward_std": 0.0943944975733757, "rewards/length_reward": 0.07946424931287766, "rewards/similarity_reward": 0.5936905145645142, "step": 363 }, { "completion_length": 284.9107360839844, "epoch": 0.9706666666666667, "grad_norm": 0.9104348928736092, "kl": 0.0146484375, "learning_rate": 2e-06, "loss": 0.0438, "reward": 0.6029422879219055, "reward_std": 0.13879723846912384, "rewards/length_reward": 0.07053568959236145, "rewards/similarity_reward": 0.5324065685272217, "step": 364 }, { "completion_length": 291.1339416503906, "epoch": 0.9733333333333334, "grad_norm": 0.8351250207880698, "kl": 0.01312255859375, "learning_rate": 2e-06, "loss": 0.0447, "reward": 0.6295793056488037, "reward_std": 0.11455141007900238, "rewards/length_reward": 0.07321426272392273, "rewards/similarity_reward": 0.556364893913269, "step": 365 }, { "completion_length": 313.6160888671875, "epoch": 0.976, "grad_norm": 0.8331869500678173, "kl": 0.0137939453125, "learning_rate": 2e-06, "loss": 0.1377, "reward": 0.6497610807418823, "reward_std": 0.13638634979724884, "rewards/length_reward": 0.064732126891613, "rewards/similarity_reward": 0.5850289463996887, "step": 366 }, { "completion_length": 260.46875, "epoch": 0.9786666666666667, "grad_norm": 0.9519334592833407, "kl": 0.0184326171875, "learning_rate": 2e-06, "loss": 0.108, "reward": 0.6215986609458923, "reward_std": 0.13745638728141785, "rewards/length_reward": 0.06696426123380661, "rewards/similarity_reward": 0.5546343326568604, "step": 367 }, { "completion_length": 261.9196472167969, "epoch": 0.9813333333333333, "grad_norm": 0.9206057200583376, "kl": 0.01361083984375, "learning_rate": 2e-06, "loss": 0.1112, "reward": 0.5996190905570984, "reward_std": 0.13816344738006592, "rewards/length_reward": 0.0741071105003357, "rewards/similarity_reward": 0.5255119204521179, "step": 368 }, { "completion_length": 187.60269165039062, "epoch": 0.984, "grad_norm": 34.2971123341626, "kl": 0.0152587890625, "learning_rate": 2e-06, "loss": -0.0025, "reward": 0.6047165989875793, "reward_std": 0.11468542367219925, "rewards/length_reward": 0.07276783138513565, "rewards/similarity_reward": 0.5319487452507019, "step": 369 }, { "completion_length": 273.24554443359375, "epoch": 0.9866666666666667, "grad_norm": 1.0225592676611843, "kl": 0.0157470703125, "learning_rate": 2e-06, "loss": 0.0974, "reward": 0.5848848819732666, "reward_std": 0.13747373223304749, "rewards/length_reward": 0.06205355003476143, "rewards/similarity_reward": 0.5228313207626343, "step": 370 }, { "completion_length": 242.43304443359375, "epoch": 0.9893333333333333, "grad_norm": 0.9519953414264258, "kl": 0.01556396484375, "learning_rate": 2e-06, "loss": 0.1648, "reward": 0.6343554854393005, "reward_std": 0.14080199599266052, "rewards/length_reward": 0.07142855226993561, "rewards/similarity_reward": 0.5629268884658813, "step": 371 }, { "completion_length": 295.7633972167969, "epoch": 0.992, "grad_norm": 0.7534581065002547, "kl": 0.0140380859375, "learning_rate": 2e-06, "loss": 0.0601, "reward": 0.6254644393920898, "reward_std": 0.14738810062408447, "rewards/length_reward": 0.07232140004634857, "rewards/similarity_reward": 0.5531430244445801, "step": 372 }, { "completion_length": 240.38394165039062, "epoch": 0.9946666666666667, "grad_norm": 0.9672618888481953, "kl": 0.01239013671875, "learning_rate": 2e-06, "loss": 0.0627, "reward": 0.6286079287528992, "reward_std": 0.1355430781841278, "rewards/length_reward": 0.07232140004634857, "rewards/similarity_reward": 0.5562865734100342, "step": 373 }, { "completion_length": 296.3482360839844, "epoch": 0.9973333333333333, "grad_norm": 0.8090409603684708, "kl": 0.0118408203125, "learning_rate": 2e-06, "loss": 0.0202, "reward": 0.7064945697784424, "reward_std": 0.08492975682020187, "rewards/length_reward": 0.07455354183912277, "rewards/similarity_reward": 0.631941020488739, "step": 374 }, { "completion_length": 207.4114227294922, "epoch": 1.0, "grad_norm": 0.9985931023470964, "kl": 0.01458740234375, "learning_rate": 2e-06, "loss": 0.0328, "reward": 0.6288642883300781, "reward_std": 0.13037118315696716, "rewards/length_reward": 0.07276783138513565, "rewards/similarity_reward": 0.5560964941978455, "step": 375 } ], "logging_steps": 1, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }