|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 375, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 376.9687805175781, |
|
"epoch": 0.0026666666666666666, |
|
"grad_norm": 0.8290089342545105, |
|
"kl": 0.00012969970703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1243, |
|
"reward": 0.26896461844444275, |
|
"reward_std": 0.2736624479293823, |
|
"rewards/length_reward": 0.026339290663599968, |
|
"rewards/similarity_reward": 0.24262532591819763, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 363.27679443359375, |
|
"epoch": 0.005333333333333333, |
|
"grad_norm": 0.8474061525539559, |
|
"kl": 0.00015354156494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1524, |
|
"reward": 0.1628378927707672, |
|
"reward_std": 0.21853798627853394, |
|
"rewards/length_reward": 0.0178571455180645, |
|
"rewards/similarity_reward": 0.14498072862625122, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 378.4732360839844, |
|
"epoch": 0.008, |
|
"grad_norm": 0.8192733588218007, |
|
"kl": 0.0001430511474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1852, |
|
"reward": 0.27797675132751465, |
|
"reward_std": 0.23766961693763733, |
|
"rewards/length_reward": 0.02633928880095482, |
|
"rewards/similarity_reward": 0.25163745880126953, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 359.21429443359375, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.845054880446133, |
|
"kl": 0.00015544891357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0616, |
|
"reward": 0.1538199633359909, |
|
"reward_std": 0.17373259365558624, |
|
"rewards/length_reward": 0.013839286752045155, |
|
"rewards/similarity_reward": 0.13998067378997803, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 336.15625, |
|
"epoch": 0.013333333333333334, |
|
"grad_norm": 0.8717553512231005, |
|
"kl": 0.00014972686767578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1209, |
|
"reward": 0.21149994432926178, |
|
"reward_std": 0.2120143473148346, |
|
"rewards/length_reward": 0.01830357313156128, |
|
"rewards/similarity_reward": 0.1931963711977005, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 373.6026916503906, |
|
"epoch": 0.016, |
|
"grad_norm": 0.7610025457969676, |
|
"kl": 0.00013446807861328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1513, |
|
"reward": 0.1827303022146225, |
|
"reward_std": 0.24866001307964325, |
|
"rewards/length_reward": 0.02187500335276127, |
|
"rewards/similarity_reward": 0.16085529327392578, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 341.7589416503906, |
|
"epoch": 0.018666666666666668, |
|
"grad_norm": 0.8626611704865026, |
|
"kl": 0.0001583099365234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1271, |
|
"reward": 0.19529196619987488, |
|
"reward_std": 0.2814559042453766, |
|
"rewards/length_reward": 0.021428575739264488, |
|
"rewards/similarity_reward": 0.17386338114738464, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 424.98663330078125, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.7146043340468313, |
|
"kl": 0.00017452239990234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1881, |
|
"reward": 0.21610300242900848, |
|
"reward_std": 0.2689198851585388, |
|
"rewards/length_reward": 0.01830357313156128, |
|
"rewards/similarity_reward": 0.197799414396286, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 348.2321472167969, |
|
"epoch": 0.024, |
|
"grad_norm": 0.7606250954270842, |
|
"kl": 0.000186920166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1276, |
|
"reward": 0.20473892986774445, |
|
"reward_std": 0.23727914690971375, |
|
"rewards/length_reward": 0.02008928917348385, |
|
"rewards/similarity_reward": 0.18464964628219604, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 355.54913330078125, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.8173370703071066, |
|
"kl": 0.00018310546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0787, |
|
"reward": 0.227640300989151, |
|
"reward_std": 0.2539962828159332, |
|
"rewards/length_reward": 0.01741071790456772, |
|
"rewards/similarity_reward": 0.21022957563400269, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 343.7232360839844, |
|
"epoch": 0.029333333333333333, |
|
"grad_norm": 0.9104553890156574, |
|
"kl": 0.00019550323486328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0575, |
|
"reward": 0.25083568692207336, |
|
"reward_std": 0.2815442383289337, |
|
"rewards/length_reward": 0.02767857536673546, |
|
"rewards/similarity_reward": 0.22315707802772522, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 355.0535888671875, |
|
"epoch": 0.032, |
|
"grad_norm": 0.8007928014475878, |
|
"kl": 0.0003528594970703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1448, |
|
"reward": 0.2787685990333557, |
|
"reward_std": 0.25941261649131775, |
|
"rewards/length_reward": 0.025892863050103188, |
|
"rewards/similarity_reward": 0.25287577509880066, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 395.4107360839844, |
|
"epoch": 0.034666666666666665, |
|
"grad_norm": 0.7050603406845205, |
|
"kl": 0.000255584716796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1127, |
|
"reward": 0.31717172265052795, |
|
"reward_std": 0.2762907147407532, |
|
"rewards/length_reward": 0.02946428954601288, |
|
"rewards/similarity_reward": 0.28770744800567627, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 361.05804443359375, |
|
"epoch": 0.037333333333333336, |
|
"grad_norm": 0.9360978406768153, |
|
"kl": 0.0003376007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.142, |
|
"reward": 0.24003884196281433, |
|
"reward_std": 0.27974435687065125, |
|
"rewards/length_reward": 0.021428575739264488, |
|
"rewards/similarity_reward": 0.2186102569103241, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 398.0044860839844, |
|
"epoch": 0.04, |
|
"grad_norm": 0.7389563411116621, |
|
"kl": 0.0003604888916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1137, |
|
"reward": 0.23077349364757538, |
|
"reward_std": 0.24957218766212463, |
|
"rewards/length_reward": 0.0178571455180645, |
|
"rewards/similarity_reward": 0.21291638910770416, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 366.4196472167969, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.7079515986093292, |
|
"kl": 0.0004177093505859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1596, |
|
"reward": 0.16758890450000763, |
|
"reward_std": 0.1997506469488144, |
|
"rewards/length_reward": 0.0178571455180645, |
|
"rewards/similarity_reward": 0.14973175525665283, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 385.87054443359375, |
|
"epoch": 0.04533333333333334, |
|
"grad_norm": 0.7793857856999354, |
|
"kl": 0.000385284423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1863, |
|
"reward": 0.2408275306224823, |
|
"reward_std": 0.28883570432662964, |
|
"rewards/length_reward": 0.022321434691548347, |
|
"rewards/similarity_reward": 0.2185060679912567, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 373.0446472167969, |
|
"epoch": 0.048, |
|
"grad_norm": 0.8181367418694138, |
|
"kl": 0.0005035400390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0998, |
|
"reward": 0.2590915262699127, |
|
"reward_std": 0.26667794585227966, |
|
"rewards/length_reward": 0.02678571827709675, |
|
"rewards/similarity_reward": 0.23230580985546112, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 314.1026916503906, |
|
"epoch": 0.050666666666666665, |
|
"grad_norm": 0.8473993736940718, |
|
"kl": 0.000438690185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0861, |
|
"reward": 0.34537845849990845, |
|
"reward_std": 0.26645439863204956, |
|
"rewards/length_reward": 0.03705357387661934, |
|
"rewards/similarity_reward": 0.3083249032497406, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 354.93304443359375, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.8882148635006984, |
|
"kl": 0.00057220458984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1732, |
|
"reward": 0.3256897032260895, |
|
"reward_std": 0.25436195731163025, |
|
"rewards/length_reward": 0.025446433573961258, |
|
"rewards/similarity_reward": 0.3002432584762573, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 354.1250305175781, |
|
"epoch": 0.056, |
|
"grad_norm": 0.8096248306365297, |
|
"kl": 0.000713348388671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1809, |
|
"reward": 0.42166247963905334, |
|
"reward_std": 0.2462671995162964, |
|
"rewards/length_reward": 0.0401785746216774, |
|
"rewards/similarity_reward": 0.38148391246795654, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 379.8258972167969, |
|
"epoch": 0.058666666666666666, |
|
"grad_norm": 0.8324528992208251, |
|
"kl": 0.000713348388671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1888, |
|
"reward": 0.4674707353115082, |
|
"reward_std": 0.28602704405784607, |
|
"rewards/length_reward": 0.03883929178118706, |
|
"rewards/similarity_reward": 0.42863139510154724, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 336.1964416503906, |
|
"epoch": 0.06133333333333333, |
|
"grad_norm": 0.883659092626158, |
|
"kl": 0.001129150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1224, |
|
"reward": 0.3572904169559479, |
|
"reward_std": 0.2817726135253906, |
|
"rewards/length_reward": 0.030803577974438667, |
|
"rewards/similarity_reward": 0.32648688554763794, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 379.8750305175781, |
|
"epoch": 0.064, |
|
"grad_norm": 0.7759947978320788, |
|
"kl": 0.00135040283203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.163, |
|
"reward": 0.34559932351112366, |
|
"reward_std": 0.26974016427993774, |
|
"rewards/length_reward": 0.03705357387661934, |
|
"rewards/similarity_reward": 0.3085457384586334, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 334.71875, |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 0.8358157724868338, |
|
"kl": 0.001068115234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1326, |
|
"reward": 0.3908008337020874, |
|
"reward_std": 0.3024666905403137, |
|
"rewards/length_reward": 0.03928571566939354, |
|
"rewards/similarity_reward": 0.35151511430740356, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 372.55804443359375, |
|
"epoch": 0.06933333333333333, |
|
"grad_norm": 0.7610834907565935, |
|
"kl": 0.0027008056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1688, |
|
"reward": 0.2864897847175598, |
|
"reward_std": 0.2402629852294922, |
|
"rewards/length_reward": 0.02500000409781933, |
|
"rewards/similarity_reward": 0.2614898085594177, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 332.1651916503906, |
|
"epoch": 0.072, |
|
"grad_norm": 0.9340372327621089, |
|
"kl": 0.00152587890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1936, |
|
"reward": 0.32003700733184814, |
|
"reward_std": 0.28589487075805664, |
|
"rewards/length_reward": 0.0334821492433548, |
|
"rewards/similarity_reward": 0.28655487298965454, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 421.08929443359375, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.7506727526601732, |
|
"kl": 0.00148773193359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1452, |
|
"reward": 0.39091238379478455, |
|
"reward_std": 0.21896174550056458, |
|
"rewards/length_reward": 0.03482143208384514, |
|
"rewards/similarity_reward": 0.3560909032821655, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 381.3348388671875, |
|
"epoch": 0.07733333333333334, |
|
"grad_norm": 0.6952200929063435, |
|
"kl": 0.0017242431640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1113, |
|
"reward": 0.36936715245246887, |
|
"reward_std": 0.23492401838302612, |
|
"rewards/length_reward": 0.030357148498296738, |
|
"rewards/similarity_reward": 0.33900997042655945, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 334.6607360839844, |
|
"epoch": 0.08, |
|
"grad_norm": 0.9106612624614471, |
|
"kl": 0.00170135498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0756, |
|
"reward": 0.4293070435523987, |
|
"reward_std": 0.2823811173439026, |
|
"rewards/length_reward": 0.0401785746216774, |
|
"rewards/similarity_reward": 0.3891284763813019, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 461.99554443359375, |
|
"epoch": 0.08266666666666667, |
|
"grad_norm": 0.661013366993289, |
|
"kl": 0.0020751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2489, |
|
"reward": 0.3686121702194214, |
|
"reward_std": 0.2422313094139099, |
|
"rewards/length_reward": 0.028571434319019318, |
|
"rewards/similarity_reward": 0.3400407135486603, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 354.83038330078125, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.9261837111640477, |
|
"kl": 0.0028533935546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.3225, |
|
"reward": 0.3788739740848541, |
|
"reward_std": 0.26962369680404663, |
|
"rewards/length_reward": 0.03482143208384514, |
|
"rewards/similarity_reward": 0.34405258297920227, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 355.5357360839844, |
|
"epoch": 0.088, |
|
"grad_norm": 0.8643961614615921, |
|
"kl": 0.0026397705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1893, |
|
"reward": 0.3799653649330139, |
|
"reward_std": 0.2525205910205841, |
|
"rewards/length_reward": 0.03303571790456772, |
|
"rewards/similarity_reward": 0.346929669380188, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 303.2901916503906, |
|
"epoch": 0.09066666666666667, |
|
"grad_norm": 0.9531070627714385, |
|
"kl": 0.002532958984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0718, |
|
"reward": 0.435234397649765, |
|
"reward_std": 0.2490427941083908, |
|
"rewards/length_reward": 0.03883928805589676, |
|
"rewards/similarity_reward": 0.39639511704444885, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 391.89288330078125, |
|
"epoch": 0.09333333333333334, |
|
"grad_norm": 0.8111465127450542, |
|
"kl": 0.0019683837890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1647, |
|
"reward": 0.4038808047771454, |
|
"reward_std": 0.31206637620925903, |
|
"rewards/length_reward": 0.03883928805589676, |
|
"rewards/similarity_reward": 0.3650415241718292, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 359.9196472167969, |
|
"epoch": 0.096, |
|
"grad_norm": 0.8281785424636492, |
|
"kl": 0.00335693359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1875, |
|
"reward": 0.4340634346008301, |
|
"reward_std": 0.27002009749412537, |
|
"rewards/length_reward": 0.04151785373687744, |
|
"rewards/similarity_reward": 0.39254552125930786, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 287.6875, |
|
"epoch": 0.09866666666666667, |
|
"grad_norm": 1.053364246590671, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.085, |
|
"reward": 0.47383809089660645, |
|
"reward_std": 0.22637499868869781, |
|
"rewards/length_reward": 0.04374999925494194, |
|
"rewards/similarity_reward": 0.430088073015213, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 398.1160888671875, |
|
"epoch": 0.10133333333333333, |
|
"grad_norm": 0.7805356469167566, |
|
"kl": 0.00341796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2197, |
|
"reward": 0.4928036332130432, |
|
"reward_std": 0.24267539381980896, |
|
"rewards/length_reward": 0.050446417182683945, |
|
"rewards/similarity_reward": 0.4423570930957794, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 312.7276916503906, |
|
"epoch": 0.104, |
|
"grad_norm": 0.819085900048414, |
|
"kl": 0.00177001953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1426, |
|
"reward": 0.5058891177177429, |
|
"reward_std": 0.24202971160411835, |
|
"rewards/length_reward": 0.04776785522699356, |
|
"rewards/similarity_reward": 0.4581212103366852, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 386.8214416503906, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.7650271991129984, |
|
"kl": 0.0030364990234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1882, |
|
"reward": 0.4022373855113983, |
|
"reward_std": 0.24781934916973114, |
|
"rewards/length_reward": 0.0401785746216774, |
|
"rewards/similarity_reward": 0.3620587885379791, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 250.4107208251953, |
|
"epoch": 0.10933333333333334, |
|
"grad_norm": 0.9803970599540968, |
|
"kl": 0.003875732421875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0008, |
|
"reward": 0.5008234977722168, |
|
"reward_std": 0.22121772170066833, |
|
"rewards/length_reward": 0.050446417182683945, |
|
"rewards/similarity_reward": 0.45037704706192017, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 317.1160888671875, |
|
"epoch": 0.112, |
|
"grad_norm": 0.8986374178737812, |
|
"kl": 0.00341796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2065, |
|
"reward": 0.45162686705589294, |
|
"reward_std": 0.27914097905158997, |
|
"rewards/length_reward": 0.04598214104771614, |
|
"rewards/similarity_reward": 0.4056447148323059, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 313.4821472167969, |
|
"epoch": 0.11466666666666667, |
|
"grad_norm": 0.9439169733302692, |
|
"kl": 0.0040283203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1189, |
|
"reward": 0.5015469789505005, |
|
"reward_std": 0.2071218341588974, |
|
"rewards/length_reward": 0.050892848521471024, |
|
"rewards/similarity_reward": 0.4506540596485138, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 308.23663330078125, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.843848181524653, |
|
"kl": 0.004150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1423, |
|
"reward": 0.49464964866638184, |
|
"reward_std": 0.19760312139987946, |
|
"rewards/length_reward": 0.05223213508725166, |
|
"rewards/similarity_reward": 0.4424174726009369, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 322.51788330078125, |
|
"epoch": 0.12, |
|
"grad_norm": 0.808670899867239, |
|
"kl": 0.004730224609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2162, |
|
"reward": 0.45207658410072327, |
|
"reward_std": 0.22255302965641022, |
|
"rewards/length_reward": 0.03883929178118706, |
|
"rewards/similarity_reward": 0.4132373034954071, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 306.9732360839844, |
|
"epoch": 0.12266666666666666, |
|
"grad_norm": 0.9215129819151354, |
|
"kl": 0.0038604736328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1492, |
|
"reward": 0.5217949151992798, |
|
"reward_std": 0.24197062849998474, |
|
"rewards/length_reward": 0.04598213732242584, |
|
"rewards/similarity_reward": 0.47581273317337036, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 305.45538330078125, |
|
"epoch": 0.12533333333333332, |
|
"grad_norm": 0.8829840145792894, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2397, |
|
"reward": 0.471711665391922, |
|
"reward_std": 0.15981332957744598, |
|
"rewards/length_reward": 0.049553561955690384, |
|
"rewards/similarity_reward": 0.42215806245803833, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 294.4419860839844, |
|
"epoch": 0.128, |
|
"grad_norm": 0.8246638858466566, |
|
"kl": 0.004241943359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0913, |
|
"reward": 0.4828924238681793, |
|
"reward_std": 0.19750112295150757, |
|
"rewards/length_reward": 0.053124990314245224, |
|
"rewards/similarity_reward": 0.429767370223999, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 332.45538330078125, |
|
"epoch": 0.13066666666666665, |
|
"grad_norm": 0.7904388485187617, |
|
"kl": 0.004913330078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1529, |
|
"reward": 0.45801258087158203, |
|
"reward_std": 0.2542867660522461, |
|
"rewards/length_reward": 0.04196428507566452, |
|
"rewards/similarity_reward": 0.4160482585430145, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 281.77679443359375, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.9845645428183626, |
|
"kl": 0.00433349609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0801, |
|
"reward": 0.5682670474052429, |
|
"reward_std": 0.23296673595905304, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5030884742736816, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 316.7410888671875, |
|
"epoch": 0.136, |
|
"grad_norm": 0.9300429591791828, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.103, |
|
"reward": 0.44630467891693115, |
|
"reward_std": 0.12811601161956787, |
|
"rewards/length_reward": 0.04821427911520004, |
|
"rewards/similarity_reward": 0.3980904519557953, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 296.8973388671875, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.8592422567531082, |
|
"kl": 0.005218505859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1159, |
|
"reward": 0.5130535960197449, |
|
"reward_std": 0.1873682290315628, |
|
"rewards/length_reward": 0.0491071380674839, |
|
"rewards/similarity_reward": 0.4639464318752289, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 284.0401916503906, |
|
"epoch": 0.14133333333333334, |
|
"grad_norm": 0.8593724590061699, |
|
"kl": 0.0048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0087, |
|
"reward": 0.5250208973884583, |
|
"reward_std": 0.21563619375228882, |
|
"rewards/length_reward": 0.057142842561006546, |
|
"rewards/similarity_reward": 0.46787798404693604, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 268.21429443359375, |
|
"epoch": 0.144, |
|
"grad_norm": 0.9220995505083402, |
|
"kl": 0.005645751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0234, |
|
"reward": 0.6078009605407715, |
|
"reward_std": 0.18404294550418854, |
|
"rewards/length_reward": 0.04508928582072258, |
|
"rewards/similarity_reward": 0.5627117156982422, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 332.2232360839844, |
|
"epoch": 0.14666666666666667, |
|
"grad_norm": 0.8153244083746986, |
|
"kl": 0.004852294921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1089, |
|
"reward": 0.5709711313247681, |
|
"reward_std": 0.18112631142139435, |
|
"rewards/length_reward": 0.058035701513290405, |
|
"rewards/similarity_reward": 0.5129354596138, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 298.62054443359375, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.861900790561817, |
|
"kl": 0.00567626953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1037, |
|
"reward": 0.5129757523536682, |
|
"reward_std": 0.21154648065567017, |
|
"rewards/length_reward": 0.05401784926652908, |
|
"rewards/similarity_reward": 0.45895785093307495, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 243.4241180419922, |
|
"epoch": 0.152, |
|
"grad_norm": 0.928994891862699, |
|
"kl": 0.004241943359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0328, |
|
"reward": 0.6340307593345642, |
|
"reward_std": 0.16285859048366547, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.56840580701828, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 293.0669860839844, |
|
"epoch": 0.15466666666666667, |
|
"grad_norm": 0.89777989101008, |
|
"kl": 0.00579833984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0313, |
|
"reward": 0.5502158999443054, |
|
"reward_std": 0.1914074867963791, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.48503735661506653, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 348.3973388671875, |
|
"epoch": 0.15733333333333333, |
|
"grad_norm": 0.830581459672137, |
|
"kl": 0.005706787109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1262, |
|
"reward": 0.5427281260490417, |
|
"reward_std": 0.18273915350437164, |
|
"rewards/length_reward": 0.04508928582072258, |
|
"rewards/similarity_reward": 0.4976387917995453, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 300.43304443359375, |
|
"epoch": 0.16, |
|
"grad_norm": 0.7841145484798535, |
|
"kl": 0.007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0356, |
|
"reward": 0.6150097846984863, |
|
"reward_std": 0.15837538242340088, |
|
"rewards/length_reward": 0.06383926421403885, |
|
"rewards/similarity_reward": 0.5511705279350281, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 291.64288330078125, |
|
"epoch": 0.16266666666666665, |
|
"grad_norm": 0.9759946887460155, |
|
"kl": 0.005645751953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1347, |
|
"reward": 0.6720048785209656, |
|
"reward_std": 0.16562286019325256, |
|
"rewards/length_reward": 0.06428569555282593, |
|
"rewards/similarity_reward": 0.6077191233634949, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 300.3973388671875, |
|
"epoch": 0.16533333333333333, |
|
"grad_norm": 0.8353754395282778, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1508, |
|
"reward": 0.6174642443656921, |
|
"reward_std": 0.1775916963815689, |
|
"rewards/length_reward": 0.05848212540149689, |
|
"rewards/similarity_reward": 0.5589820742607117, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 299.0, |
|
"epoch": 0.168, |
|
"grad_norm": 0.8434412806636016, |
|
"kl": 0.0068359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0787, |
|
"reward": 0.5795109272003174, |
|
"reward_std": 0.18212977051734924, |
|
"rewards/length_reward": 0.056249987334012985, |
|
"rewards/similarity_reward": 0.5232609510421753, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 314.3883972167969, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 1.1818234014256608, |
|
"kl": 0.005706787109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2756, |
|
"reward": 0.5499185919761658, |
|
"reward_std": 0.22555634379386902, |
|
"rewards/length_reward": 0.04776785522699356, |
|
"rewards/similarity_reward": 0.5021507143974304, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 259.1785888671875, |
|
"epoch": 0.17333333333333334, |
|
"grad_norm": 0.9529921486665629, |
|
"kl": 0.006439208984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0704, |
|
"reward": 0.5430376529693604, |
|
"reward_std": 0.2042228877544403, |
|
"rewards/length_reward": 0.052232131361961365, |
|
"rewards/similarity_reward": 0.4908054769039154, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 258.0714416503906, |
|
"epoch": 0.176, |
|
"grad_norm": 1.1488507934693044, |
|
"kl": 0.006805419921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1057, |
|
"reward": 0.570214033126831, |
|
"reward_std": 0.160283625125885, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.505035400390625, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 330.90179443359375, |
|
"epoch": 0.17866666666666667, |
|
"grad_norm": 0.912379625363708, |
|
"kl": 0.0062255859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1291, |
|
"reward": 0.49484553933143616, |
|
"reward_std": 0.21234968304634094, |
|
"rewards/length_reward": 0.04553570970892906, |
|
"rewards/similarity_reward": 0.4493098556995392, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 266.46429443359375, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.9382639131370187, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0685, |
|
"reward": 0.5787621736526489, |
|
"reward_std": 0.17865508794784546, |
|
"rewards/length_reward": 0.054464273154735565, |
|
"rewards/similarity_reward": 0.5242978930473328, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 285.25, |
|
"epoch": 0.184, |
|
"grad_norm": 0.8385679542137942, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.046, |
|
"reward": 0.6689252257347107, |
|
"reward_std": 0.16466915607452393, |
|
"rewards/length_reward": 0.06651783734560013, |
|
"rewards/similarity_reward": 0.6024073958396912, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 247.68751525878906, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 1.01200025847724, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1382, |
|
"reward": 0.4780524969100952, |
|
"reward_std": 0.19645950198173523, |
|
"rewards/length_reward": 0.04196428507566452, |
|
"rewards/similarity_reward": 0.4360882043838501, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 307.24554443359375, |
|
"epoch": 0.18933333333333333, |
|
"grad_norm": 0.8185082695628789, |
|
"kl": 0.00787353515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0749, |
|
"reward": 0.5303549766540527, |
|
"reward_std": 0.1896388977766037, |
|
"rewards/length_reward": 0.056696414947509766, |
|
"rewards/similarity_reward": 0.4736584722995758, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 329.15179443359375, |
|
"epoch": 0.192, |
|
"grad_norm": 0.8562549539520792, |
|
"kl": 0.00823974609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1642, |
|
"reward": 0.5008990168571472, |
|
"reward_std": 0.17187656462192535, |
|
"rewards/length_reward": 0.050892848521471024, |
|
"rewards/similarity_reward": 0.4500061273574829, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 253.59376525878906, |
|
"epoch": 0.19466666666666665, |
|
"grad_norm": 0.8806238339574037, |
|
"kl": 0.006591796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1082, |
|
"reward": 0.7047773003578186, |
|
"reward_std": 0.12662379443645477, |
|
"rewards/length_reward": 0.07232140004634857, |
|
"rewards/similarity_reward": 0.6324558258056641, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 302.7321472167969, |
|
"epoch": 0.19733333333333333, |
|
"grad_norm": 0.888373625390179, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0384, |
|
"reward": 0.5046581625938416, |
|
"reward_std": 0.18071489036083221, |
|
"rewards/length_reward": 0.0491071380674839, |
|
"rewards/similarity_reward": 0.45555105805397034, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 309.0982360839844, |
|
"epoch": 0.2, |
|
"grad_norm": 0.8352994571315709, |
|
"kl": 0.0081787109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1895, |
|
"reward": 0.6111252903938293, |
|
"reward_std": 0.19863885641098022, |
|
"rewards/length_reward": 0.054464273154735565, |
|
"rewards/similarity_reward": 0.5566610097885132, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 255.25001525878906, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 1.0786021298964794, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0384, |
|
"reward": 0.5118966102600098, |
|
"reward_std": 0.1661101132631302, |
|
"rewards/length_reward": 0.05223213508725166, |
|
"rewards/similarity_reward": 0.4596644341945648, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 299.92413330078125, |
|
"epoch": 0.20533333333333334, |
|
"grad_norm": 0.887828324089484, |
|
"kl": 0.0128173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1341, |
|
"reward": 0.5058793425559998, |
|
"reward_std": 0.2038315385580063, |
|
"rewards/length_reward": 0.05044642463326454, |
|
"rewards/similarity_reward": 0.45543283224105835, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 300.26788330078125, |
|
"epoch": 0.208, |
|
"grad_norm": 0.960422578229874, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2173, |
|
"reward": 0.5535677075386047, |
|
"reward_std": 0.16259299218654633, |
|
"rewards/length_reward": 0.06205355376005173, |
|
"rewards/similarity_reward": 0.49151411652565, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 256.21875, |
|
"epoch": 0.21066666666666667, |
|
"grad_norm": 0.9394611442130687, |
|
"kl": 0.01171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.031, |
|
"reward": 0.6236703991889954, |
|
"reward_std": 0.16783180832862854, |
|
"rewards/length_reward": 0.05982141196727753, |
|
"rewards/similarity_reward": 0.5638489127159119, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 279.9419860839844, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 1.0860808591863038, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2206, |
|
"reward": 0.5311146974563599, |
|
"reward_std": 0.20672693848609924, |
|
"rewards/length_reward": 0.052678562700748444, |
|
"rewards/similarity_reward": 0.47843608260154724, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 302.7008972167969, |
|
"epoch": 0.216, |
|
"grad_norm": 0.7695067843560371, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0387, |
|
"reward": 0.5848192572593689, |
|
"reward_std": 0.20639710128307343, |
|
"rewards/length_reward": 0.060267843306064606, |
|
"rewards/similarity_reward": 0.5245514512062073, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 247.18751525878906, |
|
"epoch": 0.21866666666666668, |
|
"grad_norm": 1.0172061111487714, |
|
"kl": 0.009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0528, |
|
"reward": 0.6127398610115051, |
|
"reward_std": 0.13182979822158813, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5471147894859314, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 303.40625, |
|
"epoch": 0.22133333333333333, |
|
"grad_norm": 0.7610369219853271, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1034, |
|
"reward": 0.5650977492332458, |
|
"reward_std": 0.16646917164325714, |
|
"rewards/length_reward": 0.056696418672800064, |
|
"rewards/similarity_reward": 0.5084013342857361, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 296.625, |
|
"epoch": 0.224, |
|
"grad_norm": 0.8819878638905205, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0509, |
|
"reward": 0.6824392676353455, |
|
"reward_std": 0.13198219239711761, |
|
"rewards/length_reward": 0.07678568363189697, |
|
"rewards/similarity_reward": 0.6056535243988037, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 264.4375, |
|
"epoch": 0.22666666666666666, |
|
"grad_norm": 0.9860703009968039, |
|
"kl": 0.0103759765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1082, |
|
"reward": 0.5452346205711365, |
|
"reward_std": 0.18002980947494507, |
|
"rewards/length_reward": 0.055357132107019424, |
|
"rewards/similarity_reward": 0.48987752199172974, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 275.1696472167969, |
|
"epoch": 0.22933333333333333, |
|
"grad_norm": 0.876007599982239, |
|
"kl": 0.007720947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1075, |
|
"reward": 0.5654360055923462, |
|
"reward_std": 0.15497317910194397, |
|
"rewards/length_reward": 0.050892848521471024, |
|
"rewards/similarity_reward": 0.5145430564880371, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 268.3883972167969, |
|
"epoch": 0.232, |
|
"grad_norm": 0.9877196366166759, |
|
"kl": 0.00872802734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0664, |
|
"reward": 0.6144102811813354, |
|
"reward_std": 0.1374298632144928, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5469995141029358, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 339.67413330078125, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.6868950012188707, |
|
"kl": 0.006072998046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0478, |
|
"reward": 0.6562062501907349, |
|
"reward_std": 0.1523490995168686, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5843312740325928, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 262.4598388671875, |
|
"epoch": 0.23733333333333334, |
|
"grad_norm": 0.8224596138062096, |
|
"kl": 0.006378173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.101, |
|
"reward": 0.6333271265029907, |
|
"reward_std": 0.170791357755661, |
|
"rewards/length_reward": 0.06785711646080017, |
|
"rewards/similarity_reward": 0.5654700398445129, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 256.42413330078125, |
|
"epoch": 0.24, |
|
"grad_norm": 0.9173507116779652, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0437, |
|
"reward": 0.5871028900146484, |
|
"reward_std": 0.16378919780254364, |
|
"rewards/length_reward": 0.04732142388820648, |
|
"rewards/similarity_reward": 0.5397815108299255, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 350.6562805175781, |
|
"epoch": 0.24266666666666667, |
|
"grad_norm": 0.7989839039096065, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0614, |
|
"reward": 0.5199065208435059, |
|
"reward_std": 0.20764127373695374, |
|
"rewards/length_reward": 0.05044642463326454, |
|
"rewards/similarity_reward": 0.4694600999355316, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 249.0044708251953, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.9605121844841826, |
|
"kl": 0.01123046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.036, |
|
"reward": 0.6044343113899231, |
|
"reward_std": 0.164906844496727, |
|
"rewards/length_reward": 0.057589270174503326, |
|
"rewards/similarity_reward": 0.5468449592590332, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 312.2232360839844, |
|
"epoch": 0.248, |
|
"grad_norm": 0.836379585280954, |
|
"kl": 0.007781982421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0948, |
|
"reward": 0.5776776075363159, |
|
"reward_std": 0.15271225571632385, |
|
"rewards/length_reward": 0.055803555995225906, |
|
"rewards/similarity_reward": 0.5218740701675415, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 311.3482360839844, |
|
"epoch": 0.25066666666666665, |
|
"grad_norm": 0.7945534599494852, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1441, |
|
"reward": 0.6111860275268555, |
|
"reward_std": 0.13805179297924042, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5442216992378235, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 234.93751525878906, |
|
"epoch": 0.25333333333333335, |
|
"grad_norm": 1.2028189171950667, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1766, |
|
"reward": 0.5961614847183228, |
|
"reward_std": 0.16394107043743134, |
|
"rewards/length_reward": 0.05982141196727753, |
|
"rewards/similarity_reward": 0.5363399982452393, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 305.6964416503906, |
|
"epoch": 0.256, |
|
"grad_norm": 0.8067577376172387, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0671, |
|
"reward": 0.656367301940918, |
|
"reward_std": 0.12278923392295837, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.5876173377037048, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 327.71429443359375, |
|
"epoch": 0.25866666666666666, |
|
"grad_norm": 0.770922327161602, |
|
"kl": 0.0064697265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0279, |
|
"reward": 0.5429174304008484, |
|
"reward_std": 0.16670171916484833, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.47729235887527466, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 256.62054443359375, |
|
"epoch": 0.2613333333333333, |
|
"grad_norm": 1.051628532134925, |
|
"kl": 0.0072021484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0872, |
|
"reward": 0.5515283346176147, |
|
"reward_std": 0.17894278466701508, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.48947471380233765, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 268.59375, |
|
"epoch": 0.264, |
|
"grad_norm": 0.8240697442290599, |
|
"kl": 0.008544921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0672, |
|
"reward": 0.6131307482719421, |
|
"reward_std": 0.17423538863658905, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.5510770678520203, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 257.37054443359375, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 131.12733526048441, |
|
"kl": 0.1923828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0817, |
|
"reward": 0.6513006687164307, |
|
"reward_std": 0.14835356175899506, |
|
"rewards/length_reward": 0.06830354779958725, |
|
"rewards/similarity_reward": 0.5829971432685852, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 307.1473388671875, |
|
"epoch": 0.2693333333333333, |
|
"grad_norm": 0.837570841896231, |
|
"kl": 0.0054931640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0194, |
|
"reward": 0.6764991283416748, |
|
"reward_std": 0.1323472112417221, |
|
"rewards/length_reward": 0.07410712540149689, |
|
"rewards/similarity_reward": 0.6023918986320496, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 265.5044860839844, |
|
"epoch": 0.272, |
|
"grad_norm": 0.7713890189466205, |
|
"kl": 0.00830078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0212, |
|
"reward": 0.6779460310935974, |
|
"reward_std": 0.12496771663427353, |
|
"rewards/length_reward": 0.07276783138513565, |
|
"rewards/similarity_reward": 0.6051782369613647, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 280.46429443359375, |
|
"epoch": 0.27466666666666667, |
|
"grad_norm": 0.8825358886169125, |
|
"kl": 0.008056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1086, |
|
"reward": 0.5848525166511536, |
|
"reward_std": 0.11382713168859482, |
|
"rewards/length_reward": 0.06071426719427109, |
|
"rewards/similarity_reward": 0.5241381525993347, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 320.3571472167969, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.8471202564701443, |
|
"kl": 0.0069580078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0388, |
|
"reward": 0.6058804988861084, |
|
"reward_std": 0.15757833421230316, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5407018661499023, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 287.0848388671875, |
|
"epoch": 0.28, |
|
"grad_norm": 0.885756988877436, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.089, |
|
"reward": 0.6150888204574585, |
|
"reward_std": 0.1344638168811798, |
|
"rewards/length_reward": 0.057589273899793625, |
|
"rewards/similarity_reward": 0.5574995875358582, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 293.1339416503906, |
|
"epoch": 0.2826666666666667, |
|
"grad_norm": 0.9299759085944364, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0428, |
|
"reward": 0.560218334197998, |
|
"reward_std": 0.2031860500574112, |
|
"rewards/length_reward": 0.058035701513290405, |
|
"rewards/similarity_reward": 0.50218266248703, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 292.75, |
|
"epoch": 0.2853333333333333, |
|
"grad_norm": 0.8374882655316597, |
|
"kl": 0.00848388671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0683, |
|
"reward": 0.6374148726463318, |
|
"reward_std": 0.15000107884407043, |
|
"rewards/length_reward": 0.06919640302658081, |
|
"rewards/similarity_reward": 0.5682184100151062, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 263.3571472167969, |
|
"epoch": 0.288, |
|
"grad_norm": 1.0433586800088648, |
|
"kl": 0.0078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0913, |
|
"reward": 0.5456939935684204, |
|
"reward_std": 0.1411367952823639, |
|
"rewards/length_reward": 0.056696414947509766, |
|
"rewards/similarity_reward": 0.48899757862091064, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 263.9107360839844, |
|
"epoch": 0.2906666666666667, |
|
"grad_norm": 0.9650468316923807, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.119, |
|
"reward": 0.6117041110992432, |
|
"reward_std": 0.13907021284103394, |
|
"rewards/length_reward": 0.06205355376005173, |
|
"rewards/similarity_reward": 0.5496505498886108, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 235.0848388671875, |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.9205848620805003, |
|
"kl": 0.009521484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.006, |
|
"reward": 0.5724084377288818, |
|
"reward_std": 0.12264589220285416, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5027655959129333, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 280.9419860839844, |
|
"epoch": 0.296, |
|
"grad_norm": 0.8242814043162366, |
|
"kl": 0.00836181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1358, |
|
"reward": 0.6025325059890747, |
|
"reward_std": 0.13276302814483643, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5351218581199646, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 261.55804443359375, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.8979430693793525, |
|
"kl": 0.0145263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0898, |
|
"reward": 0.5723416805267334, |
|
"reward_std": 0.11434419453144073, |
|
"rewards/length_reward": 0.06428569555282593, |
|
"rewards/similarity_reward": 0.5080559253692627, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 273.8482360839844, |
|
"epoch": 0.30133333333333334, |
|
"grad_norm": 0.8994640436743108, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0884, |
|
"reward": 0.6239952445030212, |
|
"reward_std": 0.15253794193267822, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5588168501853943, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 265.29913330078125, |
|
"epoch": 0.304, |
|
"grad_norm": 0.8511084352415984, |
|
"kl": 0.015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0217, |
|
"reward": 0.5796217322349548, |
|
"reward_std": 0.16319997608661652, |
|
"rewards/length_reward": 0.06785712391138077, |
|
"rewards/similarity_reward": 0.511764645576477, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 227.94644165039062, |
|
"epoch": 0.30666666666666664, |
|
"grad_norm": 1.0504493112285522, |
|
"kl": 0.013427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.092, |
|
"reward": 0.611219584941864, |
|
"reward_std": 0.1473054587841034, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5455944538116455, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 312.2008972167969, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.8491847097599164, |
|
"kl": 0.009033203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0818, |
|
"reward": 0.5205245018005371, |
|
"reward_std": 0.18279042840003967, |
|
"rewards/length_reward": 0.055803555995225906, |
|
"rewards/similarity_reward": 0.46472102403640747, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 285.4151916503906, |
|
"epoch": 0.312, |
|
"grad_norm": 0.948136046714223, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1402, |
|
"reward": 0.5244685411453247, |
|
"reward_std": 0.1221655011177063, |
|
"rewards/length_reward": 0.058482129126787186, |
|
"rewards/similarity_reward": 0.4659864008426666, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 291.8973388671875, |
|
"epoch": 0.31466666666666665, |
|
"grad_norm": 0.8327937599541795, |
|
"kl": 0.00823974609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0416, |
|
"reward": 0.6440633535385132, |
|
"reward_std": 0.14113157987594604, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5770990252494812, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 276.3035888671875, |
|
"epoch": 0.31733333333333336, |
|
"grad_norm": 1.0522569506493296, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2508, |
|
"reward": 0.501847505569458, |
|
"reward_std": 0.16830717027187347, |
|
"rewards/length_reward": 0.051785703748464584, |
|
"rewards/similarity_reward": 0.4500618278980255, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 231.46876525878906, |
|
"epoch": 0.32, |
|
"grad_norm": 1.0564887037389263, |
|
"kl": 0.01544189453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1258, |
|
"reward": 0.5212039351463318, |
|
"reward_std": 0.1660899519920349, |
|
"rewards/length_reward": 0.053124986588954926, |
|
"rewards/similarity_reward": 0.46807900071144104, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 274.55804443359375, |
|
"epoch": 0.32266666666666666, |
|
"grad_norm": 0.892927807825851, |
|
"kl": 0.006988525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0887, |
|
"reward": 0.5594373941421509, |
|
"reward_std": 0.13949331641197205, |
|
"rewards/length_reward": 0.06651782989501953, |
|
"rewards/similarity_reward": 0.49291953444480896, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 309.36163330078125, |
|
"epoch": 0.3253333333333333, |
|
"grad_norm": 0.8555023561165935, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1512, |
|
"reward": 0.5939301252365112, |
|
"reward_std": 0.16705819964408875, |
|
"rewards/length_reward": 0.061160698533058167, |
|
"rewards/similarity_reward": 0.5327693819999695, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 264.3973388671875, |
|
"epoch": 0.328, |
|
"grad_norm": 0.9083757893001095, |
|
"kl": 0.00775146484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1449, |
|
"reward": 0.6276513934135437, |
|
"reward_std": 0.14763577282428741, |
|
"rewards/length_reward": 0.06160712614655495, |
|
"rewards/similarity_reward": 0.5660442113876343, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 267.96875, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.9115344595637944, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0998, |
|
"reward": 0.6213053464889526, |
|
"reward_std": 0.16126255691051483, |
|
"rewards/length_reward": 0.05848212540149689, |
|
"rewards/similarity_reward": 0.5628232359886169, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 275.2276916503906, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.8933631069209625, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0232, |
|
"reward": 0.6394702792167664, |
|
"reward_std": 0.17729975283145905, |
|
"rewards/length_reward": 0.061160698533058167, |
|
"rewards/similarity_reward": 0.5783094763755798, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 280.0848388671875, |
|
"epoch": 0.336, |
|
"grad_norm": 0.9959640208447441, |
|
"kl": 0.0203857421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.066, |
|
"reward": 0.5415524840354919, |
|
"reward_std": 0.18598264455795288, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.47414183616638184, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 294.37054443359375, |
|
"epoch": 0.33866666666666667, |
|
"grad_norm": 0.944172883238238, |
|
"kl": 0.0078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0825, |
|
"reward": 0.6250823736190796, |
|
"reward_std": 0.1783696711063385, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5581181049346924, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 252.44644165039062, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.8622018142523461, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0136, |
|
"reward": 0.5941780209541321, |
|
"reward_std": 0.1297590583562851, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5289995074272156, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 275.1696472167969, |
|
"epoch": 0.344, |
|
"grad_norm": 0.997627840820869, |
|
"kl": 0.00933837890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1634, |
|
"reward": 0.5641010999679565, |
|
"reward_std": 0.13691328465938568, |
|
"rewards/length_reward": 0.057142842561006546, |
|
"rewards/similarity_reward": 0.5069582462310791, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 291.78125, |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 0.9141566771741596, |
|
"kl": 0.00885009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.233, |
|
"reward": 0.5903910398483276, |
|
"reward_std": 0.14815300703048706, |
|
"rewards/length_reward": 0.06249998137354851, |
|
"rewards/similarity_reward": 0.5278909802436829, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 294.58929443359375, |
|
"epoch": 0.34933333333333333, |
|
"grad_norm": 0.9307314460988763, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0979, |
|
"reward": 0.5972681045532227, |
|
"reward_std": 0.16272346675395966, |
|
"rewards/length_reward": 0.061160698533058167, |
|
"rewards/similarity_reward": 0.5361074805259705, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 240.2232208251953, |
|
"epoch": 0.352, |
|
"grad_norm": 0.9959808951952684, |
|
"kl": 0.01092529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0428, |
|
"reward": 0.6472880244255066, |
|
"reward_std": 0.15316687524318695, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5816629528999329, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 237.1607208251953, |
|
"epoch": 0.3546666666666667, |
|
"grad_norm": 0.8515521500324365, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0683, |
|
"reward": 0.631507396697998, |
|
"reward_std": 0.15118519961833954, |
|
"rewards/length_reward": 0.06428569555282593, |
|
"rewards/similarity_reward": 0.5672216415405273, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 307.5, |
|
"epoch": 0.35733333333333334, |
|
"grad_norm": 0.719487956498844, |
|
"kl": 0.0059814453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0352, |
|
"reward": 0.6587818264961243, |
|
"reward_std": 0.14100806415081024, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5869067907333374, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 305.2589416503906, |
|
"epoch": 0.36, |
|
"grad_norm": 1.0877799003245066, |
|
"kl": 0.0169677734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0955, |
|
"reward": 0.5947835445404053, |
|
"reward_std": 0.12429028749465942, |
|
"rewards/length_reward": 0.06071426719427109, |
|
"rewards/similarity_reward": 0.5340692400932312, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 223.81251525878906, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 1.056046449389469, |
|
"kl": 0.00665283203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0631, |
|
"reward": 0.6106573343276978, |
|
"reward_std": 0.13982126116752625, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5454786419868469, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 302.52679443359375, |
|
"epoch": 0.36533333333333334, |
|
"grad_norm": 0.8473080240759754, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.121, |
|
"reward": 0.5767890810966492, |
|
"reward_std": 0.1566361039876938, |
|
"rewards/length_reward": 0.06071426719427109, |
|
"rewards/similarity_reward": 0.5160747766494751, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 294.9285888671875, |
|
"epoch": 0.368, |
|
"grad_norm": 0.8165111113975745, |
|
"kl": 0.00677490234375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0013, |
|
"reward": 0.6466237902641296, |
|
"reward_std": 0.11831733584403992, |
|
"rewards/length_reward": 0.07142855226993561, |
|
"rewards/similarity_reward": 0.5751951336860657, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 297.4107360839844, |
|
"epoch": 0.37066666666666664, |
|
"grad_norm": 0.8905760527062927, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0894, |
|
"reward": 0.6628533601760864, |
|
"reward_std": 0.10040118545293808, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5994604229927063, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 282.9508972167969, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.9489224311946435, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0747, |
|
"reward": 0.5422684550285339, |
|
"reward_std": 0.18701300024986267, |
|
"rewards/length_reward": 0.055357132107019424, |
|
"rewards/similarity_reward": 0.4869112968444824, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 286.9419860839844, |
|
"epoch": 0.376, |
|
"grad_norm": 1.5325112007084152, |
|
"kl": 0.0205078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0749, |
|
"reward": 0.6462909579277039, |
|
"reward_std": 0.1564369648694992, |
|
"rewards/length_reward": 0.06919640302658081, |
|
"rewards/similarity_reward": 0.5770944356918335, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 233.50001525878906, |
|
"epoch": 0.37866666666666665, |
|
"grad_norm": 1.1124358172264561, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1038, |
|
"reward": 0.6390895247459412, |
|
"reward_std": 0.11253345012664795, |
|
"rewards/length_reward": 0.05982141196727753, |
|
"rewards/similarity_reward": 0.5792680978775024, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 308.0401916503906, |
|
"epoch": 0.38133333333333336, |
|
"grad_norm": 0.8437782349764958, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0439, |
|
"reward": 0.6860373616218567, |
|
"reward_std": 0.08081385493278503, |
|
"rewards/length_reward": 0.07901783287525177, |
|
"rewards/similarity_reward": 0.6070196032524109, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 274.54913330078125, |
|
"epoch": 0.384, |
|
"grad_norm": 0.9174096594145076, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0938, |
|
"reward": 0.6485283970832825, |
|
"reward_std": 0.1347315013408661, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5739747881889343, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 257.5535888671875, |
|
"epoch": 0.38666666666666666, |
|
"grad_norm": 0.9244068415305253, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.056, |
|
"reward": 0.6137626767158508, |
|
"reward_std": 0.13303914666175842, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5481376647949219, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 255.62501525878906, |
|
"epoch": 0.3893333333333333, |
|
"grad_norm": 0.8596867360926773, |
|
"kl": 0.011474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0491, |
|
"reward": 0.6294366717338562, |
|
"reward_std": 0.13696229457855225, |
|
"rewards/length_reward": 0.0741071105003357, |
|
"rewards/similarity_reward": 0.5553295016288757, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 321.3258972167969, |
|
"epoch": 0.392, |
|
"grad_norm": 0.7611409673177786, |
|
"kl": 0.01171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0851, |
|
"reward": 0.6082260012626648, |
|
"reward_std": 0.11479248106479645, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5385831594467163, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 256.5223388671875, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 1.017158083005092, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0583, |
|
"reward": 0.5373588800430298, |
|
"reward_std": 0.1524331271648407, |
|
"rewards/length_reward": 0.060267843306064606, |
|
"rewards/similarity_reward": 0.477090984582901, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 247.34376525878906, |
|
"epoch": 0.3973333333333333, |
|
"grad_norm": 1.100826516879252, |
|
"kl": 0.011474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1543, |
|
"reward": 0.6250883340835571, |
|
"reward_std": 0.1562027931213379, |
|
"rewards/length_reward": 0.06785711646080017, |
|
"rewards/similarity_reward": 0.5572311878204346, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 271.61163330078125, |
|
"epoch": 0.4, |
|
"grad_norm": 0.85368826964619, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0525, |
|
"reward": 0.5830017924308777, |
|
"reward_std": 0.1454438865184784, |
|
"rewards/length_reward": 0.06160712614655495, |
|
"rewards/similarity_reward": 0.5213946104049683, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 274.40179443359375, |
|
"epoch": 0.4026666666666667, |
|
"grad_norm": 0.9117887687890662, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0237, |
|
"reward": 0.6028919219970703, |
|
"reward_std": 0.15602950751781464, |
|
"rewards/length_reward": 0.06651782989501953, |
|
"rewards/similarity_reward": 0.536374032497406, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 269.2008972167969, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 0.8208276830838094, |
|
"kl": 0.010009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.014, |
|
"reward": 0.665276825428009, |
|
"reward_std": 0.1257169246673584, |
|
"rewards/length_reward": 0.08035711199045181, |
|
"rewards/similarity_reward": 0.5849196910858154, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 264.4732360839844, |
|
"epoch": 0.408, |
|
"grad_norm": 0.9062154210012625, |
|
"kl": 0.013427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0701, |
|
"reward": 0.6374659538269043, |
|
"reward_std": 0.1712835431098938, |
|
"rewards/length_reward": 0.06830354779958725, |
|
"rewards/similarity_reward": 0.5691623091697693, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 325.1071472167969, |
|
"epoch": 0.4106666666666667, |
|
"grad_norm": 0.8808738957904011, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1136, |
|
"reward": 0.6423187255859375, |
|
"reward_std": 0.1033661887049675, |
|
"rewards/length_reward": 0.06383926421403885, |
|
"rewards/similarity_reward": 0.578479528427124, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 281.3035888671875, |
|
"epoch": 0.41333333333333333, |
|
"grad_norm": 0.8449149570191646, |
|
"kl": 0.012451171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0893, |
|
"reward": 0.6530374884605408, |
|
"reward_std": 0.12996266782283783, |
|
"rewards/length_reward": 0.06116069480776787, |
|
"rewards/similarity_reward": 0.5918766856193542, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 297.02679443359375, |
|
"epoch": 0.416, |
|
"grad_norm": 0.8274002453741087, |
|
"kl": 0.008056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0593, |
|
"reward": 0.7200801372528076, |
|
"reward_std": 0.12102329730987549, |
|
"rewards/length_reward": 0.07901783287525177, |
|
"rewards/similarity_reward": 0.6410622596740723, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 245.7991180419922, |
|
"epoch": 0.4186666666666667, |
|
"grad_norm": 1.0463728826517769, |
|
"kl": 0.0145263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1192, |
|
"reward": 0.6804168820381165, |
|
"reward_std": 0.1330643892288208, |
|
"rewards/length_reward": 0.0741071105003357, |
|
"rewards/similarity_reward": 0.6063097715377808, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 272.2410888671875, |
|
"epoch": 0.42133333333333334, |
|
"grad_norm": 0.8424445256337731, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0411, |
|
"reward": 0.6152999401092529, |
|
"reward_std": 0.18344512581825256, |
|
"rewards/length_reward": 0.06428569555282593, |
|
"rewards/similarity_reward": 0.5510141849517822, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 283.90625, |
|
"epoch": 0.424, |
|
"grad_norm": 0.944378171141832, |
|
"kl": 0.0128173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0973, |
|
"reward": 0.6098131537437439, |
|
"reward_std": 0.14866778254508972, |
|
"rewards/length_reward": 0.05982141196727753, |
|
"rewards/similarity_reward": 0.5499916672706604, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 251.37054443359375, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.867614538281579, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0069, |
|
"reward": 0.6304399371147156, |
|
"reward_std": 0.12713229656219482, |
|
"rewards/length_reward": 0.07276783138513565, |
|
"rewards/similarity_reward": 0.5576720237731934, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 254.6607208251953, |
|
"epoch": 0.42933333333333334, |
|
"grad_norm": 1.0006767726840313, |
|
"kl": 0.01226806640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0167, |
|
"reward": 0.6183627843856812, |
|
"reward_std": 0.12064019590616226, |
|
"rewards/length_reward": 0.057589273899793625, |
|
"rewards/similarity_reward": 0.560773491859436, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 278.9821472167969, |
|
"epoch": 0.432, |
|
"grad_norm": 0.7754115998151179, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0624, |
|
"reward": 0.6279152035713196, |
|
"reward_std": 0.1739441603422165, |
|
"rewards/length_reward": 0.06651783734560013, |
|
"rewards/similarity_reward": 0.5613973140716553, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 261.51788330078125, |
|
"epoch": 0.43466666666666665, |
|
"grad_norm": 0.9381033539462706, |
|
"kl": 0.011962890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1291, |
|
"reward": 0.6449581384658813, |
|
"reward_std": 0.13920167088508606, |
|
"rewards/length_reward": 0.07589282840490341, |
|
"rewards/similarity_reward": 0.569065272808075, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 311.8169860839844, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 0.959860639301872, |
|
"kl": 0.0084228515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0207, |
|
"reward": 0.6583375930786133, |
|
"reward_std": 0.1428201049566269, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5882483720779419, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 261.5758972167969, |
|
"epoch": 0.44, |
|
"grad_norm": 0.8574273981386299, |
|
"kl": 0.009033203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1177, |
|
"reward": 0.6945616006851196, |
|
"reward_std": 0.12529133260250092, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.6164366006851196, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 281.45538330078125, |
|
"epoch": 0.44266666666666665, |
|
"grad_norm": 0.8588941157426009, |
|
"kl": 0.01220703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0615, |
|
"reward": 0.7305233478546143, |
|
"reward_std": 0.12011624127626419, |
|
"rewards/length_reward": 0.0808035358786583, |
|
"rewards/similarity_reward": 0.6497198343276978, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 259.64288330078125, |
|
"epoch": 0.44533333333333336, |
|
"grad_norm": 1.137909715907424, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2205, |
|
"reward": 0.5699202418327332, |
|
"reward_std": 0.1761079728603363, |
|
"rewards/length_reward": 0.06294640898704529, |
|
"rewards/similarity_reward": 0.5069737434387207, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 294.875, |
|
"epoch": 0.448, |
|
"grad_norm": 1.0016080727138688, |
|
"kl": 0.01171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1368, |
|
"reward": 0.6165792942047119, |
|
"reward_std": 0.12841306626796722, |
|
"rewards/length_reward": 0.064732126891613, |
|
"rewards/similarity_reward": 0.5518471002578735, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 308.37054443359375, |
|
"epoch": 0.45066666666666666, |
|
"grad_norm": 1.2557098703938632, |
|
"kl": 0.0162353515625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0777, |
|
"reward": 0.6842705607414246, |
|
"reward_std": 0.09632124751806259, |
|
"rewards/length_reward": 0.08124997466802597, |
|
"rewards/similarity_reward": 0.6030204892158508, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 281.4821472167969, |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.8773655821391068, |
|
"kl": 0.009033203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0602, |
|
"reward": 0.6346014738082886, |
|
"reward_std": 0.14319205284118652, |
|
"rewards/length_reward": 0.06116069480776787, |
|
"rewards/similarity_reward": 0.5734407901763916, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 281.8125, |
|
"epoch": 0.456, |
|
"grad_norm": 0.9145529663215465, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1688, |
|
"reward": 0.5963006019592285, |
|
"reward_std": 0.15331213176250458, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5293362736701965, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 280.8571472167969, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 3.878703302716922, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0219, |
|
"reward": 0.6730906963348389, |
|
"reward_std": 0.11424030363559723, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.604340672492981, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 277.9910888671875, |
|
"epoch": 0.4613333333333333, |
|
"grad_norm": 1.0003205251640386, |
|
"kl": 0.01141357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1181, |
|
"reward": 0.6080780029296875, |
|
"reward_std": 0.14715011417865753, |
|
"rewards/length_reward": 0.06651782989501953, |
|
"rewards/similarity_reward": 0.541560173034668, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 257.0133972167969, |
|
"epoch": 0.464, |
|
"grad_norm": 0.8626427313272481, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0332, |
|
"reward": 0.6258962154388428, |
|
"reward_std": 0.13628825545310974, |
|
"rewards/length_reward": 0.07098211348056793, |
|
"rewards/similarity_reward": 0.554914116859436, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 267.1294860839844, |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 0.9448046232693003, |
|
"kl": 0.0128173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0686, |
|
"reward": 0.582805871963501, |
|
"reward_std": 0.13543623685836792, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5082523226737976, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 230.3169708251953, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 1.0336066582105279, |
|
"kl": 0.01611328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0188, |
|
"reward": 0.5923266410827637, |
|
"reward_std": 0.15992802381515503, |
|
"rewards/length_reward": 0.06607140600681305, |
|
"rewards/similarity_reward": 0.5262552499771118, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 259.2857360839844, |
|
"epoch": 0.472, |
|
"grad_norm": 0.8515404437990851, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0031, |
|
"reward": 0.6366464495658875, |
|
"reward_std": 0.14244325459003448, |
|
"rewards/length_reward": 0.06785711646080017, |
|
"rewards/similarity_reward": 0.5687893033027649, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 269.0535888671875, |
|
"epoch": 0.4746666666666667, |
|
"grad_norm": 1.0508009846238586, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1414, |
|
"reward": 0.6338518857955933, |
|
"reward_std": 0.13359463214874268, |
|
"rewards/length_reward": 0.06651783734560013, |
|
"rewards/similarity_reward": 0.5673341155052185, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 254.2053680419922, |
|
"epoch": 0.47733333333333333, |
|
"grad_norm": 1.0742338846656552, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0226, |
|
"reward": 0.6352322697639465, |
|
"reward_std": 0.16355818510055542, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5633572936058044, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 258.8482360839844, |
|
"epoch": 0.48, |
|
"grad_norm": 1.1021168870169997, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0656, |
|
"reward": 0.594104528427124, |
|
"reward_std": 0.1770821362733841, |
|
"rewards/length_reward": 0.060714274644851685, |
|
"rewards/similarity_reward": 0.5333902835845947, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 256.8125, |
|
"epoch": 0.4826666666666667, |
|
"grad_norm": 0.8501219854036921, |
|
"kl": 0.009033203125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0019, |
|
"reward": 0.6487245559692383, |
|
"reward_std": 0.13405689597129822, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5741709470748901, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 256.02679443359375, |
|
"epoch": 0.48533333333333334, |
|
"grad_norm": 1.0385629776489995, |
|
"kl": 0.012939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0721, |
|
"reward": 0.6005666255950928, |
|
"reward_std": 0.18559977412223816, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5331559181213379, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 245.2723388671875, |
|
"epoch": 0.488, |
|
"grad_norm": 0.9856929099072189, |
|
"kl": 0.0140380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0559, |
|
"reward": 0.6080025434494019, |
|
"reward_std": 0.12059400230646133, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5428239703178406, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 275.40179443359375, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 1.0764117312018395, |
|
"kl": 0.0147705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1746, |
|
"reward": 0.5906988382339478, |
|
"reward_std": 0.13717274367809296, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5273059606552124, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 235.33929443359375, |
|
"epoch": 0.49333333333333335, |
|
"grad_norm": 1.216465109274426, |
|
"kl": 0.015869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0441, |
|
"reward": 0.6832688450813293, |
|
"reward_std": 0.12071473151445389, |
|
"rewards/length_reward": 0.07767853885889053, |
|
"rewards/similarity_reward": 0.6055901646614075, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 271.0401916503906, |
|
"epoch": 0.496, |
|
"grad_norm": 0.9417708264398014, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0103, |
|
"reward": 0.7055503726005554, |
|
"reward_std": 0.09871623665094376, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.6274253129959106, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 288.40179443359375, |
|
"epoch": 0.49866666666666665, |
|
"grad_norm": 0.7904664413572577, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0943, |
|
"reward": 0.6588479280471802, |
|
"reward_std": 0.1498415172100067, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5883120894432068, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 262.2857360839844, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 1.0107893927701763, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0639, |
|
"reward": 0.5783969759941101, |
|
"reward_std": 0.1660866141319275, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5109862685203552, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 243.23214721679688, |
|
"epoch": 0.504, |
|
"grad_norm": 0.9914068826603122, |
|
"kl": 0.0263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0183, |
|
"reward": 0.5762468576431274, |
|
"reward_std": 0.1855197250843048, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5088360905647278, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 279.8973388671875, |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 0.8186202175206256, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0181, |
|
"reward": 0.6954742074012756, |
|
"reward_std": 0.08623984456062317, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.624938428401947, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 257.21429443359375, |
|
"epoch": 0.5093333333333333, |
|
"grad_norm": 0.877876828642467, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0328, |
|
"reward": 0.6462003588676453, |
|
"reward_std": 0.11538383364677429, |
|
"rewards/length_reward": 0.06830354779958725, |
|
"rewards/similarity_reward": 0.577896773815155, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 257.7946472167969, |
|
"epoch": 0.512, |
|
"grad_norm": 0.8857490639900779, |
|
"kl": 0.01214599609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0477, |
|
"reward": 0.6250221133232117, |
|
"reward_std": 0.15633754432201385, |
|
"rewards/length_reward": 0.06830354779958725, |
|
"rewards/similarity_reward": 0.5567185282707214, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 303.4464416503906, |
|
"epoch": 0.5146666666666667, |
|
"grad_norm": 0.8375649728004798, |
|
"kl": 0.00897216796875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0444, |
|
"reward": 0.6938925981521606, |
|
"reward_std": 0.13664484024047852, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.6184461116790771, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 239.02679443359375, |
|
"epoch": 0.5173333333333333, |
|
"grad_norm": 0.9796244769392795, |
|
"kl": 0.0169677734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0277, |
|
"reward": 0.6308580636978149, |
|
"reward_std": 0.09844722598791122, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.5554116368293762, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 220.04019165039062, |
|
"epoch": 0.52, |
|
"grad_norm": 1.1138163852092198, |
|
"kl": 0.0211181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.101, |
|
"reward": 0.5808507204055786, |
|
"reward_std": 0.14026090502738953, |
|
"rewards/length_reward": 0.056696418672800064, |
|
"rewards/similarity_reward": 0.5241542458534241, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 310.6294860839844, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 0.7596161810526226, |
|
"kl": 0.012451171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0671, |
|
"reward": 0.6398200988769531, |
|
"reward_std": 0.16089944541454315, |
|
"rewards/length_reward": 0.07633925974369049, |
|
"rewards/similarity_reward": 0.5634807348251343, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 293.65179443359375, |
|
"epoch": 0.5253333333333333, |
|
"grad_norm": 1.3057397068251875, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0229, |
|
"reward": 0.69722580909729, |
|
"reward_std": 0.10665407031774521, |
|
"rewards/length_reward": 0.07901783287525177, |
|
"rewards/similarity_reward": 0.6182078719139099, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 271.8794860839844, |
|
"epoch": 0.528, |
|
"grad_norm": 0.9454287770215252, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0697, |
|
"reward": 0.6203178763389587, |
|
"reward_std": 0.1754215508699417, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5569249987602234, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 259.3571472167969, |
|
"epoch": 0.5306666666666666, |
|
"grad_norm": 0.8381899247069013, |
|
"kl": 0.011474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0283, |
|
"reward": 0.6383811831474304, |
|
"reward_std": 0.11189709603786469, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.5629347562789917, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 258.0401916503906, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 1.0128439289515407, |
|
"kl": 0.012451171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1465, |
|
"reward": 0.5912656784057617, |
|
"reward_std": 0.14550404250621796, |
|
"rewards/length_reward": 0.058482129126787186, |
|
"rewards/similarity_reward": 0.5327835083007812, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 311.27679443359375, |
|
"epoch": 0.536, |
|
"grad_norm": 0.86779018830801, |
|
"kl": 0.00909423828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0985, |
|
"reward": 0.588790237903595, |
|
"reward_std": 0.140910342335701, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5169152021408081, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 223.9598388671875, |
|
"epoch": 0.5386666666666666, |
|
"grad_norm": 0.9896401672605407, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0605, |
|
"reward": 0.5413497686386108, |
|
"reward_std": 0.13121715188026428, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.4761711657047272, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 245.66964721679688, |
|
"epoch": 0.5413333333333333, |
|
"grad_norm": 0.8825950131925253, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0168, |
|
"reward": 0.632973849773407, |
|
"reward_std": 0.15790660679340363, |
|
"rewards/length_reward": 0.07678568363189697, |
|
"rewards/similarity_reward": 0.5561880469322205, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 214.50894165039062, |
|
"epoch": 0.544, |
|
"grad_norm": 0.9691668766051184, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0135, |
|
"reward": 0.6077868938446045, |
|
"reward_std": 0.12028573453426361, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5381439328193665, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 292.73663330078125, |
|
"epoch": 0.5466666666666666, |
|
"grad_norm": 0.7967760450327859, |
|
"kl": 0.00982666015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0266, |
|
"reward": 0.6232799887657166, |
|
"reward_std": 0.10140591114759445, |
|
"rewards/length_reward": 0.07410712540149689, |
|
"rewards/similarity_reward": 0.5491728186607361, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 221.25001525878906, |
|
"epoch": 0.5493333333333333, |
|
"grad_norm": 1.1638296703356164, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1653, |
|
"reward": 0.5745998620986938, |
|
"reward_std": 0.1350637972354889, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5071890950202942, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 265.58929443359375, |
|
"epoch": 0.552, |
|
"grad_norm": 0.845270302637572, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0111, |
|
"reward": 0.6836676001548767, |
|
"reward_std": 0.10602893680334091, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.6131318211555481, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 295.58929443359375, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 0.9323286808595849, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0749, |
|
"reward": 0.5889706611633301, |
|
"reward_std": 0.1236046850681305, |
|
"rewards/length_reward": 0.06517855823040009, |
|
"rewards/similarity_reward": 0.5237920880317688, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 293.58038330078125, |
|
"epoch": 0.5573333333333333, |
|
"grad_norm": 0.9233381319586115, |
|
"kl": 0.01055908203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2, |
|
"reward": 0.6363462805747986, |
|
"reward_std": 0.12041884660720825, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.5675963759422302, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 266.3660888671875, |
|
"epoch": 0.56, |
|
"grad_norm": 0.9921663239986533, |
|
"kl": 0.01214599609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0664, |
|
"reward": 0.5890473127365112, |
|
"reward_std": 0.14096976816654205, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5256544351577759, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 257.46875, |
|
"epoch": 0.5626666666666666, |
|
"grad_norm": 0.9774355514560761, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0824, |
|
"reward": 0.5761434435844421, |
|
"reward_std": 0.18852439522743225, |
|
"rewards/length_reward": 0.061160698533058167, |
|
"rewards/similarity_reward": 0.5149827003479004, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 214.66964721679688, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 1.0333338701683394, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1128, |
|
"reward": 0.6288223266601562, |
|
"reward_std": 0.1170286163687706, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5591794848442078, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 284.0089416503906, |
|
"epoch": 0.568, |
|
"grad_norm": 0.9552749101564338, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1971, |
|
"reward": 0.5316947102546692, |
|
"reward_std": 0.14774499833583832, |
|
"rewards/length_reward": 0.06607140600681305, |
|
"rewards/similarity_reward": 0.46562325954437256, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 264.84375, |
|
"epoch": 0.5706666666666667, |
|
"grad_norm": 0.9328609206359839, |
|
"kl": 0.0128173828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0611, |
|
"reward": 0.61468505859375, |
|
"reward_std": 0.0943475142121315, |
|
"rewards/length_reward": 0.08214282244443893, |
|
"rewards/similarity_reward": 0.5325421690940857, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 288.65625, |
|
"epoch": 0.5733333333333334, |
|
"grad_norm": 0.9888552258575887, |
|
"kl": 0.0147705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1398, |
|
"reward": 0.5841876864433289, |
|
"reward_std": 0.10097295790910721, |
|
"rewards/length_reward": 0.06294640898704529, |
|
"rewards/similarity_reward": 0.5212411880493164, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 270.28125, |
|
"epoch": 0.576, |
|
"grad_norm": 0.8803342156226522, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0359, |
|
"reward": 0.6344039440155029, |
|
"reward_std": 0.17091530561447144, |
|
"rewards/length_reward": 0.06607141345739365, |
|
"rewards/similarity_reward": 0.5683325529098511, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 255.12501525878906, |
|
"epoch": 0.5786666666666667, |
|
"grad_norm": 0.8979196392383272, |
|
"kl": 0.027099609375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0078, |
|
"reward": 0.661945641040802, |
|
"reward_std": 0.1631477326154709, |
|
"rewards/length_reward": 0.07767854630947113, |
|
"rewards/similarity_reward": 0.5842669606208801, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 263.58038330078125, |
|
"epoch": 0.5813333333333334, |
|
"grad_norm": 1.180022786404114, |
|
"kl": 0.0191650390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1111, |
|
"reward": 0.5408477187156677, |
|
"reward_std": 0.11287137866020203, |
|
"rewards/length_reward": 0.060267843306064606, |
|
"rewards/similarity_reward": 0.4805798828601837, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 293.24554443359375, |
|
"epoch": 0.584, |
|
"grad_norm": 1.082130352994329, |
|
"kl": 0.0113525390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.166, |
|
"reward": 0.6229541301727295, |
|
"reward_std": 0.18498755991458893, |
|
"rewards/length_reward": 0.06294640898704529, |
|
"rewards/similarity_reward": 0.5600076913833618, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 255.9553680419922, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.8518142779942337, |
|
"kl": 0.0216064453125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0418, |
|
"reward": 0.6351791620254517, |
|
"reward_std": 0.1440075933933258, |
|
"rewards/length_reward": 0.06919640302658081, |
|
"rewards/similarity_reward": 0.5659827589988708, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 268.2589416503906, |
|
"epoch": 0.5893333333333334, |
|
"grad_norm": 0.8005051959777295, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.029, |
|
"reward": 0.6807352304458618, |
|
"reward_std": 0.11082387715578079, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.6061817407608032, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 267.5089416503906, |
|
"epoch": 0.592, |
|
"grad_norm": 0.8727360410582777, |
|
"kl": 0.00927734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0333, |
|
"reward": 0.6831346154212952, |
|
"reward_std": 0.09732881933450699, |
|
"rewards/length_reward": 0.07633925974369049, |
|
"rewards/similarity_reward": 0.6067953109741211, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 271.2008972167969, |
|
"epoch": 0.5946666666666667, |
|
"grad_norm": 0.7543972270797626, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0251, |
|
"reward": 0.7254729866981506, |
|
"reward_std": 0.13280263543128967, |
|
"rewards/length_reward": 0.07723211497068405, |
|
"rewards/similarity_reward": 0.6482407450675964, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 276.33038330078125, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 0.8477057601765857, |
|
"kl": 0.01470947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.093, |
|
"reward": 0.6127163171768188, |
|
"reward_std": 0.14752325415611267, |
|
"rewards/length_reward": 0.06160712614655495, |
|
"rewards/similarity_reward": 0.5511091351509094, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 313.02679443359375, |
|
"epoch": 0.6, |
|
"grad_norm": 0.9444843501933834, |
|
"kl": 0.01953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1058, |
|
"reward": 0.6097243428230286, |
|
"reward_std": 0.1704142987728119, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5427600741386414, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 288.37054443359375, |
|
"epoch": 0.6026666666666667, |
|
"grad_norm": 0.8579103953599808, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1122, |
|
"reward": 0.6366080641746521, |
|
"reward_std": 0.12147609889507294, |
|
"rewards/length_reward": 0.07232140004634857, |
|
"rewards/similarity_reward": 0.5642866492271423, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 310.1607360839844, |
|
"epoch": 0.6053333333333333, |
|
"grad_norm": 0.759533963504491, |
|
"kl": 0.01312255859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0699, |
|
"reward": 0.6739456057548523, |
|
"reward_std": 0.10951042920351028, |
|
"rewards/length_reward": 0.07276783138513565, |
|
"rewards/similarity_reward": 0.6011778116226196, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 239.0848388671875, |
|
"epoch": 0.608, |
|
"grad_norm": 0.9025930213219101, |
|
"kl": 0.01104736328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0215, |
|
"reward": 0.6142429709434509, |
|
"reward_std": 0.08829416334629059, |
|
"rewards/length_reward": 0.07633925974369049, |
|
"rewards/similarity_reward": 0.5379037261009216, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 278.3125, |
|
"epoch": 0.6106666666666667, |
|
"grad_norm": 0.886301576249163, |
|
"kl": 0.01104736328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0334, |
|
"reward": 0.7060741186141968, |
|
"reward_std": 0.11311851441860199, |
|
"rewards/length_reward": 0.07723211497068405, |
|
"rewards/similarity_reward": 0.6288419961929321, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 251.18751525878906, |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 2.476356671041086, |
|
"kl": 0.0244140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0096, |
|
"reward": 0.5848217010498047, |
|
"reward_std": 0.11717528849840164, |
|
"rewards/length_reward": 0.061160698533058167, |
|
"rewards/similarity_reward": 0.5236610770225525, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 236.29019165039062, |
|
"epoch": 0.616, |
|
"grad_norm": 0.9163834681525471, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0201, |
|
"reward": 0.6646043658256531, |
|
"reward_std": 0.11276809126138687, |
|
"rewards/length_reward": 0.07991068810224533, |
|
"rewards/similarity_reward": 0.5846936702728271, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 317.0089416503906, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 0.8636877609886525, |
|
"kl": 0.013427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.036, |
|
"reward": 0.5721753835678101, |
|
"reward_std": 0.16120396554470062, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.506996750831604, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 307.7008972167969, |
|
"epoch": 0.6213333333333333, |
|
"grad_norm": 1.3729033520790577, |
|
"kl": 0.0166015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0383, |
|
"reward": 0.6260521411895752, |
|
"reward_std": 0.11853621900081635, |
|
"rewards/length_reward": 0.07767853885889053, |
|
"rewards/similarity_reward": 0.5483735799789429, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 283.5714416503906, |
|
"epoch": 0.624, |
|
"grad_norm": 0.8530397710423918, |
|
"kl": 0.016357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0206, |
|
"reward": 0.6967118382453918, |
|
"reward_std": 0.1562497317790985, |
|
"rewards/length_reward": 0.07857140153646469, |
|
"rewards/similarity_reward": 0.618140459060669, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 272.6160888671875, |
|
"epoch": 0.6266666666666667, |
|
"grad_norm": 0.8799568084373302, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0181, |
|
"reward": 0.6851814985275269, |
|
"reward_std": 0.10234292596578598, |
|
"rewards/length_reward": 0.07142854481935501, |
|
"rewards/similarity_reward": 0.6137529611587524, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 316.4776916503906, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 0.8224273598135922, |
|
"kl": 0.01416015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1293, |
|
"reward": 0.5818712115287781, |
|
"reward_std": 0.1467462033033371, |
|
"rewards/length_reward": 0.059374988079071045, |
|
"rewards/similarity_reward": 0.5224961638450623, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 276.1651916503906, |
|
"epoch": 0.632, |
|
"grad_norm": 0.8214244182848573, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0324, |
|
"reward": 0.6201799511909485, |
|
"reward_std": 0.14638349413871765, |
|
"rewards/length_reward": 0.07098212093114853, |
|
"rewards/similarity_reward": 0.5491978526115417, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 314.15179443359375, |
|
"epoch": 0.6346666666666667, |
|
"grad_norm": 0.8751337279602847, |
|
"kl": 0.011474609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1279, |
|
"reward": 0.6131877899169922, |
|
"reward_std": 0.15269529819488525, |
|
"rewards/length_reward": 0.06383927166461945, |
|
"rewards/similarity_reward": 0.5493485331535339, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 241.5982208251953, |
|
"epoch": 0.6373333333333333, |
|
"grad_norm": 0.9065686563133485, |
|
"kl": 0.01300048828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.053, |
|
"reward": 0.6322412490844727, |
|
"reward_std": 0.13913773000240326, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5648305416107178, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 282.3973388671875, |
|
"epoch": 0.64, |
|
"grad_norm": 0.8759378979761268, |
|
"kl": 0.0130615234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0351, |
|
"reward": 0.6563798785209656, |
|
"reward_std": 0.11333189904689789, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5858440399169922, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 258.2946472167969, |
|
"epoch": 0.6426666666666667, |
|
"grad_norm": 0.8083399774508907, |
|
"kl": 0.0115966796875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0089, |
|
"reward": 0.7068819403648376, |
|
"reward_std": 0.10833070427179337, |
|
"rewards/length_reward": 0.08392854034900665, |
|
"rewards/similarity_reward": 0.6229532957077026, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 226.43751525878906, |
|
"epoch": 0.6453333333333333, |
|
"grad_norm": 1.113129598732782, |
|
"kl": 0.01708984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0985, |
|
"reward": 0.47489413619041443, |
|
"reward_std": 0.14593513309955597, |
|
"rewards/length_reward": 0.053124986588954926, |
|
"rewards/similarity_reward": 0.4217691719532013, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 254.5982208251953, |
|
"epoch": 0.648, |
|
"grad_norm": 1.0574663727866278, |
|
"kl": 0.01171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0655, |
|
"reward": 0.585192084312439, |
|
"reward_std": 0.1674540489912033, |
|
"rewards/length_reward": 0.05937498062849045, |
|
"rewards/similarity_reward": 0.5258170962333679, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 245.9866180419922, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 0.9866678905813414, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0992, |
|
"reward": 0.6301730871200562, |
|
"reward_std": 0.11110112071037292, |
|
"rewards/length_reward": 0.07142854481935501, |
|
"rewards/similarity_reward": 0.5587445497512817, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 276.9419860839844, |
|
"epoch": 0.6533333333333333, |
|
"grad_norm": 0.8334751875894263, |
|
"kl": 0.01226806640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0199, |
|
"reward": 0.734417736530304, |
|
"reward_std": 0.13278159499168396, |
|
"rewards/length_reward": 0.07767854630947113, |
|
"rewards/similarity_reward": 0.6567391157150269, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 224.4241180419922, |
|
"epoch": 0.656, |
|
"grad_norm": 1.039042606617133, |
|
"kl": 0.0211181640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1146, |
|
"reward": 0.6315779089927673, |
|
"reward_std": 0.1330062597990036, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5610421895980835, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 252.9732208251953, |
|
"epoch": 0.6586666666666666, |
|
"grad_norm": 0.8538008305966633, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0224, |
|
"reward": 0.6646360754966736, |
|
"reward_std": 0.12239360809326172, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5900824666023254, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 218.7366180419922, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 0.963162350651896, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0349, |
|
"reward": 0.6808683276176453, |
|
"reward_std": 0.13527972996234894, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.6089933514595032, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 254.5982208251953, |
|
"epoch": 0.664, |
|
"grad_norm": 0.9385464548992294, |
|
"kl": 0.0179443359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0288, |
|
"reward": 0.631544828414917, |
|
"reward_std": 0.13531894981861115, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.5627948045730591, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 291.37054443359375, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.9140004673035673, |
|
"kl": 0.01708984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0837, |
|
"reward": 0.5439311861991882, |
|
"reward_std": 0.13576674461364746, |
|
"rewards/length_reward": 0.05848212540149689, |
|
"rewards/similarity_reward": 0.48544901609420776, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 302.15179443359375, |
|
"epoch": 0.6693333333333333, |
|
"grad_norm": 0.8827607842873424, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1511, |
|
"reward": 0.521554172039032, |
|
"reward_std": 0.13812950253486633, |
|
"rewards/length_reward": 0.06249998137354851, |
|
"rewards/similarity_reward": 0.459054172039032, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 256.67413330078125, |
|
"epoch": 0.672, |
|
"grad_norm": 0.9030494234496588, |
|
"kl": 0.015869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0098, |
|
"reward": 0.6373765468597412, |
|
"reward_std": 0.16011908650398254, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5721979141235352, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 263.4464416503906, |
|
"epoch": 0.6746666666666666, |
|
"grad_norm": 0.9917515330394924, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1071, |
|
"reward": 0.5835117697715759, |
|
"reward_std": 0.15427549183368683, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.5214581489562988, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 263.1875, |
|
"epoch": 0.6773333333333333, |
|
"grad_norm": 0.8527537949516601, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0704, |
|
"reward": 0.659292459487915, |
|
"reward_std": 0.11287476867437363, |
|
"rewards/length_reward": 0.07723211497068405, |
|
"rewards/similarity_reward": 0.5820602774620056, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 247.35714721679688, |
|
"epoch": 0.68, |
|
"grad_norm": 0.9306878660258886, |
|
"kl": 0.018798828125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1071, |
|
"reward": 0.5428202748298645, |
|
"reward_std": 0.14576061069965363, |
|
"rewards/length_reward": 0.055357132107019424, |
|
"rewards/similarity_reward": 0.4874631464481354, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 304.77679443359375, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 0.8122651765327112, |
|
"kl": 0.01123046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0722, |
|
"reward": 0.6534665822982788, |
|
"reward_std": 0.17661263048648834, |
|
"rewards/length_reward": 0.06651783734560013, |
|
"rewards/similarity_reward": 0.5869486927986145, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 316.0, |
|
"epoch": 0.6853333333333333, |
|
"grad_norm": 0.7637865876113388, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0805, |
|
"reward": 0.6809090375900269, |
|
"reward_std": 0.12792253494262695, |
|
"rewards/length_reward": 0.07499997317790985, |
|
"rewards/similarity_reward": 0.6059090495109558, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 267.7410888671875, |
|
"epoch": 0.688, |
|
"grad_norm": 0.908951786987476, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0919, |
|
"reward": 0.6498162150382996, |
|
"reward_std": 0.12118736654520035, |
|
"rewards/length_reward": 0.057589273899793625, |
|
"rewards/similarity_reward": 0.5922268629074097, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 255.13394165039062, |
|
"epoch": 0.6906666666666667, |
|
"grad_norm": 0.9868527698980504, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0518, |
|
"reward": 0.6177918910980225, |
|
"reward_std": 0.1076013594865799, |
|
"rewards/length_reward": 0.07232140004634857, |
|
"rewards/similarity_reward": 0.5454704165458679, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 211.96429443359375, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 1.1390879691759828, |
|
"kl": 0.01251220703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0376, |
|
"reward": 0.6133698225021362, |
|
"reward_std": 0.1394728273153305, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5414947867393494, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 226.0491180419922, |
|
"epoch": 0.696, |
|
"grad_norm": 0.9181631341556423, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0146, |
|
"reward": 0.6710724234580994, |
|
"reward_std": 0.12615807354450226, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.6041080951690674, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 290.71875, |
|
"epoch": 0.6986666666666667, |
|
"grad_norm": 0.9242817884998483, |
|
"kl": 0.010986328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0492, |
|
"reward": 0.758361279964447, |
|
"reward_std": 0.0939282700419426, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.680236279964447, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 295.2276916503906, |
|
"epoch": 0.7013333333333334, |
|
"grad_norm": 0.9268240432998979, |
|
"kl": 0.01434326171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0777, |
|
"reward": 0.5887910723686218, |
|
"reward_std": 0.15629605948925018, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5236124992370605, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 197.97769165039062, |
|
"epoch": 0.704, |
|
"grad_norm": 0.9100124092555839, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0176, |
|
"reward": 0.5667382478713989, |
|
"reward_std": 0.1061021164059639, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.4993274211883545, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 242.96429443359375, |
|
"epoch": 0.7066666666666667, |
|
"grad_norm": 0.9638821889698813, |
|
"kl": 0.0205078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0458, |
|
"reward": 0.6227900981903076, |
|
"reward_std": 0.1025083139538765, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5571650862693787, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 223.3973388671875, |
|
"epoch": 0.7093333333333334, |
|
"grad_norm": 1.0581452656511359, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0641, |
|
"reward": 0.6723743081092834, |
|
"reward_std": 0.1159893348813057, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.6071956753730774, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 270.2098388671875, |
|
"epoch": 0.712, |
|
"grad_norm": 0.8609101277095984, |
|
"kl": 0.0146484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0419, |
|
"reward": 0.7352553009986877, |
|
"reward_std": 0.13158555328845978, |
|
"rewards/length_reward": 0.07499997317790985, |
|
"rewards/similarity_reward": 0.6602552533149719, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 286.65179443359375, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 0.85187609520963, |
|
"kl": 0.01470947265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0801, |
|
"reward": 0.5860309600830078, |
|
"reward_std": 0.12361589819192886, |
|
"rewards/length_reward": 0.06562498211860657, |
|
"rewards/similarity_reward": 0.5204059481620789, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 258.5, |
|
"epoch": 0.7173333333333334, |
|
"grad_norm": 0.8462850933355499, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1016, |
|
"reward": 0.7648903131484985, |
|
"reward_std": 0.08282845467329025, |
|
"rewards/length_reward": 0.08348211646080017, |
|
"rewards/similarity_reward": 0.6814082264900208, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 255.4598388671875, |
|
"epoch": 0.72, |
|
"grad_norm": 0.8789894818999019, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1231, |
|
"reward": 0.6465427875518799, |
|
"reward_std": 0.12229768186807632, |
|
"rewards/length_reward": 0.06607140600681305, |
|
"rewards/similarity_reward": 0.5804713368415833, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 273.01788330078125, |
|
"epoch": 0.7226666666666667, |
|
"grad_norm": 0.8695715294946503, |
|
"kl": 0.0184326171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.087, |
|
"reward": 0.6258493661880493, |
|
"reward_std": 0.13294367492198944, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5553135871887207, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 260.8169860839844, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 1.2217664906535957, |
|
"kl": 0.015380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0431, |
|
"reward": 0.6779768466949463, |
|
"reward_std": 0.11835993826389313, |
|
"rewards/length_reward": 0.07678568363189697, |
|
"rewards/similarity_reward": 0.6011910438537598, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 284.625, |
|
"epoch": 0.728, |
|
"grad_norm": 0.70076206752431, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0623, |
|
"reward": 0.6957324147224426, |
|
"reward_std": 0.11651583760976791, |
|
"rewards/length_reward": 0.08348210901021957, |
|
"rewards/similarity_reward": 0.6122502088546753, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 255.76339721679688, |
|
"epoch": 0.7306666666666667, |
|
"grad_norm": 0.9105713339266117, |
|
"kl": 0.012939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0998, |
|
"reward": 0.6966086626052856, |
|
"reward_std": 0.08685937523841858, |
|
"rewards/length_reward": 0.07187496870756149, |
|
"rewards/similarity_reward": 0.624733567237854, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 296.76788330078125, |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 0.7414942240643484, |
|
"kl": 0.00921630859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0451, |
|
"reward": 0.7098910212516785, |
|
"reward_std": 0.09432552009820938, |
|
"rewards/length_reward": 0.08482139557600021, |
|
"rewards/similarity_reward": 0.6250695586204529, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 270.0669860839844, |
|
"epoch": 0.736, |
|
"grad_norm": 0.898333119316704, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0225, |
|
"reward": 0.7013087868690491, |
|
"reward_std": 0.11285625398159027, |
|
"rewards/length_reward": 0.0741071105003357, |
|
"rewards/similarity_reward": 0.6272015571594238, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 198.93751525878906, |
|
"epoch": 0.7386666666666667, |
|
"grad_norm": 1.1511982372559852, |
|
"kl": 0.0205078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0183, |
|
"reward": 0.5090009570121765, |
|
"reward_std": 0.13011598587036133, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.4469473958015442, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 271.58038330078125, |
|
"epoch": 0.7413333333333333, |
|
"grad_norm": 0.8195827963319392, |
|
"kl": 0.01092529296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0099, |
|
"reward": 0.6257685422897339, |
|
"reward_std": 0.1082058921456337, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.5503220558166504, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 273.2232360839844, |
|
"epoch": 0.744, |
|
"grad_norm": 0.9459393517121532, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0902, |
|
"reward": 0.6197928190231323, |
|
"reward_std": 0.15125982463359833, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.551042914390564, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 279.65625, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.9319904211339567, |
|
"kl": 0.0126953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0701, |
|
"reward": 0.6224436163902283, |
|
"reward_std": 0.11631693691015244, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5523543357849121, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 289.61163330078125, |
|
"epoch": 0.7493333333333333, |
|
"grad_norm": 1.071648613346095, |
|
"kl": 0.010009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1723, |
|
"reward": 0.6961318850517273, |
|
"reward_std": 0.10897497087717056, |
|
"rewards/length_reward": 0.07723211497068405, |
|
"rewards/similarity_reward": 0.6188997030258179, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 243.8348388671875, |
|
"epoch": 0.752, |
|
"grad_norm": 0.994726741013797, |
|
"kl": 0.01495361328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0637, |
|
"reward": 0.6346572637557983, |
|
"reward_std": 0.10820147395133972, |
|
"rewards/length_reward": 0.07008925825357437, |
|
"rewards/similarity_reward": 0.5645679831504822, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 251.38394165039062, |
|
"epoch": 0.7546666666666667, |
|
"grad_norm": 0.976491323713793, |
|
"kl": 0.015380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0921, |
|
"reward": 0.5713584423065186, |
|
"reward_std": 0.1460532546043396, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.5061798095703125, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 295.8348388671875, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 0.8830201574560093, |
|
"kl": 0.0140380859375, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0096, |
|
"reward": 0.5986955761909485, |
|
"reward_std": 0.15517514944076538, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5286062955856323, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 247.72769165039062, |
|
"epoch": 0.76, |
|
"grad_norm": 0.8557479345922782, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0017, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.10514307767152786, |
|
"rewards/length_reward": 0.07857140153646469, |
|
"rewards/similarity_reward": 0.5910714268684387, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 264.9196472167969, |
|
"epoch": 0.7626666666666667, |
|
"grad_norm": 0.900029253879394, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.011, |
|
"reward": 0.6437191367149353, |
|
"reward_std": 0.12881356477737427, |
|
"rewards/length_reward": 0.07991068065166473, |
|
"rewards/similarity_reward": 0.5638083815574646, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 275.76788330078125, |
|
"epoch": 0.7653333333333333, |
|
"grad_norm": 0.8434547706339766, |
|
"kl": 0.01519775390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0667, |
|
"reward": 0.6621875762939453, |
|
"reward_std": 0.15463142096996307, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5925447344779968, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 251.4866180419922, |
|
"epoch": 0.768, |
|
"grad_norm": 0.9853184231195431, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0183, |
|
"reward": 0.5950483679771423, |
|
"reward_std": 0.1278952956199646, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.5196019411087036, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 279.9419860839844, |
|
"epoch": 0.7706666666666667, |
|
"grad_norm": 0.8374958728984951, |
|
"kl": 0.0126953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1275, |
|
"reward": 0.6563042402267456, |
|
"reward_std": 0.14691661298274994, |
|
"rewards/length_reward": 0.07098212093114853, |
|
"rewards/similarity_reward": 0.5853220820426941, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 283.12054443359375, |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 0.9945411525052974, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0685, |
|
"reward": 0.7337676882743835, |
|
"reward_std": 0.09462190419435501, |
|
"rewards/length_reward": 0.07633925974369049, |
|
"rewards/similarity_reward": 0.6574283838272095, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 286.1026916503906, |
|
"epoch": 0.776, |
|
"grad_norm": 0.7519820653879999, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.069, |
|
"reward": 0.6660787463188171, |
|
"reward_std": 0.13770896196365356, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5915251970291138, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 300.4419860839844, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 0.9216655286630218, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1068, |
|
"reward": 0.7210602164268494, |
|
"reward_std": 0.15427368879318237, |
|
"rewards/length_reward": 0.08035711199045181, |
|
"rewards/similarity_reward": 0.6407030820846558, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 252.0178680419922, |
|
"epoch": 0.7813333333333333, |
|
"grad_norm": 0.8654140719017164, |
|
"kl": 0.022705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1224, |
|
"reward": 0.6323553919792175, |
|
"reward_std": 0.1670098751783371, |
|
"rewards/length_reward": 0.06651784479618073, |
|
"rewards/similarity_reward": 0.5658375024795532, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 274.8839416503906, |
|
"epoch": 0.784, |
|
"grad_norm": 0.7786721291031314, |
|
"kl": 0.0142822265625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0166, |
|
"reward": 0.7442488074302673, |
|
"reward_std": 0.10799020528793335, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.6696951389312744, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 279.1294860839844, |
|
"epoch": 0.7866666666666666, |
|
"grad_norm": 0.8056776017830058, |
|
"kl": 0.011962890625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0048, |
|
"reward": 0.6635159850120544, |
|
"reward_std": 0.11003357172012329, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5934267044067383, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 254.96429443359375, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 1.0354511241505295, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0919, |
|
"reward": 0.6252850294113159, |
|
"reward_std": 0.10695895552635193, |
|
"rewards/length_reward": 0.07142854481935501, |
|
"rewards/similarity_reward": 0.553856372833252, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 233.25894165039062, |
|
"epoch": 0.792, |
|
"grad_norm": 0.9468597047570717, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0101, |
|
"reward": 0.6699472665786743, |
|
"reward_std": 0.16828653216362, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.6003044247627258, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 249.1428680419922, |
|
"epoch": 0.7946666666666666, |
|
"grad_norm": 0.9140690517111535, |
|
"kl": 0.01141357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.072, |
|
"reward": 0.6824041604995728, |
|
"reward_std": 0.12977474927902222, |
|
"rewards/length_reward": 0.08035711199045181, |
|
"rewards/similarity_reward": 0.6020469069480896, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 276.3571472167969, |
|
"epoch": 0.7973333333333333, |
|
"grad_norm": 0.8259435738042828, |
|
"kl": 0.01141357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0105, |
|
"reward": 0.7071071863174438, |
|
"reward_std": 0.08347765356302261, |
|
"rewards/length_reward": 0.08169639110565186, |
|
"rewards/similarity_reward": 0.625410795211792, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 270.0758972167969, |
|
"epoch": 0.8, |
|
"grad_norm": 0.8981450111371676, |
|
"kl": 0.0126953125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0594, |
|
"reward": 0.6137918829917908, |
|
"reward_std": 0.12631313502788544, |
|
"rewards/length_reward": 0.064732126891613, |
|
"rewards/similarity_reward": 0.5490598082542419, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 278.83038330078125, |
|
"epoch": 0.8026666666666666, |
|
"grad_norm": 0.8303352041330266, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0166, |
|
"reward": 0.6976829767227173, |
|
"reward_std": 0.11335788667201996, |
|
"rewards/length_reward": 0.08169639110565186, |
|
"rewards/similarity_reward": 0.6159866452217102, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 257.5357360839844, |
|
"epoch": 0.8053333333333333, |
|
"grad_norm": 0.8867998574709848, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0754, |
|
"reward": 0.6026350855827332, |
|
"reward_std": 0.0930032953619957, |
|
"rewards/length_reward": 0.07589282840490341, |
|
"rewards/similarity_reward": 0.526742160320282, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 283.5669860839844, |
|
"epoch": 0.808, |
|
"grad_norm": 0.8168218668358965, |
|
"kl": 0.0120849609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0773, |
|
"reward": 0.5895494222640991, |
|
"reward_std": 0.1272886097431183, |
|
"rewards/length_reward": 0.06874997913837433, |
|
"rewards/similarity_reward": 0.5207993984222412, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 303.6026916503906, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 0.8899545222480755, |
|
"kl": 0.012939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0372, |
|
"reward": 0.6067291498184204, |
|
"reward_std": 0.1214829757809639, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5366398692131042, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 294.3482360839844, |
|
"epoch": 0.8133333333333334, |
|
"grad_norm": 0.8194602682013028, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0566, |
|
"reward": 0.7332960963249207, |
|
"reward_std": 0.08479318022727966, |
|
"rewards/length_reward": 0.07767854630947113, |
|
"rewards/similarity_reward": 0.6556174755096436, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 266.4375, |
|
"epoch": 0.816, |
|
"grad_norm": 0.987377079764631, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0409, |
|
"reward": 0.6382983326911926, |
|
"reward_std": 0.1240207627415657, |
|
"rewards/length_reward": 0.07366068661212921, |
|
"rewards/similarity_reward": 0.5646375417709351, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 267.95538330078125, |
|
"epoch": 0.8186666666666667, |
|
"grad_norm": 0.9217292474743543, |
|
"kl": 0.01263427734375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0267, |
|
"reward": 0.6616266965866089, |
|
"reward_std": 0.1070173904299736, |
|
"rewards/length_reward": 0.06964283436536789, |
|
"rewards/similarity_reward": 0.5919837355613708, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 279.0535888671875, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 0.8072891907153936, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0943, |
|
"reward": 0.6182869672775269, |
|
"reward_std": 0.1361446976661682, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5464120507240295, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 287.90625, |
|
"epoch": 0.824, |
|
"grad_norm": 0.9054980085837359, |
|
"kl": 0.01153564453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.068, |
|
"reward": 0.7372510433197021, |
|
"reward_std": 0.0957195907831192, |
|
"rewards/length_reward": 0.07499997317790985, |
|
"rewards/similarity_reward": 0.6622509956359863, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 258.3973388671875, |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.9082585914171709, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0061, |
|
"reward": 0.647227942943573, |
|
"reward_std": 0.1310262531042099, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5766921043395996, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 221.2991180419922, |
|
"epoch": 0.8293333333333334, |
|
"grad_norm": 0.9448119727952166, |
|
"kl": 0.021728515625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0416, |
|
"reward": 0.6708490252494812, |
|
"reward_std": 0.1254904717206955, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.5927240252494812, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 249.3616180419922, |
|
"epoch": 0.832, |
|
"grad_norm": 0.8967865636957736, |
|
"kl": 0.010009765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0332, |
|
"reward": 0.7173448801040649, |
|
"reward_std": 0.10435692220926285, |
|
"rewards/length_reward": 0.08214282244443893, |
|
"rewards/similarity_reward": 0.6352020502090454, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 246.80804443359375, |
|
"epoch": 0.8346666666666667, |
|
"grad_norm": 0.9778150152016866, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0158, |
|
"reward": 0.6462720036506653, |
|
"reward_std": 0.13069510459899902, |
|
"rewards/length_reward": 0.07499997317790985, |
|
"rewards/similarity_reward": 0.5712720155715942, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 244.13839721679688, |
|
"epoch": 0.8373333333333334, |
|
"grad_norm": 0.9354621189513169, |
|
"kl": 0.01214599609375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.264, |
|
"reward": 0.6220008730888367, |
|
"reward_std": 0.12123651802539825, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.5599472522735596, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 269.5446472167969, |
|
"epoch": 0.84, |
|
"grad_norm": 0.9611408485021674, |
|
"kl": 0.01513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0553, |
|
"reward": 0.725965678691864, |
|
"reward_std": 0.09657153487205505, |
|
"rewards/length_reward": 0.07723211497068405, |
|
"rewards/similarity_reward": 0.6487335562705994, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 274.84375, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 0.8797127493353065, |
|
"kl": 0.01129150390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.014, |
|
"reward": 0.6985806226730347, |
|
"reward_std": 0.11010481417179108, |
|
"rewards/length_reward": 0.0741071105003357, |
|
"rewards/similarity_reward": 0.6244734525680542, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 272.8348388671875, |
|
"epoch": 0.8453333333333334, |
|
"grad_norm": 0.779139564448991, |
|
"kl": 0.01055908203125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0161, |
|
"reward": 0.7422655820846558, |
|
"reward_std": 0.06264423578977585, |
|
"rewards/length_reward": 0.08124996721744537, |
|
"rewards/similarity_reward": 0.6610156297683716, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 229.7678680419922, |
|
"epoch": 0.848, |
|
"grad_norm": 4.278780421719415, |
|
"kl": 0.0390625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0681, |
|
"reward": 0.6605138778686523, |
|
"reward_std": 0.14701789617538452, |
|
"rewards/length_reward": 0.07187496870756149, |
|
"rewards/similarity_reward": 0.5886389017105103, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 286.2723388671875, |
|
"epoch": 0.8506666666666667, |
|
"grad_norm": 1.0371097952967725, |
|
"kl": 0.01611328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1372, |
|
"reward": 0.6110987663269043, |
|
"reward_std": 0.18697677552700043, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.549045205116272, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 260.0625, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.9056186985134533, |
|
"kl": 0.009521484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0351, |
|
"reward": 0.6572511196136475, |
|
"reward_std": 0.08565808087587357, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.5791261196136475, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 281.71875, |
|
"epoch": 0.856, |
|
"grad_norm": 0.71560536286136, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0613, |
|
"reward": 0.7018586993217468, |
|
"reward_std": 0.14483648538589478, |
|
"rewards/length_reward": 0.08169639110565186, |
|
"rewards/similarity_reward": 0.620162308216095, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 258.2276916503906, |
|
"epoch": 0.8586666666666667, |
|
"grad_norm": 0.9896625298898443, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.2194, |
|
"reward": 0.5979973077774048, |
|
"reward_std": 0.13931064307689667, |
|
"rewards/length_reward": 0.06249998137354851, |
|
"rewards/similarity_reward": 0.5354973077774048, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 237.1741180419922, |
|
"epoch": 0.8613333333333333, |
|
"grad_norm": 0.9609452774135127, |
|
"kl": 0.01513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0016, |
|
"reward": 0.6683059334754944, |
|
"reward_std": 0.10867080092430115, |
|
"rewards/length_reward": 0.07991068810224533, |
|
"rewards/similarity_reward": 0.5883952975273132, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 254.01339721679688, |
|
"epoch": 0.864, |
|
"grad_norm": 1.0568546085260409, |
|
"kl": 0.01531982421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0933, |
|
"reward": 0.6546286940574646, |
|
"reward_std": 0.11366698145866394, |
|
"rewards/length_reward": 0.07008926570415497, |
|
"rewards/similarity_reward": 0.5845393538475037, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 242.03126525878906, |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.926913110151395, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0865, |
|
"reward": 0.5440469980239868, |
|
"reward_std": 0.11594089865684509, |
|
"rewards/length_reward": 0.06383926421403885, |
|
"rewards/similarity_reward": 0.48020774126052856, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 228.21429443359375, |
|
"epoch": 0.8693333333333333, |
|
"grad_norm": 1.0043052790420461, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0272, |
|
"reward": 0.6953443884849548, |
|
"reward_std": 0.1263352483510971, |
|
"rewards/length_reward": 0.06785711646080017, |
|
"rewards/similarity_reward": 0.6274873614311218, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 307.3125, |
|
"epoch": 0.872, |
|
"grad_norm": 0.7734278067274474, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0474, |
|
"reward": 0.689052402973175, |
|
"reward_std": 0.10326018929481506, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.6144987940788269, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 256.7589416503906, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 0.8349456447351374, |
|
"kl": 0.01275634765625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0053, |
|
"reward": 0.6755567193031311, |
|
"reward_std": 0.12627391517162323, |
|
"rewards/length_reward": 0.07321426272392273, |
|
"rewards/similarity_reward": 0.6023423671722412, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 272.67413330078125, |
|
"epoch": 0.8773333333333333, |
|
"grad_norm": 0.9337813221087722, |
|
"kl": 0.013671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0863, |
|
"reward": 0.6436842083930969, |
|
"reward_std": 0.1272781938314438, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5802912712097168, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 268.5401916503906, |
|
"epoch": 0.88, |
|
"grad_norm": 0.7822000698940798, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0076, |
|
"reward": 0.6488507986068726, |
|
"reward_std": 0.1486339569091797, |
|
"rewards/length_reward": 0.07812497019767761, |
|
"rewards/similarity_reward": 0.5707257986068726, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 255.2991180419922, |
|
"epoch": 0.8826666666666667, |
|
"grad_norm": 0.9970416796611882, |
|
"kl": 0.01348876953125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0091, |
|
"reward": 0.623367190361023, |
|
"reward_std": 0.11435237526893616, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.547920823097229, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 281.2544860839844, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 0.9393864746480084, |
|
"kl": 0.0177001953125, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0269, |
|
"reward": 0.6835038661956787, |
|
"reward_std": 0.12119947373867035, |
|
"rewards/length_reward": 0.07366069406270981, |
|
"rewards/similarity_reward": 0.6098431348800659, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 277.3883972167969, |
|
"epoch": 0.888, |
|
"grad_norm": 0.8857739277618905, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1155, |
|
"reward": 0.6458525657653809, |
|
"reward_std": 0.10718663036823273, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5739776492118835, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 222.0491180419922, |
|
"epoch": 0.8906666666666667, |
|
"grad_norm": 0.9211484451974372, |
|
"kl": 0.0145263671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0326, |
|
"reward": 0.6779581308364868, |
|
"reward_std": 0.09158685058355331, |
|
"rewards/length_reward": 0.07098212093114853, |
|
"rewards/similarity_reward": 0.6069758534431458, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 253.54464721679688, |
|
"epoch": 0.8933333333333333, |
|
"grad_norm": 1.0439727694024494, |
|
"kl": 0.01513671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0938, |
|
"reward": 0.6808232069015503, |
|
"reward_std": 0.11848772317171097, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.6053767204284668, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 233.00001525878906, |
|
"epoch": 0.896, |
|
"grad_norm": 0.9838382076638493, |
|
"kl": 0.015869140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0451, |
|
"reward": 0.58315509557724, |
|
"reward_std": 0.11559745669364929, |
|
"rewards/length_reward": 0.06741069257259369, |
|
"rewards/similarity_reward": 0.5157443881034851, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 269.0133972167969, |
|
"epoch": 0.8986666666666666, |
|
"grad_norm": 0.836096159627277, |
|
"kl": 0.01361083984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0062, |
|
"reward": 0.6409623622894287, |
|
"reward_std": 0.12886659801006317, |
|
"rewards/length_reward": 0.07767854630947113, |
|
"rewards/similarity_reward": 0.5632836818695068, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 251.40179443359375, |
|
"epoch": 0.9013333333333333, |
|
"grad_norm": 0.8535550782434568, |
|
"kl": 0.01422119140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0089, |
|
"reward": 0.6296460032463074, |
|
"reward_std": 0.16196994483470917, |
|
"rewards/length_reward": 0.07187497615814209, |
|
"rewards/similarity_reward": 0.5577709674835205, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 244.80804443359375, |
|
"epoch": 0.904, |
|
"grad_norm": 1.0854210987310389, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0595, |
|
"reward": 0.597773551940918, |
|
"reward_std": 0.10373269766569138, |
|
"rewards/length_reward": 0.06339284032583237, |
|
"rewards/similarity_reward": 0.5343807935714722, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 262.6875, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.8356624214145669, |
|
"kl": 0.013916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0101, |
|
"reward": 0.6685509085655212, |
|
"reward_std": 0.0960090234875679, |
|
"rewards/length_reward": 0.07946424931287766, |
|
"rewards/similarity_reward": 0.5890867114067078, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 262.5535888671875, |
|
"epoch": 0.9093333333333333, |
|
"grad_norm": 1.0702777784285404, |
|
"kl": 0.02783203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1024, |
|
"reward": 0.5271078944206238, |
|
"reward_std": 0.14886566996574402, |
|
"rewards/length_reward": 0.064732126891613, |
|
"rewards/similarity_reward": 0.46237578988075256, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 251.03126525878906, |
|
"epoch": 0.912, |
|
"grad_norm": 0.9235089036400403, |
|
"kl": 0.0172119140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0324, |
|
"reward": 0.6528847217559814, |
|
"reward_std": 0.11811169981956482, |
|
"rewards/length_reward": 0.06383927166461945, |
|
"rewards/similarity_reward": 0.5890454053878784, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 236.2857208251953, |
|
"epoch": 0.9146666666666666, |
|
"grad_norm": 0.9202873381431551, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0497, |
|
"reward": 0.6048458814620972, |
|
"reward_std": 0.13773028552532196, |
|
"rewards/length_reward": 0.07544640451669693, |
|
"rewards/similarity_reward": 0.5293995141983032, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 309.99554443359375, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 0.9489271796425148, |
|
"kl": 0.01226806640625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0812, |
|
"reward": 0.6291395425796509, |
|
"reward_std": 0.15959399938583374, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5621752738952637, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 245.21429443359375, |
|
"epoch": 0.92, |
|
"grad_norm": 1.1363976997302783, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1781, |
|
"reward": 0.5569170713424683, |
|
"reward_std": 0.14195482432842255, |
|
"rewards/length_reward": 0.06160712614655495, |
|
"rewards/similarity_reward": 0.49530985951423645, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 272.2232360839844, |
|
"epoch": 0.9226666666666666, |
|
"grad_norm": 0.7565012868381632, |
|
"kl": 0.0108642578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0695, |
|
"reward": 0.6887885928153992, |
|
"reward_std": 0.11395367234945297, |
|
"rewards/length_reward": 0.07857140153646469, |
|
"rewards/similarity_reward": 0.6102170348167419, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 310.28125, |
|
"epoch": 0.9253333333333333, |
|
"grad_norm": 0.782594647397142, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0752, |
|
"reward": 0.6732801795005798, |
|
"reward_std": 0.12288369983434677, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5987265706062317, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 307.86163330078125, |
|
"epoch": 0.928, |
|
"grad_norm": 0.7087010173295871, |
|
"kl": 0.013671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.008, |
|
"reward": 0.6750614047050476, |
|
"reward_std": 0.09951343387365341, |
|
"rewards/length_reward": 0.07410712540149689, |
|
"rewards/similarity_reward": 0.6009542942047119, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 273.3883972167969, |
|
"epoch": 0.9306666666666666, |
|
"grad_norm": 1.0456544147832767, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0404, |
|
"reward": 0.6893116235733032, |
|
"reward_std": 0.09679926186800003, |
|
"rewards/length_reward": 0.06517855077981949, |
|
"rewards/similarity_reward": 0.6241331100463867, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 264.1919860839844, |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.8991803002318822, |
|
"kl": 0.01397705078125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0129, |
|
"reward": 0.6719235181808472, |
|
"reward_std": 0.13838014006614685, |
|
"rewards/length_reward": 0.07321426272392273, |
|
"rewards/similarity_reward": 0.598709225654602, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 274.9508972167969, |
|
"epoch": 0.936, |
|
"grad_norm": 0.8591029679110356, |
|
"kl": 0.013916015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0312, |
|
"reward": 0.5572786331176758, |
|
"reward_std": 0.14849816262722015, |
|
"rewards/length_reward": 0.06785711646080017, |
|
"rewards/similarity_reward": 0.489421546459198, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 277.9508972167969, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 0.8388349903111052, |
|
"kl": 0.01141357421875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0278, |
|
"reward": 0.681722104549408, |
|
"reward_std": 0.0899442657828331, |
|
"rewards/length_reward": 0.07991068065166473, |
|
"rewards/similarity_reward": 0.6018112897872925, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 240.3348388671875, |
|
"epoch": 0.9413333333333334, |
|
"grad_norm": 0.867198144640214, |
|
"kl": 0.01336669921875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0137, |
|
"reward": 0.6065589785575867, |
|
"reward_std": 0.11837570369243622, |
|
"rewards/length_reward": 0.07946424931287766, |
|
"rewards/similarity_reward": 0.5270946025848389, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 287.4151916503906, |
|
"epoch": 0.944, |
|
"grad_norm": 1.2363858371428533, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0649, |
|
"reward": 0.6748880743980408, |
|
"reward_std": 0.13465073704719543, |
|
"rewards/length_reward": 0.07187496870756149, |
|
"rewards/similarity_reward": 0.6030132174491882, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 309.6919860839844, |
|
"epoch": 0.9466666666666667, |
|
"grad_norm": 0.6824545679415536, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0881, |
|
"reward": 0.6755697727203369, |
|
"reward_std": 0.13920390605926514, |
|
"rewards/length_reward": 0.07901783287525177, |
|
"rewards/similarity_reward": 0.5965518355369568, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 304.0089416503906, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 0.7612207527990814, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0573, |
|
"reward": 0.6815410852432251, |
|
"reward_std": 0.11536341905593872, |
|
"rewards/length_reward": 0.07455354928970337, |
|
"rewards/similarity_reward": 0.6069875955581665, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 280.4151916503906, |
|
"epoch": 0.952, |
|
"grad_norm": 0.9197442279455559, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0976, |
|
"reward": 0.6213651895523071, |
|
"reward_std": 0.1452549546957016, |
|
"rewards/length_reward": 0.059374988079071045, |
|
"rewards/similarity_reward": 0.5619902014732361, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 311.9375, |
|
"epoch": 0.9546666666666667, |
|
"grad_norm": 0.7667231406265689, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0949, |
|
"reward": 0.671852707862854, |
|
"reward_std": 0.12415429949760437, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.5972990989685059, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 296.4598388671875, |
|
"epoch": 0.9573333333333334, |
|
"grad_norm": 0.784755991417782, |
|
"kl": 0.0093994140625, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0284, |
|
"reward": 0.7474254965782166, |
|
"reward_std": 0.10959716141223907, |
|
"rewards/length_reward": 0.07901783287525177, |
|
"rewards/similarity_reward": 0.6684076189994812, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 253.15626525878906, |
|
"epoch": 0.96, |
|
"grad_norm": 0.8190677358549971, |
|
"kl": 0.0125732421875, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0109, |
|
"reward": 0.6444076299667358, |
|
"reward_std": 0.1199827641248703, |
|
"rewards/length_reward": 0.07142855226993561, |
|
"rewards/similarity_reward": 0.5729790925979614, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 306.5848388671875, |
|
"epoch": 0.9626666666666667, |
|
"grad_norm": 0.8327047575723723, |
|
"kl": 0.0101318359375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.044, |
|
"reward": 0.6085981726646423, |
|
"reward_std": 0.15067243576049805, |
|
"rewards/length_reward": 0.07366069406270981, |
|
"rewards/similarity_reward": 0.5349374413490295, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 254.7723388671875, |
|
"epoch": 0.9653333333333334, |
|
"grad_norm": 0.8753840404117226, |
|
"kl": 0.0155029296875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0219, |
|
"reward": 0.6461009383201599, |
|
"reward_std": 0.11166159808635712, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5755651593208313, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 260.33929443359375, |
|
"epoch": 0.968, |
|
"grad_norm": 0.8487935107258318, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0758, |
|
"reward": 0.6731547713279724, |
|
"reward_std": 0.0943944975733757, |
|
"rewards/length_reward": 0.07946424931287766, |
|
"rewards/similarity_reward": 0.5936905145645142, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 284.9107360839844, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 0.9104348928736092, |
|
"kl": 0.0146484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0438, |
|
"reward": 0.6029422879219055, |
|
"reward_std": 0.13879723846912384, |
|
"rewards/length_reward": 0.07053568959236145, |
|
"rewards/similarity_reward": 0.5324065685272217, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 291.1339416503906, |
|
"epoch": 0.9733333333333334, |
|
"grad_norm": 0.8351250207880698, |
|
"kl": 0.01312255859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0447, |
|
"reward": 0.6295793056488037, |
|
"reward_std": 0.11455141007900238, |
|
"rewards/length_reward": 0.07321426272392273, |
|
"rewards/similarity_reward": 0.556364893913269, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 313.6160888671875, |
|
"epoch": 0.976, |
|
"grad_norm": 0.8331869500678173, |
|
"kl": 0.0137939453125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1377, |
|
"reward": 0.6497610807418823, |
|
"reward_std": 0.13638634979724884, |
|
"rewards/length_reward": 0.064732126891613, |
|
"rewards/similarity_reward": 0.5850289463996887, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 260.46875, |
|
"epoch": 0.9786666666666667, |
|
"grad_norm": 0.9519334592833407, |
|
"kl": 0.0184326171875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.108, |
|
"reward": 0.6215986609458923, |
|
"reward_std": 0.13745638728141785, |
|
"rewards/length_reward": 0.06696426123380661, |
|
"rewards/similarity_reward": 0.5546343326568604, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 261.9196472167969, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 0.9206057200583376, |
|
"kl": 0.01361083984375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1112, |
|
"reward": 0.5996190905570984, |
|
"reward_std": 0.13816344738006592, |
|
"rewards/length_reward": 0.0741071105003357, |
|
"rewards/similarity_reward": 0.5255119204521179, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 187.60269165039062, |
|
"epoch": 0.984, |
|
"grad_norm": 34.2971123341626, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 2e-06, |
|
"loss": -0.0025, |
|
"reward": 0.6047165989875793, |
|
"reward_std": 0.11468542367219925, |
|
"rewards/length_reward": 0.07276783138513565, |
|
"rewards/similarity_reward": 0.5319487452507019, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 273.24554443359375, |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 1.0225592676611843, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0974, |
|
"reward": 0.5848848819732666, |
|
"reward_std": 0.13747373223304749, |
|
"rewards/length_reward": 0.06205355003476143, |
|
"rewards/similarity_reward": 0.5228313207626343, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 242.43304443359375, |
|
"epoch": 0.9893333333333333, |
|
"grad_norm": 0.9519953414264258, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.1648, |
|
"reward": 0.6343554854393005, |
|
"reward_std": 0.14080199599266052, |
|
"rewards/length_reward": 0.07142855226993561, |
|
"rewards/similarity_reward": 0.5629268884658813, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 295.7633972167969, |
|
"epoch": 0.992, |
|
"grad_norm": 0.7534581065002547, |
|
"kl": 0.0140380859375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0601, |
|
"reward": 0.6254644393920898, |
|
"reward_std": 0.14738810062408447, |
|
"rewards/length_reward": 0.07232140004634857, |
|
"rewards/similarity_reward": 0.5531430244445801, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 240.38394165039062, |
|
"epoch": 0.9946666666666667, |
|
"grad_norm": 0.9672618888481953, |
|
"kl": 0.01239013671875, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0627, |
|
"reward": 0.6286079287528992, |
|
"reward_std": 0.1355430781841278, |
|
"rewards/length_reward": 0.07232140004634857, |
|
"rewards/similarity_reward": 0.5562865734100342, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 296.3482360839844, |
|
"epoch": 0.9973333333333333, |
|
"grad_norm": 0.8090409603684708, |
|
"kl": 0.0118408203125, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0202, |
|
"reward": 0.7064945697784424, |
|
"reward_std": 0.08492975682020187, |
|
"rewards/length_reward": 0.07455354183912277, |
|
"rewards/similarity_reward": 0.631941020488739, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 207.4114227294922, |
|
"epoch": 1.0, |
|
"grad_norm": 0.9985931023470964, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 2e-06, |
|
"loss": 0.0328, |
|
"reward": 0.6288642883300781, |
|
"reward_std": 0.13037118315696716, |
|
"rewards/length_reward": 0.07276783138513565, |
|
"rewards/similarity_reward": 0.5560964941978455, |
|
"step": 375 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 375, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|