{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999711843242724, "eval_steps": 500, "global_step": 5204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completion_length": 220.77500915527344, "epoch": 0.00019210450485063874, "grad_norm": 2.5577025413513184, "kl": 0.0, "learning_rate": 0.0, "loss": 0.038, "reward": 0.37062498927116394, "reward_std": 0.34713491797447205, "rewards/code_format_reward": 0.26875001192092896, "rewards/code_reward": 0.11812499910593033, "step": 1, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.01640424354829722, "clip_ratio/high_mean": 0.003707133045989192, "clip_ratio/low_mean": 0.0004983297904901621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004205462749167863, "completion_length": 164.34375381469727, "epoch": 0.0019210450485063874, "grad_norm": 2.2875964641571045, "kl": 0.13929970601263145, "learning_rate": 9.999947520846931e-07, "loss": 0.0575, "reward": 0.655464380979538, "reward_std": 0.6216425597667694, "rewards/code_format_reward": 0.5078125074505806, "rewards/code_reward": 0.20077905245125294, "step": 10, "zero_std_ratio": 0.125 }, { "clip_ratio/high_max": 0.04116484243422747, "clip_ratio/high_mean": 0.007335515914019197, "clip_ratio/low_mean": 0.00010183055419474841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007437346538063138, "completion_length": 100.84750213623047, "epoch": 0.003842090097012775, "grad_norm": 2.409867286682129, "kl": 1.1695969879627228, "learning_rate": 9.999734326385416e-07, "loss": -0.0111, "reward": 0.9829235672950745, "reward_std": 0.5127422153949738, "rewards/code_format_reward": 0.84375, "rewards/code_reward": 0.2805242508649826, "step": 20, "zero_std_ratio": 0.075 }, { "clip_ratio/high_max": 0.039504543878138065, "clip_ratio/high_mean": 0.0051267803879454735, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0051267803879454735, "completion_length": 97.57750091552734, "epoch": 0.005763135145519163, "grad_norm": 4.608696460723877, "kl": 2.0701700329780577, "learning_rate": 9.99935714443203e-07, "loss": -0.019, "reward": 1.1568554759025573, "reward_std": 0.6407819569110871, "rewards/code_format_reward": 0.8674999952316285, "rewards/code_reward": 0.3615527212619781, "step": 30, "zero_std_ratio": 0.025 }, { "clip_ratio/high_max": 0.005034898268058896, "clip_ratio/high_mean": 0.0006811309984186664, "clip_ratio/low_mean": 0.00013664715661434456, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008177781579433941, "completion_length": 83.10500030517578, "epoch": 0.00768418019402555, "grad_norm": 4.833131313323975, "kl": 2.2019619703292848, "learning_rate": 9.99881598873272e-07, "loss": -0.02, "reward": 1.1795239448547363, "reward_std": 0.7194581270217896, "rewards/code_format_reward": 0.8987499952316285, "rewards/code_reward": 0.36507447361946105, "step": 40, "zero_std_ratio": 0.05 }, { "clip_ratio/high_max": 0.00872214906848967, "clip_ratio/high_mean": 0.0010902686335612088, "clip_ratio/low_mean": 0.00040422612219117584, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014944947557523846, "completion_length": 88.22500152587891, "epoch": 0.009605225242531937, "grad_norm": 2.778585433959961, "kl": 2.4498987793922424, "learning_rate": 9.998110879009265e-07, "loss": -0.0035, "reward": 1.2663686752319336, "reward_std": 0.6244019389152526, "rewards/code_format_reward": 0.918750011920929, "rewards/code_reward": 0.40349680185317993, "step": 50, "zero_std_ratio": 0.075 }, { "clip_ratio/high_max": 0.016367838624864815, "clip_ratio/high_mean": 0.002741052128840238, "clip_ratio/low_mean": 0.0008432979579083621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003584350028540939, "completion_length": 91.3375, "epoch": 0.011526270291038325, "grad_norm": 2.5201120376586914, "kl": 2.7947509050369264, "learning_rate": 9.997241840958557e-07, "loss": 0.005, "reward": 1.0697558522224426, "reward_std": 0.49940577149391174, "rewards/code_format_reward": 0.9200000047683716, "rewards/code_reward": 0.30487790107727053, "step": 60, "zero_std_ratio": 0.025 }, { "clip_ratio/high_max": 0.031629907339811324, "clip_ratio/high_mean": 0.005140213097911328, "clip_ratio/low_mean": 0.003656612744089216, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008796826144680381, "completion_length": 84.79750213623046, "epoch": 0.013447315339544713, "grad_norm": 7.281564712524414, "kl": 1.7218781247735024, "learning_rate": 9.99620890625166e-07, "loss": -0.0261, "reward": 1.1421246886253358, "reward_std": 0.5977877795696258, "rewards/code_format_reward": 0.9275000095367432, "rewards/code_reward": 0.33918734490871427, "step": 70, "zero_std_ratio": 0.05 }, { "clip_ratio/high_max": 0.10261552361771464, "clip_ratio/high_mean": 0.014289343578275293, "clip_ratio/low_mean": 0.0031720689148642123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017461412807460875, "completion_length": 75.67250061035156, "epoch": 0.0153683603880511, "grad_norm": 3.359511137008667, "kl": 0.3687619216740131, "learning_rate": 9.995012112532654e-07, "loss": -0.0037, "reward": 1.2640612244606018, "reward_std": 0.5189764618873596, "rewards/code_format_reward": 0.9087499976158142, "rewards/code_reward": 0.40484309792518614, "step": 80, "zero_std_ratio": 0.075 }, { "clip_ratio/high_max": 0.053048994287382814, "clip_ratio/high_mean": 0.0092369354548282, "clip_ratio/low_mean": 0.00010360952001065016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009340544970473274, "completion_length": 84.56500091552735, "epoch": 0.01728940543655749, "grad_norm": 2.177191734313965, "kl": 0.5693678379058837, "learning_rate": 9.993651503417269e-07, "loss": -0.008, "reward": 1.1986377000808717, "reward_std": 0.49277395009994507, "rewards/code_format_reward": 0.9112500071525573, "rewards/code_reward": 0.3715063512325287, "step": 90, "zero_std_ratio": 0.1 }, { "clip_ratio/high_max": 0.04136249013245106, "clip_ratio/high_mean": 0.00551328391302377, "clip_ratio/low_mean": 0.001408070686738938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006921354681253433, "completion_length": 80.8550033569336, "epoch": 0.019210450485063875, "grad_norm": 2.0033416748046875, "kl": 0.8493028253316879, "learning_rate": 9.992127128491296e-07, "loss": 0.0027, "reward": 1.1780336141586303, "reward_std": 0.4479735493659973, "rewards/code_format_reward": 0.9275000095367432, "rewards/code_reward": 0.3571417987346649, "step": 100, "zero_std_ratio": 0.125 }, { "clip_ratio/high_max": 0.0585523322224617, "clip_ratio/high_mean": 0.008032404945697635, "clip_ratio/low_mean": 0.006125411042012275, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014157815964426845, "completion_length": 74.19000091552735, "epoch": 0.02113149553357026, "grad_norm": 2.267624855041504, "kl": 1.1039492040872574, "learning_rate": 9.990439043308776e-07, "loss": -0.0238, "reward": 1.2784739494323731, "reward_std": 0.49057124853134154, "rewards/code_format_reward": 0.9475000023841857, "rewards/code_reward": 0.40236196517944334, "step": 110, "zero_std_ratio": 0.175 }, { "clip_ratio/high_max": 0.07124514738097787, "clip_ratio/high_mean": 0.01569047374650836, "clip_ratio/low_mean": 0.0004420768018462695, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01613254987169057, "completion_length": 68.35750045776368, "epoch": 0.02305254058207665, "grad_norm": 4.1564249992370605, "kl": 1.4338344126939773, "learning_rate": 9.988587309389975e-07, "loss": -0.0026, "reward": 1.1606964468955994, "reward_std": 0.46601226925849915, "rewards/code_format_reward": 0.9475000023841857, "rewards/code_reward": 0.34347322285175325, "step": 120, "zero_std_ratio": 0.175 }, { "clip_ratio/high_max": 0.07592196827754379, "clip_ratio/high_mean": 0.014454811741597951, "clip_ratio/low_mean": 0.0013599038298707455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015814715722808615, "completion_length": 72.15750122070312, "epoch": 0.024973585630583037, "grad_norm": 3.9662575721740723, "kl": 1.5312897458672523, "learning_rate": 9.98657199421914e-07, "loss": -0.0024, "reward": 1.1610160946846009, "reward_std": 0.3773229032754898, "rewards/code_format_reward": 0.9587499976158143, "rewards/code_reward": 0.3408205330371857, "step": 130, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.07943324451334774, "clip_ratio/high_mean": 0.014111382194096222, "clip_ratio/low_mean": 0.0036704083904623985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017781790602020918, "completion_length": 83.8675033569336, "epoch": 0.026894630679089426, "grad_norm": 9.37182331085205, "kl": 0.5293755233287811, "learning_rate": 9.984393171242054e-07, "loss": -0.0045, "reward": 1.3634901762008667, "reward_std": 0.5678210258483887, "rewards/code_format_reward": 0.9512500047683716, "rewards/code_reward": 0.4439325869083405, "step": 140, "zero_std_ratio": 0.175 }, { "clip_ratio/high_max": 0.13982175141572953, "clip_ratio/high_mean": 0.018845621962100267, "clip_ratio/low_mean": 0.0009358229042845778, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019781444873660802, "completion_length": 79.41999969482421, "epoch": 0.028815675727595812, "grad_norm": 3.3437957763671875, "kl": 1.0034890450537204, "learning_rate": 9.982050919863332e-07, "loss": -0.0003, "reward": 1.332119607925415, "reward_std": 0.4401752531528473, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.4241847813129425, "step": 150, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.08667803611606359, "clip_ratio/high_mean": 0.01204416286200285, "clip_ratio/low_mean": 0.0012475995084969328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013291762379230932, "completion_length": 80.43250122070313, "epoch": 0.0307367207761022, "grad_norm": 3.763737678527832, "kl": 0.9024959966540337, "learning_rate": 9.979545325443564e-07, "loss": -0.0043, "reward": 1.3518987059593202, "reward_std": 0.46767728328704833, "rewards/code_format_reward": 0.9450000047683715, "rewards/code_reward": 0.4396993488073349, "step": 160, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.08449154160916805, "clip_ratio/high_mean": 0.012011481402441859, "clip_ratio/low_mean": 0.00172541297506541, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013736894307658076, "completion_length": 78.61250152587891, "epoch": 0.03265776582460859, "grad_norm": 7.203779220581055, "kl": 0.9198675453662872, "learning_rate": 9.976876479296167e-07, "loss": -0.0013, "reward": 1.3803849458694457, "reward_std": 0.4038102596998215, "rewards/code_format_reward": 0.9587499976158143, "rewards/code_reward": 0.4505049705505371, "step": 170, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.07188423536717892, "clip_ratio/high_mean": 0.013125935778953135, "clip_ratio/low_mean": 0.0036661239922977985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016792060085572304, "completion_length": 77.1875, "epoch": 0.03457881087311498, "grad_norm": 4.487454414367676, "kl": 1.7507148087024689, "learning_rate": 9.974044478684084e-07, "loss": 0.0129, "reward": 1.3845421075820923, "reward_std": 0.5211645245552063, "rewards/code_format_reward": 0.9325000047683716, "rewards/code_reward": 0.4591460168361664, "step": 180, "zero_std_ratio": 0.175 }, { "clip_ratio/high_max": 0.03638382372446358, "clip_ratio/high_mean": 0.005234482995001599, "clip_ratio/low_mean": 0.0020637288223952057, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007298211794113741, "completion_length": 72.73499984741211, "epoch": 0.03649985592162136, "grad_norm": 1.9644516706466675, "kl": 1.5947209149599075, "learning_rate": 9.97104942681622e-07, "loss": -0.0015, "reward": 1.5394827842712402, "reward_std": 0.42243914008140565, "rewards/code_format_reward": 0.9625, "rewards/code_reward": 0.5291163563728333, "step": 190, "zero_std_ratio": 0.225 }, { "clip_ratio/high_max": 0.2028519107028842, "clip_ratio/high_mean": 0.031164265819825232, "clip_ratio/low_mean": 0.00270410452503711, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03386837020516396, "completion_length": 69.81000061035157, "epoch": 0.03842090097012775, "grad_norm": 3.170088052749634, "kl": 1.0117133632302284, "learning_rate": 9.9678914328437e-07, "loss": 0.0113, "reward": 1.4108091354370118, "reward_std": 0.43394198417663576, "rewards/code_format_reward": 0.9675000071525574, "rewards/code_reward": 0.46352959871292115, "step": 200, "zero_std_ratio": 0.225 }, { "clip_ratio/high_max": 0.053923821565695106, "clip_ratio/high_mean": 0.009883182743215002, "clip_ratio/low_mean": 0.0038779765891376883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013761159335263073, "completion_length": 69.04750213623046, "epoch": 0.04034194601863414, "grad_norm": 2.5729293823242188, "kl": 1.1861489608883857, "learning_rate": 9.964570611855874e-07, "loss": -0.007, "reward": 1.4398113250732423, "reward_std": 0.39351261258125303, "rewards/code_format_reward": 0.9650000095367431, "rewards/code_reward": 0.47865564227104185, "step": 210, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.1575187448877841, "clip_ratio/high_mean": 0.020484526228392495, "clip_ratio/low_mean": 0.011988240911159664, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03247276756446808, "completion_length": 61.67000122070313, "epoch": 0.04226299106714052, "grad_norm": 9.919574737548828, "kl": 3.983895111083984, "learning_rate": 9.961087084876135e-07, "loss": 0.0076, "reward": 1.2202381372451783, "reward_std": 0.26475468575954436, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.36793155074119566, "step": 220, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.141914052516222, "clip_ratio/high_mean": 0.023408634401857854, "clip_ratio/low_mean": 0.004240041392040439, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027648675863747484, "completion_length": 67.37250213623047, "epoch": 0.04418403611564691, "grad_norm": 106.46134185791016, "kl": 2.121386268734932, "learning_rate": 9.957440978857498e-07, "loss": -0.0021, "reward": 1.3681801557540894, "reward_std": 0.37111111879348757, "rewards/code_format_reward": 0.9675000071525574, "rewards/code_reward": 0.4422150731086731, "step": 230, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.07315623210743069, "clip_ratio/high_mean": 0.01155225959373638, "clip_ratio/low_mean": 0.005734288269013632, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01728654802427627, "completion_length": 72.65750198364258, "epoch": 0.0461050811641533, "grad_norm": 3.1017534732818604, "kl": 0.882834991812706, "learning_rate": 9.953632426677983e-07, "loss": -0.0093, "reward": 1.484795618057251, "reward_std": 0.4526777356863022, "rewards/code_format_reward": 0.9662500023841858, "rewards/code_reward": 0.5008352994918823, "step": 240, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.06419091664720326, "clip_ratio/high_mean": 0.008793376400717534, "clip_ratio/low_mean": 0.0021902987034991385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010983675080933609, "completion_length": 88.21500091552734, "epoch": 0.048026126212659684, "grad_norm": 5.3243279457092285, "kl": 2.7226425796747207, "learning_rate": 9.94966156713577e-07, "loss": -0.0127, "reward": 1.455380654335022, "reward_std": 0.4675000965595245, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.4842528164386749, "step": 250, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.0741606397787109, "clip_ratio/high_mean": 0.012071207936969586, "clip_ratio/low_mean": 0.003062122967094183, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015133331064134836, "completion_length": 88.98250122070313, "epoch": 0.04994717126116607, "grad_norm": 2.7804369926452637, "kl": 0.6169008180499077, "learning_rate": 9.94552854494413e-07, "loss": 0.0033, "reward": 1.427869987487793, "reward_std": 0.4851543098688126, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.4717474699020386, "step": 260, "zero_std_ratio": 0.1 }, { "clip_ratio/high_max": 0.03937563952058554, "clip_ratio/high_mean": 0.0065028761862777175, "clip_ratio/low_mean": 0.004409579199273139, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01091245551360771, "completion_length": 87.42750091552735, "epoch": 0.05186821630967246, "grad_norm": 6.559643745422363, "kl": 0.4433484449982643, "learning_rate": 9.941233510726168e-07, "loss": -0.0018, "reward": 1.4182387351989747, "reward_std": 0.4612067699432373, "rewards/code_format_reward": 0.9412499904632569, "rewards/code_reward": 0.4738068819046021, "step": 270, "zero_std_ratio": 0.175 }, { "clip_ratio/high_max": 0.057588514033705, "clip_ratio/high_mean": 0.008462971181143076, "clip_ratio/low_mean": 0.007865038787713274, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016328010114375503, "completion_length": 79.69500122070312, "epoch": 0.05378926135817885, "grad_norm": 6.077131271362305, "kl": 0.6961165189743042, "learning_rate": 9.936776621009322e-07, "loss": 0.0038, "reward": 1.5715951919555664, "reward_std": 0.4179812580347061, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5420475661754608, "step": 280, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.025062982086092235, "clip_ratio/high_mean": 0.004761367203900591, "clip_ratio/low_mean": 0.0028623046877328307, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007623671973124147, "completion_length": 83.21750183105469, "epoch": 0.055710306406685235, "grad_norm": 6.066061019897461, "kl": 0.7484225794672966, "learning_rate": 9.932158038219662e-07, "loss": -0.0052, "reward": 1.1587857127189636, "reward_std": 0.39943512678146365, "rewards/code_format_reward": 0.9637500047683716, "rewards/code_reward": 0.3384553253650665, "step": 290, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.10117955654859542, "clip_ratio/high_mean": 0.013649052195250987, "clip_ratio/low_mean": 0.0008885912131518126, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014537643361836671, "completion_length": 84.00750122070312, "epoch": 0.057631351455191625, "grad_norm": 3.23887038230896, "kl": 0.8064253896474838, "learning_rate": 9.92737793067597e-07, "loss": -0.0034, "reward": 1.3393104553222657, "reward_std": 0.4101540923118591, "rewards/code_format_reward": 0.9549999952316284, "rewards/code_reward": 0.43090522289276123, "step": 300, "zero_std_ratio": 0.15 }, { "clip_ratio/high_max": 0.04703736044466496, "clip_ratio/high_mean": 0.007716302154585719, "clip_ratio/low_mean": 0.0006432932626921683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008359595513320528, "completion_length": 77.70500030517579, "epoch": 0.059552396503698014, "grad_norm": 3.357680320739746, "kl": 0.6727996915578842, "learning_rate": 9.922436472583614e-07, "loss": 0.0013, "reward": 1.6670202493667603, "reward_std": 0.4320096135139465, "rewards/code_format_reward": 0.9712500095367431, "rewards/code_reward": 0.5906976163387299, "step": 310, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.16380154211074113, "clip_ratio/high_mean": 0.03262772373855114, "clip_ratio/low_mean": 0.0011754593724617735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03380318162962794, "completion_length": 72.78750152587891, "epoch": 0.0614734415522044, "grad_norm": 3.652451992034912, "kl": 1.8953835844993592, "learning_rate": 9.91733384402818e-07, "loss": -0.005, "reward": 1.4837595462799071, "reward_std": 0.45500350296497344, "rewards/code_format_reward": 0.9662500023841858, "rewards/code_reward": 0.5003172576427459, "step": 320, "zero_std_ratio": 0.225 }, { "clip_ratio/high_max": 0.034284231485798955, "clip_ratio/high_mean": 0.005935872689587995, "clip_ratio/low_mean": 0.000911827472737059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0068477002554573115, "completion_length": 74.17000274658203, "epoch": 0.06339448660071079, "grad_norm": 1.5065704584121704, "kl": 0.40838020071387293, "learning_rate": 9.912070230968928e-07, "loss": -0.0054, "reward": 1.3848075151443482, "reward_std": 0.3038723856210709, "rewards/code_format_reward": 0.9612499952316285, "rewards/code_reward": 0.45209125280380247, "step": 330, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.05724322898313403, "clip_ratio/high_mean": 0.009350239217747002, "clip_ratio/low_mean": 0.0077414238592609765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0170916625414975, "completion_length": 80.06500091552735, "epoch": 0.06531553164921718, "grad_norm": 3.77842116355896, "kl": 0.8782595857977867, "learning_rate": 9.906645825232008e-07, "loss": -0.0023, "reward": 1.294193172454834, "reward_std": 0.3676457226276398, "rewards/code_format_reward": 0.9549999952316284, "rewards/code_reward": 0.4083465874195099, "step": 340, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.10199148450046777, "clip_ratio/high_mean": 0.018657304299995302, "clip_ratio/low_mean": 0.004165191331412643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022822496155276893, "completion_length": 86.5000015258789, "epoch": 0.06723657669772357, "grad_norm": 3.2845616340637207, "kl": 0.9463568836450577, "learning_rate": 9.901060824503463e-07, "loss": -0.0115, "reward": 1.485135293006897, "reward_std": 0.48840407729148866, "rewards/code_format_reward": 0.9487499833106995, "rewards/code_reward": 0.5053801357746124, "step": 350, "zero_std_ratio": 0.225 }, { "clip_ratio/high_max": 0.07233364712446928, "clip_ratio/high_mean": 0.009769158461131156, "clip_ratio/low_mean": 0.019356250233249737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029125408595427872, "completion_length": 80.54000091552734, "epoch": 0.06915762174622996, "grad_norm": 19.32016944885254, "kl": 1.1565445899963378, "learning_rate": 9.89531543232204e-07, "loss": 0.0045, "reward": 1.3412477493286132, "reward_std": 0.49785757064819336, "rewards/code_format_reward": 0.9599999904632568, "rewards/code_reward": 0.43062385320663454, "step": 360, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.11471173651516438, "clip_ratio/high_mean": 0.02246011425741017, "clip_ratio/low_mean": 0.00892345790634863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031383572798222306, "completion_length": 74.02000274658204, "epoch": 0.07107866679473633, "grad_norm": 2.2520923614501953, "kl": 1.074078917503357, "learning_rate": 9.889409858071753e-07, "loss": -0.0059, "reward": 1.5273491621017456, "reward_std": 0.414175683259964, "rewards/code_format_reward": 0.9775000095367432, "rewards/code_reward": 0.519299578666687, "step": 370, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.06336253914050757, "clip_ratio/high_mean": 0.01199121386744082, "clip_ratio/low_mean": 0.009130357182584703, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021121570840477943, "completion_length": 86.4000015258789, "epoch": 0.07299971184324272, "grad_norm": 4.1052961349487305, "kl": 1.3110491752624511, "learning_rate": 9.883344316974266e-07, "loss": -0.0079, "reward": 1.5908024072647096, "reward_std": 0.47413656711578367, "rewards/code_format_reward": 0.9600000023841858, "rewards/code_reward": 0.555401211977005, "step": 380, "zero_std_ratio": 0.2 }, { "clip_ratio/high_max": 0.04433182019274682, "clip_ratio/high_mean": 0.008989717412623577, "clip_ratio/low_mean": 0.006074265367351473, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015063982826541178, "completion_length": 86.175, "epoch": 0.07492075689174911, "grad_norm": 4.5202155113220215, "kl": 0.830048742890358, "learning_rate": 9.877119030081048e-07, "loss": -0.0051, "reward": 1.492829155921936, "reward_std": 0.3874175697565079, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.5007895469665528, "step": 390, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.1522485612425953, "clip_ratio/high_mean": 0.0220908185117878, "clip_ratio/low_mean": 0.012701757764443756, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03479257607832551, "completion_length": 78.91000137329101, "epoch": 0.0768418019402555, "grad_norm": 2.6146676540374756, "kl": 0.8627120085060597, "learning_rate": 9.870734224265308e-07, "loss": -0.0059, "reward": 1.5748756647109985, "reward_std": 0.3048340857028961, "rewards/code_format_reward": 0.987500011920929, "rewards/code_reward": 0.5405627965927124, "step": 400, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.16312104668468236, "clip_ratio/high_mean": 0.025311203207820654, "clip_ratio/low_mean": 0.008227485651150345, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03353868862614036, "completion_length": 77.27750091552734, "epoch": 0.07876284698876189, "grad_norm": 1.7234841585159302, "kl": 0.8750749856233597, "learning_rate": 9.864190132213742e-07, "loss": -0.0062, "reward": 1.6338460445404053, "reward_std": 0.3537067860364914, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.570673018693924, "step": 410, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.0936438184697181, "clip_ratio/high_mean": 0.014423616812564433, "clip_ratio/low_mean": 0.010347768076462672, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024771385360509157, "completion_length": 75.69749908447265, "epoch": 0.08068389203726828, "grad_norm": 2.0902154445648193, "kl": 1.264050543308258, "learning_rate": 9.857486992418036e-07, "loss": 0.0048, "reward": 1.644848608970642, "reward_std": 0.277804034948349, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5774242997169494, "step": 420, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.05532362968660891, "clip_ratio/high_mean": 0.00992250678827986, "clip_ratio/low_mean": 0.004125738283619285, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014048245223239064, "completion_length": 69.60749969482421, "epoch": 0.08260493708577465, "grad_norm": 3.702075481414795, "kl": 1.7400359451770782, "learning_rate": 9.850625049166189e-07, "loss": -0.0008, "reward": 1.5316168069839478, "reward_std": 0.275749945640564, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.5223708748817444, "step": 430, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.15841160174459218, "clip_ratio/high_mean": 0.02351265251636505, "clip_ratio/low_mean": 0.010281538363778963, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033794190967455506, "completion_length": 74.51000061035157, "epoch": 0.08452598213428104, "grad_norm": 3.4808361530303955, "kl": 1.2856003642082214, "learning_rate": 9.8436045525336e-07, "loss": -0.0035, "reward": 1.5067368984222411, "reward_std": 0.28293364942073823, "rewards/code_format_reward": 0.9737499833106995, "rewards/code_reward": 0.5099309325218201, "step": 440, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.06043836465105414, "clip_ratio/high_mean": 0.009103650611359626, "clip_ratio/low_mean": 0.002932069695089012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012035720515996218, "completion_length": 76.08250122070312, "epoch": 0.08644702718278743, "grad_norm": 3.665134906768799, "kl": 1.0338351279497147, "learning_rate": 9.836425758373958e-07, "loss": 0.0011, "reward": 1.4822889804840087, "reward_std": 0.18996141627430915, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.49926944375038146, "step": 450, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.21641009524464608, "clip_ratio/high_mean": 0.03260216782800853, "clip_ratio/low_mean": 0.007402116784942336, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040004284400492904, "completion_length": 73.13500213623047, "epoch": 0.08836807223129382, "grad_norm": 3.1982343196868896, "kl": 0.6477661892771721, "learning_rate": 9.829088928309923e-07, "loss": -0.0043, "reward": 1.7202057361602783, "reward_std": 0.25773381292819975, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.6163528442382813, "step": 460, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.09453173456713557, "clip_ratio/high_mean": 0.015337946941144764, "clip_ratio/low_mean": 0.005975433619460091, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02131338034523651, "completion_length": 81.9000015258789, "epoch": 0.09028911727980021, "grad_norm": 1.441091775894165, "kl": 0.6155861958861351, "learning_rate": 9.82159432972358e-07, "loss": -0.0063, "reward": 1.4617766380310058, "reward_std": 0.24772228300571442, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.48651331663131714, "step": 470, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.16705528497695923, "clip_ratio/high_mean": 0.026639112271368504, "clip_ratio/low_mean": 0.0035399875399889425, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03017909936606884, "completion_length": 77.79500274658203, "epoch": 0.0922101623283066, "grad_norm": 47.74139404296875, "kl": 1.360982394218445, "learning_rate": 9.813942235746705e-07, "loss": 0.0034, "reward": 1.5168325901031494, "reward_std": 0.3997103154659271, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5149787843227387, "step": 480, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.26956315375864504, "clip_ratio/high_mean": 0.04211876043118536, "clip_ratio/low_mean": 0.002336682367604226, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04445544336922467, "completion_length": 86.21500091552734, "epoch": 0.09413120737681299, "grad_norm": 3.7244272232055664, "kl": 2.59437358379364, "learning_rate": 9.80613292525081e-07, "loss": 0.0038, "reward": 1.6131777048110962, "reward_std": 0.32231712639331817, "rewards/code_format_reward": 0.9799999833106995, "rewards/code_reward": 0.5615888297557831, "step": 490, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.22240130547434092, "clip_ratio/high_mean": 0.044074146053753795, "clip_ratio/low_mean": 0.012573283386882395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0566474299877882, "completion_length": 72.23500061035156, "epoch": 0.09605225242531937, "grad_norm": 2.852999687194824, "kl": 1.615745335817337, "learning_rate": 9.79816668283697e-07, "loss": 0.0017, "reward": 1.5203128576278686, "reward_std": 0.3012717217206955, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.517031443119049, "step": 500, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.15330625362694264, "clip_ratio/high_mean": 0.02403738833963871, "clip_ratio/low_mean": 0.004583830677438528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028621218353509902, "completion_length": 74.30000076293945, "epoch": 0.09797329747382576, "grad_norm": 2.484840154647827, "kl": 2.1540999174118043, "learning_rate": 9.790043798825458e-07, "loss": 0.0073, "reward": 1.5013367414474488, "reward_std": 0.24206546545028687, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.508168363571167, "step": 510, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.15383050357922912, "clip_ratio/high_mean": 0.027125787048134953, "clip_ratio/low_mean": 0.002593657124089077, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029719442850910126, "completion_length": 65.0400016784668, "epoch": 0.09989434252233215, "grad_norm": 7.2150959968566895, "kl": 1.1968895211815833, "learning_rate": 9.781764569245178e-07, "loss": -0.006, "reward": 1.510750651359558, "reward_std": 0.41533524394035337, "rewards/code_format_reward": 0.9712500095367431, "rewards/code_reward": 0.5125628054141999, "step": 520, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.109321213606745, "clip_ratio/high_mean": 0.018354640086181463, "clip_ratio/low_mean": 0.011131488461978733, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029486127989366652, "completion_length": 74.52250213623047, "epoch": 0.10181538757083854, "grad_norm": 1.8456060886383057, "kl": 0.7155197218060494, "learning_rate": 9.773329295822844e-07, "loss": 0.0073, "reward": 1.5899319171905517, "reward_std": 0.3179755389690399, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5512159705162049, "step": 530, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.04905872759409249, "clip_ratio/high_mean": 0.008021075790748, "clip_ratio/low_mean": 0.004390958754811436, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012412034533917904, "completion_length": 67.07500076293945, "epoch": 0.10373643261934493, "grad_norm": 4.641266345977783, "kl": 0.7290919035673141, "learning_rate": 9.764738285972015e-07, "loss": 0.0008, "reward": 1.300760817527771, "reward_std": 0.3361863404512405, "rewards/code_format_reward": 0.9537500143051147, "rewards/code_reward": 0.4119428813457489, "step": 540, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.1825170351192355, "clip_ratio/high_mean": 0.027253909036517143, "clip_ratio/low_mean": 0.0015975978298229166, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028851506439968942, "completion_length": 73.99250030517578, "epoch": 0.10565747766785132, "grad_norm": 1.1770566701889038, "kl": 1.328820213675499, "learning_rate": 9.755991852781876e-07, "loss": -0.0023, "reward": 1.5671115159988402, "reward_std": 0.34309983551502227, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5401182293891906, "step": 550, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.12550847120583059, "clip_ratio/high_mean": 0.025771993771195413, "clip_ratio/low_mean": 0.0035689805867150427, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029340974800288678, "completion_length": 71.76750030517579, "epoch": 0.1075785227163577, "grad_norm": 0.3435879647731781, "kl": 2.12383970618248, "learning_rate": 9.747090315005836e-07, "loss": 0.0024, "reward": 1.5273173809051515, "reward_std": 0.2889336168766022, "rewards/code_format_reward": 0.9649999976158142, "rewards/code_reward": 0.5224087119102478, "step": 560, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.0834595168940723, "clip_ratio/high_mean": 0.015262311231344939, "clip_ratio/low_mean": 0.021645180485211312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03690749178640544, "completion_length": 79.53250122070312, "epoch": 0.10949956776486408, "grad_norm": 1.7026695013046265, "kl": 1.6705755025148392, "learning_rate": 9.738033997049902e-07, "loss": 0.1708, "reward": 1.5908133745193482, "reward_std": 0.3691225051879883, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.5475941836833954, "step": 570, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.18124623028561473, "clip_ratio/high_mean": 0.02496154889231548, "clip_ratio/low_mean": 0.020611650816863402, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04557319916784763, "completion_length": 85.51750183105469, "epoch": 0.11142061281337047, "grad_norm": 18.138025283813477, "kl": 4.237766814231873, "learning_rate": 9.728823228960862e-07, "loss": -0.0051, "reward": 1.5469601631164551, "reward_std": 0.37420718297362326, "rewards/code_format_reward": 0.975000011920929, "rewards/code_reward": 0.5297300696372986, "step": 580, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.014268473512493074, "clip_ratio/high_mean": 0.0028658110386459157, "clip_ratio/low_mean": 0.0056301898322999476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008496000757440924, "completion_length": 80.10750274658203, "epoch": 0.11334165786187686, "grad_norm": 5.16138219833374, "kl": 0.6609396353363991, "learning_rate": 9.71945834641426e-07, "loss": -0.004, "reward": 1.4476024627685546, "reward_std": 0.3472218900918961, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.4813012361526489, "step": 590, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.17693078136071563, "clip_ratio/high_mean": 0.02441923434380442, "clip_ratio/low_mean": 0.012987980741309002, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037407214660197495, "completion_length": 83.96500091552734, "epoch": 0.11526270291038325, "grad_norm": 1.7465465068817139, "kl": 1.0383819937705994, "learning_rate": 9.709939690702158e-07, "loss": -0.0078, "reward": 1.4550770282745362, "reward_std": 0.3056318134069443, "rewards/code_format_reward": 0.9587500095367432, "rewards/code_reward": 0.48785099387168884, "step": 600, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.1888352295383811, "clip_ratio/high_mean": 0.026437551854178308, "clip_ratio/low_mean": 0.0054486555512994524, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031886206939816475, "completion_length": 79.63500213623047, "epoch": 0.11718374795888964, "grad_norm": 5.674210548400879, "kl": 1.2073093384504319, "learning_rate": 9.700267608720692e-07, "loss": -0.0021, "reward": 1.4424492359161376, "reward_std": 0.3397494524717331, "rewards/code_format_reward": 0.9725000143051148, "rewards/code_reward": 0.4780996203422546, "step": 610, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.09671425293199717, "clip_ratio/high_mean": 0.020163473271531986, "clip_ratio/low_mean": 0.006395513273309917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02655898590455763, "completion_length": 75.22750091552734, "epoch": 0.11910479300739603, "grad_norm": 5.531320571899414, "kl": 2.2407817423343657, "learning_rate": 9.690442452957448e-07, "loss": -0.0021, "reward": 1.5595922470092773, "reward_std": 0.28165863305330274, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.5351086378097534, "step": 620, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.11810005996376276, "clip_ratio/high_mean": 0.02500568316318095, "clip_ratio/low_mean": 0.00357620443101041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028581888042390348, "completion_length": 80.09000091552734, "epoch": 0.1210258380559024, "grad_norm": 2.165558338165283, "kl": 1.546025463938713, "learning_rate": 9.680464581478594e-07, "loss": -0.0037, "reward": 1.51439368724823, "reward_std": 0.3320598304271698, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.5140718221664429, "step": 630, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.10313799739815295, "clip_ratio/high_mean": 0.017414161982014776, "clip_ratio/low_mean": 0.009596780824358575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027010941854678096, "completion_length": 76.15749969482422, "epoch": 0.1229468831044088, "grad_norm": 5.05511999130249, "kl": 1.6615911841392517, "learning_rate": 9.670334357915852e-07, "loss": 0.0033, "reward": 1.5930729150772094, "reward_std": 0.3864523351192474, "rewards/code_format_reward": 0.9662500023841858, "rewards/code_reward": 0.554973942041397, "step": 640, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.1653188370168209, "clip_ratio/high_mean": 0.027094300370663404, "clip_ratio/low_mean": 0.0033949258620850744, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03048922661691904, "completion_length": 74.23250274658203, "epoch": 0.12486792815291518, "grad_norm": 1.1590094566345215, "kl": 0.39487394616007804, "learning_rate": 9.660052151453228e-07, "loss": -0.006, "reward": 1.7198987245559691, "reward_std": 0.3215783953666687, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.613699346780777, "step": 650, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.2655821519903839, "clip_ratio/high_mean": 0.03813204998150468, "clip_ratio/low_mean": 0.017123100493336096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05525515023618936, "completion_length": 79.31999969482422, "epoch": 0.12678897320142157, "grad_norm": 2.8189809322357178, "kl": 0.9924295842647552, "learning_rate": 9.649618336813565e-07, "loss": -0.0022, "reward": 1.710445189476013, "reward_std": 0.2906018912792206, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.6111600875854493, "step": 660, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.10447313897311687, "clip_ratio/high_mean": 0.017084641277324408, "clip_ratio/low_mean": 0.018559307692339645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03564394909190014, "completion_length": 73.31750183105468, "epoch": 0.12871001824992795, "grad_norm": 7.561813831329346, "kl": 1.0190230280160903, "learning_rate": 9.639033294244894e-07, "loss": -0.0059, "reward": 1.4508479833602905, "reward_std": 0.2639226779341698, "rewards/code_format_reward": 0.9724999904632569, "rewards/code_reward": 0.4822989523410797, "step": 670, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.17416613902896644, "clip_ratio/high_mean": 0.02931727101095021, "clip_ratio/low_mean": 0.013709834642941131, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04302710462361574, "completion_length": 75.30500183105468, "epoch": 0.13063106329843435, "grad_norm": 4.0138373374938965, "kl": 1.8731355726718903, "learning_rate": 9.628297409506558e-07, "loss": 0.0038, "reward": 1.5990655183792115, "reward_std": 0.38845544308423996, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.5554702281951904, "step": 680, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.14133978222962468, "clip_ratio/high_mean": 0.025468734742025843, "clip_ratio/low_mean": 0.0034107466402929277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028879481457988732, "completion_length": 71.69250183105468, "epoch": 0.13255210834694073, "grad_norm": 2.7108314037323, "kl": 1.0770379617810248, "learning_rate": 9.61741107385517e-07, "loss": 0.0015, "reward": 1.357295000553131, "reward_std": 0.16353759765625, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.43333501517772677, "step": 690, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.2215075224637985, "clip_ratio/high_mean": 0.03973329542204738, "clip_ratio/low_mean": 0.021483630378497764, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06121692657470703, "completion_length": 77.00250244140625, "epoch": 0.13447315339544713, "grad_norm": 3.874828338623047, "kl": 1.798163938522339, "learning_rate": 9.606374684030354e-07, "loss": -0.0002, "reward": 1.4897700071334838, "reward_std": 0.3036611869931221, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.5023849844932556, "step": 700, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.26057110670953987, "clip_ratio/high_mean": 0.04422192363999784, "clip_ratio/low_mean": 0.012507367390207946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05672929054126143, "completion_length": 68.01749954223632, "epoch": 0.1363941984439535, "grad_norm": 1.9008493423461914, "kl": 1.1601522982120513, "learning_rate": 9.595188642240268e-07, "loss": -0.006, "reward": 1.5408167839050293, "reward_std": 0.23992418646812438, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5244708836078644, "step": 710, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.11190514008048921, "clip_ratio/high_mean": 0.022988432584679686, "clip_ratio/low_mean": 0.003842631517909467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02683106428885367, "completion_length": 70.91749954223633, "epoch": 0.1383152434924599, "grad_norm": 2.230220317840576, "kl": 0.6176944851875306, "learning_rate": 9.58385335614697e-07, "loss": -0.0038, "reward": 1.474353313446045, "reward_std": 0.22789922058582307, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.49092662930488584, "step": 720, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.22790296860039233, "clip_ratio/high_mean": 0.043722260277718306, "clip_ratio/low_mean": 0.005503303511068225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0492255637422204, "completion_length": 70.33000183105469, "epoch": 0.1402362885409663, "grad_norm": 3.880234956741333, "kl": 1.7978762328624724, "learning_rate": 9.572369238851546e-07, "loss": -0.01, "reward": 1.7555195808410644, "reward_std": 0.30654080510139464, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6311972856521606, "step": 730, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.13005290240980685, "clip_ratio/high_mean": 0.02253831790876575, "clip_ratio/low_mean": 0.0076317260100040585, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030170044326223434, "completion_length": 67.4625015258789, "epoch": 0.14215733358947266, "grad_norm": 31014.41015625, "kl": 2.5802926242351534, "learning_rate": 9.560736708879055e-07, "loss": 4.1316, "reward": 1.391554856300354, "reward_std": 0.3107602626085281, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.4501524269580841, "step": 740, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.21672796942293643, "clip_ratio/high_mean": 0.03920850001741201, "clip_ratio/low_mean": 0.0084746521897614, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.047683153115212915, "completion_length": 71.03750076293946, "epoch": 0.14407837863797907, "grad_norm": 1.3094109296798706, "kl": 4.56303431391716, "learning_rate": 9.54895619016329e-07, "loss": 0.0111, "reward": 1.5939582109451294, "reward_std": 0.2379148319363594, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.5547916054725647, "step": 750, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.08126737037673593, "clip_ratio/high_mean": 0.01269659586250782, "clip_ratio/low_mean": 0.006480468995869159, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019177064718678593, "completion_length": 74.09750213623047, "epoch": 0.14599942368648544, "grad_norm": 3.0267083644866943, "kl": 1.5844107165932655, "learning_rate": 9.53702811203131e-07, "loss": 0.0048, "reward": 1.4744285106658936, "reward_std": 0.2754403457045555, "rewards/code_format_reward": 0.9900000095367432, "rewards/code_reward": 0.489714241027832, "step": 760, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.22151243952102959, "clip_ratio/high_mean": 0.038386100489879026, "clip_ratio/low_mean": 0.001766498590586707, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04015259912703186, "completion_length": 73.72750244140624, "epoch": 0.14792046873499184, "grad_norm": 3596482.75, "kl": 0.6901701986789703, "learning_rate": 9.524952909187801e-07, "loss": 83.9443, "reward": 1.4019340753555298, "reward_std": 0.24908357337117196, "rewards/code_format_reward": 0.9749999880790711, "rewards/code_reward": 0.45721703171730044, "step": 770, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.07684345319867134, "clip_ratio/high_mean": 0.014277776470407844, "clip_ratio/low_mean": 0.016169815976172685, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0304475924000144, "completion_length": 79.24250183105468, "epoch": 0.14984151378349822, "grad_norm": 3.468223810195923, "kl": 0.45489892959594724, "learning_rate": 9.512731021699245e-07, "loss": -0.0056, "reward": 1.580666732788086, "reward_std": 0.41472728848457335, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5459583520889282, "step": 780, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.10067678079940379, "clip_ratio/high_mean": 0.013439147116150707, "clip_ratio/low_mean": 0.023053765966324136, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03649291144683957, "completion_length": 72.04750137329101, "epoch": 0.15176255883200462, "grad_norm": 13.193933486938477, "kl": 1.6161374658346177, "learning_rate": 9.500362894977864e-07, "loss": 0.0007, "reward": 1.6252036333084106, "reward_std": 0.3433967262506485, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5666643261909485, "step": 790, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.11107501722872257, "clip_ratio/high_mean": 0.01587685807608068, "clip_ratio/low_mean": 0.001843169682251755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017720027733594178, "completion_length": 77.34250183105469, "epoch": 0.153683603880511, "grad_norm": 3.4086289405822754, "kl": 0.735039034485817, "learning_rate": 9.487848979765399e-07, "loss": -0.0033, "reward": 1.7214166164398192, "reward_std": 0.3059865742921829, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6125832796096802, "step": 800, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.06381021924316883, "clip_ratio/high_mean": 0.012221441417932511, "clip_ratio/low_mean": 0.002595777277019806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014817218482494354, "completion_length": 78.94500045776367, "epoch": 0.15560464892901738, "grad_norm": 2.894174098968506, "kl": 0.9337424471974373, "learning_rate": 9.475189732116677e-07, "loss": -0.0074, "reward": 1.5309076070785523, "reward_std": 0.36832110285758973, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.5201413094997406, "step": 810, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.0614451477304101, "clip_ratio/high_mean": 0.011137601570226252, "clip_ratio/low_mean": 0.015545779425883666, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02668338119983673, "completion_length": 80.46750030517578, "epoch": 0.15752569397752378, "grad_norm": 1.5945316553115845, "kl": 1.666656306385994, "learning_rate": 9.462385613382997e-07, "loss": -0.0138, "reward": 1.4196115970611571, "reward_std": 0.3273743912577629, "rewards/code_format_reward": 0.9625, "rewards/code_reward": 0.4691807866096497, "step": 820, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.0729204102884978, "clip_ratio/high_mean": 0.011435226618777961, "clip_ratio/low_mean": 0.0035716916667297483, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015006918273866177, "completion_length": 83.92250061035156, "epoch": 0.15944673902603015, "grad_norm": 3.7898244857788086, "kl": 3.157607713341713, "learning_rate": 9.449437090195312e-07, "loss": 0.6488, "reward": 1.5506922006607056, "reward_std": 0.3165741294622421, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.532533586025238, "step": 830, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.22014709915965797, "clip_ratio/high_mean": 0.030960237560793757, "clip_ratio/low_mean": 0.008386016800068318, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03934625396504998, "completion_length": 79.59750213623047, "epoch": 0.16136778407453656, "grad_norm": 3.164461851119995, "kl": 0.48004563301801684, "learning_rate": 9.436344634447226e-07, "loss": 0.0002, "reward": 1.4315959692001343, "reward_std": 0.2676436066627502, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.4714229583740234, "step": 840, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.20247683776542544, "clip_ratio/high_mean": 0.040387283614836636, "clip_ratio/low_mean": 0.0031327656004577877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04352005030959845, "completion_length": 81.20750274658204, "epoch": 0.16328882912304293, "grad_norm": 3.2722160816192627, "kl": 0.8405016213655472, "learning_rate": 9.42310872327779e-07, "loss": -0.0002, "reward": 1.550826621055603, "reward_std": 0.4091781198978424, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.5322882652282714, "step": 850, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.061589781753718854, "clip_ratio/high_mean": 0.011824411456473172, "clip_ratio/low_mean": 0.011703617853345349, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023528029827866705, "completion_length": 62.61500244140625, "epoch": 0.1652098741715493, "grad_norm": 0.2732953727245331, "kl": 1.4307941138744353, "learning_rate": 9.409729839054123e-07, "loss": 0.0075, "reward": 1.5864750623703003, "reward_std": 0.2073097825050354, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5473000288009644, "step": 860, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.1379000276327133, "clip_ratio/high_mean": 0.02470994950272143, "clip_ratio/low_mean": 0.004926441749557853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029636391997337343, "completion_length": 77.2400032043457, "epoch": 0.1671309192200557, "grad_norm": 3.488050699234009, "kl": 0.9351878672838211, "learning_rate": 9.396208469353826e-07, "loss": -0.0059, "reward": 1.5735363721847535, "reward_std": 0.3392061233520508, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.5436432063579559, "step": 870, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.07835716316476464, "clip_ratio/high_mean": 0.014919109572656453, "clip_ratio/low_mean": 0.006504692946327851, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021423802757635713, "completion_length": 74.78000183105469, "epoch": 0.1690519642685621, "grad_norm": 5.493437767028809, "kl": 1.060418888926506, "learning_rate": 9.382545106947214e-07, "loss": -0.0036, "reward": 1.745260238647461, "reward_std": 0.297343048453331, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.6254426181316376, "step": 880, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.12418304020538926, "clip_ratio/high_mean": 0.022332211420871318, "clip_ratio/low_mean": 0.022319327194418294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04465153906494379, "completion_length": 84.53250122070312, "epoch": 0.1709730093170685, "grad_norm": 5.462327480316162, "kl": 1.5445073664188385, "learning_rate": 9.368740249779358e-07, "loss": 0.0049, "reward": 1.473905611038208, "reward_std": 0.33463606536388396, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.49351527690887453, "step": 890, "zero_std_ratio": 0.25 }, { "clip_ratio/high_max": 0.08285986992996186, "clip_ratio/high_mean": 0.015584854045300744, "clip_ratio/low_mean": 0.002020698119304143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017605551629094406, "completion_length": 85.78250122070312, "epoch": 0.17289405436557487, "grad_norm": 3.7394657135009766, "kl": 1.2308152213692665, "learning_rate": 9.354794400951942e-07, "loss": 0.0006, "reward": 1.3064285874366761, "reward_std": 0.3360040634870529, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.40852679312229156, "step": 900, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.06636467641219497, "clip_ratio/high_mean": 0.01088127460097894, "clip_ratio/low_mean": 0.005357642179296818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01623891657218337, "completion_length": 86.17000122070313, "epoch": 0.17481509941408127, "grad_norm": 3.883023977279663, "kl": 0.5634948700666428, "learning_rate": 9.340708068704917e-07, "loss": -0.0132, "reward": 1.6946633338928223, "reward_std": 0.2633577108383179, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6004566550254822, "step": 910, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.12004003385081888, "clip_ratio/high_mean": 0.01987670698435977, "clip_ratio/low_mean": 0.00857236894662492, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028449075785465537, "completion_length": 83.18000030517578, "epoch": 0.17673614446258765, "grad_norm": 5.860812187194824, "kl": 1.0160879641771317, "learning_rate": 9.326481766397991e-07, "loss": -0.0011, "reward": 1.5558514595031738, "reward_std": 0.28839708790183066, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5344882309436798, "step": 920, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.06109805963933468, "clip_ratio/high_mean": 0.00847023066598922, "clip_ratio/low_mean": 0.004858631710521877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01332886223681271, "completion_length": 85.00750122070312, "epoch": 0.17865718951109402, "grad_norm": 2.287473440170288, "kl": 0.629003182053566, "learning_rate": 9.312116012491916e-07, "loss": -0.0155, "reward": 1.3984088182449341, "reward_std": 0.38690108954906466, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.45451690554618834, "step": 930, "zero_std_ratio": 0.275 }, { "clip_ratio/high_max": 0.11440350348129869, "clip_ratio/high_mean": 0.021249773760791867, "clip_ratio/low_mean": 0.010212704542209395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03146247826516628, "completion_length": 85.56500244140625, "epoch": 0.18057823455960043, "grad_norm": 2.5915870666503906, "kl": 0.6908730089664459, "learning_rate": 9.297611330529588e-07, "loss": -0.0019, "reward": 1.5472615003585815, "reward_std": 0.34995803236961365, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.529568213224411, "step": 940, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.11480946252122522, "clip_ratio/high_mean": 0.021491143060848115, "clip_ratio/low_mean": 0.007519157652859576, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029010300803929568, "completion_length": 72.10000152587891, "epoch": 0.1824992796081068, "grad_norm": 1.5689059495925903, "kl": 0.7929495573043823, "learning_rate": 9.282968249116975e-07, "loss": -0.0054, "reward": 1.8428637742996217, "reward_std": 0.2614489495754242, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6745568513870239, "step": 950, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.3971266824752092, "clip_ratio/high_mean": 0.05282264268025756, "clip_ratio/low_mean": 0.004530514683574438, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05735315615311265, "completion_length": 70.86750030517578, "epoch": 0.1844203246566132, "grad_norm": 3.4463512897491455, "kl": 0.8312035664916039, "learning_rate": 9.268187301903852e-07, "loss": 0.0003, "reward": 1.6929683208465576, "reward_std": 0.2562918782234192, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.600546681880951, "step": 960, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.1673737466800958, "clip_ratio/high_mean": 0.03157579629332759, "clip_ratio/low_mean": 0.012752554472535848, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.044328349828720096, "completion_length": 76.92749938964843, "epoch": 0.18634136970511958, "grad_norm": 3.0548853874206543, "kl": 0.6291002959012986, "learning_rate": 9.253269027564339e-07, "loss": -0.005, "reward": 1.4119353413581848, "reward_std": 0.33177118599414823, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.46065517961978913, "step": 970, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.15495819319039583, "clip_ratio/high_mean": 0.022329012653790413, "clip_ratio/low_mean": 0.006486268152366392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02881528080906719, "completion_length": 68.58250122070312, "epoch": 0.18826241475362598, "grad_norm": 7.065835952758789, "kl": 1.0375685960054397, "learning_rate": 9.238213969777292e-07, "loss": -0.0046, "reward": 1.6331373691558837, "reward_std": 0.2626490265130997, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.5703186750411987, "step": 980, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.10045178183354438, "clip_ratio/high_mean": 0.020599483215482904, "clip_ratio/low_mean": 0.007835417747264728, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02843490142840892, "completion_length": 71.47500076293946, "epoch": 0.19018345980213236, "grad_norm": 4.533353328704834, "kl": 2.011890631914139, "learning_rate": 9.223022677206474e-07, "loss": -0.0001, "reward": 1.7676753044128417, "reward_std": 0.25886805951595304, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6382126212120056, "step": 990, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.12670890614390373, "clip_ratio/high_mean": 0.022856980562210083, "clip_ratio/low_mean": 0.016935013599868397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039791994355618955, "completion_length": 70.40500106811524, "epoch": 0.19210450485063874, "grad_norm": 9.587749481201172, "kl": 1.1125446915626527, "learning_rate": 9.207695703480562e-07, "loss": -0.0049, "reward": 1.5464402914047242, "reward_std": 0.30552313327789304, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5257201135158539, "step": 1000, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.13173274043947458, "clip_ratio/high_mean": 0.021644592471420764, "clip_ratio/low_mean": 0.01016495683870744, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03180954959243536, "completion_length": 81.64500122070312, "epoch": 0.19402554989914514, "grad_norm": 61.59896469116211, "kl": 1.3899411320686341, "learning_rate": 9.192233607172973e-07, "loss": 0.0117, "reward": 1.5586263418197632, "reward_std": 0.32884465754032133, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5327506422996521, "step": 1010, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.38223748579621314, "clip_ratio/high_mean": 0.05293128285557032, "clip_ratio/low_mean": 0.008536407171050087, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06146768992766738, "completion_length": 75.7425033569336, "epoch": 0.19594659494765151, "grad_norm": 0.8699261546134949, "kl": 2.267198386788368, "learning_rate": 9.17663695178151e-07, "loss": 0.0007, "reward": 1.4393709778785706, "reward_std": 0.19248414039611816, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.4724979490041733, "step": 1020, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.05449964143335819, "clip_ratio/high_mean": 0.008484689320903271, "clip_ratio/low_mean": 0.0017167545520351268, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010201443906407804, "completion_length": 74.80750045776367, "epoch": 0.19786763999615792, "grad_norm": 3.8721530437469482, "kl": 1.034875027090311, "learning_rate": 9.160906305707814e-07, "loss": -0.0065, "reward": 1.6229804277420044, "reward_std": 0.21886643767356873, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5624276876449585, "step": 1030, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.057783479290083054, "clip_ratio/high_mean": 0.008794186974409968, "clip_ratio/low_mean": 0.01261859169753734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02141277852933854, "completion_length": 80.94500122070312, "epoch": 0.1997886850446643, "grad_norm": 2.0369646549224854, "kl": 0.47016064152121545, "learning_rate": 9.145042242236667e-07, "loss": -0.0016, "reward": 1.5200274467468262, "reward_std": 0.2379522889852524, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.5147012054920197, "step": 1040, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.05080003601033241, "clip_ratio/high_mean": 0.0081847107532667, "clip_ratio/low_mean": 0.003685746184783056, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011870456766337157, "completion_length": 86.39750213623047, "epoch": 0.2017097300931707, "grad_norm": 1.86152184009552, "kl": 0.9119557231664658, "learning_rate": 9.129045339515085e-07, "loss": -0.0025, "reward": 1.338998556137085, "reward_std": 0.29172809422016144, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.42481178045272827, "step": 1050, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.11181259918957949, "clip_ratio/high_mean": 0.01702371232677251, "clip_ratio/low_mean": 0.003983464353950694, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021007176581770183, "completion_length": 89.0250015258789, "epoch": 0.20363077514167707, "grad_norm": 1.664932370185852, "kl": 1.7415984645485878, "learning_rate": 9.112916180531254e-07, "loss": -0.0009, "reward": 1.6867451906204223, "reward_std": 0.26216842532157897, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.5971225798130035, "step": 1060, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.1619036693125963, "clip_ratio/high_mean": 0.02605230761691928, "clip_ratio/low_mean": 0.011786457896232606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03783876644447446, "completion_length": 80.52750091552734, "epoch": 0.20555182019018345, "grad_norm": 3.1480722427368164, "kl": 2.3309426337480543, "learning_rate": 9.096655353093286e-07, "loss": -0.0108, "reward": 1.7797099113464356, "reward_std": 0.3243818938732147, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6429799437522888, "step": 1070, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.145719656907022, "clip_ratio/high_mean": 0.02472380215767771, "clip_ratio/low_mean": 0.01881317695369944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043536979146301745, "completion_length": 75.46750183105469, "epoch": 0.20747286523868985, "grad_norm": 4.7426347732543945, "kl": 0.7767296731472015, "learning_rate": 9.080263449807788e-07, "loss": 0.0042, "reward": 1.5128322124481202, "reward_std": 0.26058112680912016, "rewards/code_format_reward": 0.9662500023841858, "rewards/code_reward": 0.514853572845459, "step": 1080, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.04860758520662785, "clip_ratio/high_mean": 0.00921072952914983, "clip_ratio/low_mean": 0.013458288778201677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022669017571024595, "completion_length": 77.22750244140624, "epoch": 0.20939391028719623, "grad_norm": 2.2836835384368896, "kl": 0.6794285923242569, "learning_rate": 9.063741068058278e-07, "loss": -0.0028, "reward": 1.5665315628051757, "reward_std": 0.23656646013259888, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5398283064365387, "step": 1090, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.2074673067778349, "clip_ratio/high_mean": 0.036228268034756185, "clip_ratio/low_mean": 0.003734398238157155, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039962667226791385, "completion_length": 91.18000030517578, "epoch": 0.21131495533570263, "grad_norm": 7.916996002197266, "kl": 1.0919141083955766, "learning_rate": 9.0470888099834e-07, "loss": 0.1666, "reward": 1.68690767288208, "reward_std": 0.32907233834266664, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5984537959098816, "step": 1100, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.16652454435825348, "clip_ratio/high_mean": 0.027045656740665436, "clip_ratio/low_mean": 0.006342244842380751, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03338790265843272, "completion_length": 80.02000122070312, "epoch": 0.213236000384209, "grad_norm": 24.34583282470703, "kl": 1.00138920545578, "learning_rate": 9.030307282454995e-07, "loss": -0.0023, "reward": 1.6111816883087158, "reward_std": 0.24880893230438234, "rewards/code_format_reward": 0.9724999904632569, "rewards/code_reward": 0.5624658226966858, "step": 1110, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.18198216175660492, "clip_ratio/high_mean": 0.02493738690391183, "clip_ratio/low_mean": 0.004894328210502863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029831714881584048, "completion_length": 71.95, "epoch": 0.2151570454327154, "grad_norm": 2.7608304023742676, "kl": 0.971074515581131, "learning_rate": 9.013397097055971e-07, "loss": -0.0022, "reward": 1.6884326457977294, "reward_std": 0.3369467526674271, "rewards/code_format_reward": 0.9712499856948853, "rewards/code_reward": 0.6014038324356079, "step": 1120, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.16543546952307225, "clip_ratio/high_mean": 0.02493141880258918, "clip_ratio/low_mean": 0.007064808573340997, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03199622761458158, "completion_length": 72.50500030517578, "epoch": 0.21707809048122179, "grad_norm": 7.147952556610107, "kl": 6.163409499824047, "learning_rate": 8.996358870058017e-07, "loss": 0.0081, "reward": 1.5753276348114014, "reward_std": 0.2175431028008461, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.5395387947559357, "step": 1130, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.05989155264105648, "clip_ratio/high_mean": 0.009021314003621227, "clip_ratio/low_mean": 0.014251881884410978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023273196443915366, "completion_length": 74.57750091552734, "epoch": 0.21899913552972816, "grad_norm": 17.58907699584961, "kl": 0.9839092344045639, "learning_rate": 8.979193222399154e-07, "loss": -0.0006, "reward": 1.570918822288513, "reward_std": 0.27486068904399874, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5420219123363494, "step": 1140, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.23600016683340072, "clip_ratio/high_mean": 0.04525289600715041, "clip_ratio/low_mean": 0.00799154011765495, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05324443739373237, "completion_length": 71.89750061035156, "epoch": 0.22092018057823457, "grad_norm": 8.010896682739258, "kl": 1.0768774889409543, "learning_rate": 8.961900779661095e-07, "loss": 0.0139, "reward": 1.5848765134811402, "reward_std": 0.21965934410691262, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5458757638931274, "step": 1150, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.10782922431826591, "clip_ratio/high_mean": 0.014393238560296595, "clip_ratio/low_mean": 0.0046036563231609765, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018996895058080554, "completion_length": 78.2925018310547, "epoch": 0.22284122562674094, "grad_norm": 3.7750465869903564, "kl": 0.5210637584328651, "learning_rate": 8.944482172046448e-07, "loss": -0.0065, "reward": 1.6227028608322143, "reward_std": 0.2484603613615036, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.5660388946533204, "step": 1160, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.13120641289278864, "clip_ratio/high_mean": 0.019719564472325146, "clip_ratio/low_mean": 0.00696407729992643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026683641644194723, "completion_length": 81.64000091552734, "epoch": 0.22476227067524734, "grad_norm": 1.1691230535507202, "kl": 0.5908193171024323, "learning_rate": 8.926938034355751e-07, "loss": -0.0008, "reward": 1.6598936080932618, "reward_std": 0.3073273479938507, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.5830717980861664, "step": 1170, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.26425148695707323, "clip_ratio/high_mean": 0.03642228813841939, "clip_ratio/low_mean": 0.0025068818649742752, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03892916943877935, "completion_length": 83.06500244140625, "epoch": 0.22668331572375372, "grad_norm": 5.047176361083984, "kl": 0.8601905956864357, "learning_rate": 8.90926900596434e-07, "loss": 0.019, "reward": 1.6030859470367431, "reward_std": 0.18358819633722306, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.5549804508686066, "step": 1180, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.20188620835542678, "clip_ratio/high_mean": 0.03365288833156228, "clip_ratio/low_mean": 0.012162915989756584, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04581580460071564, "completion_length": 80.93500061035157, "epoch": 0.2286043607722601, "grad_norm": 3.431043863296509, "kl": 3.284740853309631, "learning_rate": 8.891475730799039e-07, "loss": -0.0024, "reward": 1.719798493385315, "reward_std": 0.2678588882088661, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.6127117216587067, "step": 1190, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.13805483505129815, "clip_ratio/high_mean": 0.02111883880570531, "clip_ratio/low_mean": 0.002508872369071469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023627711273729802, "completion_length": 87.33250274658204, "epoch": 0.2305254058207665, "grad_norm": 4.731442928314209, "kl": 1.1696231275796891, "learning_rate": 8.873558857314706e-07, "loss": -0.0053, "reward": 1.7580220222473144, "reward_std": 0.28411929309368134, "rewards/code_format_reward": 0.9900000095367432, "rewards/code_reward": 0.6315110087394714, "step": 1200, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.07043634681031108, "clip_ratio/high_mean": 0.009235845855437219, "clip_ratio/low_mean": 0.017453379271319135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02668922524899244, "completion_length": 86.74250030517578, "epoch": 0.23244645086927287, "grad_norm": 23.686250686645508, "kl": 1.7613270074129104, "learning_rate": 8.855519038470587e-07, "loss": 0.91, "reward": 1.8096629619598388, "reward_std": 0.2700611263513565, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6592064738273621, "step": 1210, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.1193816315382719, "clip_ratio/high_mean": 0.01799508691765368, "clip_ratio/low_mean": 0.0052341839407745285, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023229270869342143, "completion_length": 91.73750152587891, "epoch": 0.23436749591777928, "grad_norm": 5.015241622924805, "kl": 87723751.16166303, "learning_rate": 8.83735693170653e-07, "loss": 178666.875, "reward": 1.5409840583801269, "reward_std": 0.3586106300354004, "rewards/code_format_reward": 0.9687500119209289, "rewards/code_reward": 0.5283045113086701, "step": 1220, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.15788686936721205, "clip_ratio/high_mean": 0.02180835944600403, "clip_ratio/low_mean": 0.004957044991897419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026765404315665365, "completion_length": 83.70250091552734, "epoch": 0.23628854096628565, "grad_norm": 2.7140953540802, "kl": 0.755669391900301, "learning_rate": 8.81907319891902e-07, "loss": -0.0099, "reward": 1.8449480056762695, "reward_std": 0.28006095588207247, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6755990028381348, "step": 1230, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.12416752465069295, "clip_ratio/high_mean": 0.01972346901893616, "clip_ratio/low_mean": 0.01847981174942106, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0382032809779048, "completion_length": 91.47000274658203, "epoch": 0.23820958601479206, "grad_norm": 10.781957626342773, "kl": 1.0129390999674797, "learning_rate": 8.800668506437059e-07, "loss": 0.0011, "reward": 1.6923505306243896, "reward_std": 0.3265227422118187, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.6014877319335937, "step": 1240, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.12042212830856443, "clip_ratio/high_mean": 0.017916655144654216, "clip_ratio/low_mean": 0.007017276567057707, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02493393179029226, "completion_length": 76.6675018310547, "epoch": 0.24013063106329843, "grad_norm": 47.773136138916016, "kl": 1.4071896970272064, "learning_rate": 8.782143524997882e-07, "loss": 0.0018, "reward": 1.6722928285598755, "reward_std": 0.25374017357826234, "rewards/code_format_reward": 0.9824999809265137, "rewards/code_reward": 0.5905213832855225, "step": 1250, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.08169625541195273, "clip_ratio/high_mean": 0.013112110400106758, "clip_ratio/low_mean": 0.003914138658728916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01702624891186133, "completion_length": 78.23750152587891, "epoch": 0.2420516761118048, "grad_norm": 2688.99462890625, "kl": 9.395949372649193, "learning_rate": 8.76349892972251e-07, "loss": 0.1943, "reward": 1.5601455688476562, "reward_std": 0.3348282665014267, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5372602701187134, "step": 1260, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.13095853393897414, "clip_ratio/high_mean": 0.018921413994394242, "clip_ratio/low_mean": 0.018763081403449178, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03768449563067407, "completion_length": 76.1500015258789, "epoch": 0.2439727211603112, "grad_norm": 3.0777931213378906, "kl": 1.7352074533700943, "learning_rate": 8.744735400091154e-07, "loss": 0.0055, "reward": 1.633968448638916, "reward_std": 0.23277063071727752, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.5713592231273651, "step": 1270, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.15694143967702984, "clip_ratio/high_mean": 0.026766782545018943, "clip_ratio/low_mean": 0.010570818380801938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03733760174363852, "completion_length": 76.13500213623047, "epoch": 0.2458937662088176, "grad_norm": 2.8748385906219482, "kl": 3.007472372055054, "learning_rate": 8.725853619918444e-07, "loss": 0.0249, "reward": 1.4643328666687012, "reward_std": 0.2899716466665268, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.48716638684272767, "step": 1280, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.07734788609668612, "clip_ratio/high_mean": 0.013521577988285571, "clip_ratio/low_mean": 0.002974278874171432, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01649585694540292, "completion_length": 77.6050033569336, "epoch": 0.247814811257324, "grad_norm": 4.51137638092041, "kl": 0.6521440967917442, "learning_rate": 8.706854277328507e-07, "loss": -0.0065, "reward": 1.663088607788086, "reward_std": 0.29463320076465604, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.5843567848205566, "step": 1290, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.12926983460783958, "clip_ratio/high_mean": 0.016393666993826626, "clip_ratio/low_mean": 0.024948839796707034, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04134250609204173, "completion_length": 74.63750305175782, "epoch": 0.24973585630583037, "grad_norm": 7.019649982452393, "kl": 0.6837658904492855, "learning_rate": 8.687738064729902e-07, "loss": -0.0022, "reward": 1.6927862167358398, "reward_std": 0.14656674191355706, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5973306179046631, "step": 1300, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.15073961750604212, "clip_ratio/high_mean": 0.024888798157917336, "clip_ratio/low_mean": 0.004707413475262001, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02959621128393337, "completion_length": 79.17500152587891, "epoch": 0.25165690135433677, "grad_norm": 3.9428677558898926, "kl": 1.0088127315044404, "learning_rate": 8.668505678790368e-07, "loss": 0.7445, "reward": 1.5962260961532593, "reward_std": 0.22741070687770842, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.5528005361557007, "step": 1310, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.0862931152805686, "clip_ratio/high_mean": 0.016994312894530593, "clip_ratio/low_mean": 0.0031913593309582213, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020185671979561448, "completion_length": 79.30500183105468, "epoch": 0.25357794640284315, "grad_norm": 2.810743808746338, "kl": 2.0237294919788837, "learning_rate": 8.649157820411451e-07, "loss": -0.0028, "reward": 1.6300202369689942, "reward_std": 0.2859074264764786, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5712601006031036, "step": 1320, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.14902311654295772, "clip_ratio/high_mean": 0.02855427504691761, "clip_ratio/low_mean": 0.012185945303644984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040740220062434676, "completion_length": 70.88000030517578, "epoch": 0.2554989914513495, "grad_norm": 4.68557071685791, "kl": 1.2288852274417876, "learning_rate": 8.629695194702949e-07, "loss": -0.0057, "reward": 1.4114359855651855, "reward_std": 0.2626632884144783, "rewards/code_format_reward": 0.9625, "rewards/code_reward": 0.46509301066398623, "step": 1330, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.11323303133249282, "clip_ratio/high_mean": 0.016216285666450857, "clip_ratio/low_mean": 0.0045135776337701826, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02072986289858818, "completion_length": 71.99250030517578, "epoch": 0.2574200364998559, "grad_norm": 43.944698333740234, "kl": 1.446278090775013, "learning_rate": 8.610118510957221e-07, "loss": 0.0112, "reward": 1.5807109117507934, "reward_std": 0.23466840982437134, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5469179153442383, "step": 1340, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.20504833161830902, "clip_ratio/high_mean": 0.029384778672829272, "clip_ratio/low_mean": 0.006734570109983906, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03611934892833233, "completion_length": 69.60750198364258, "epoch": 0.25934108154836233, "grad_norm": 3.4515652656555176, "kl": 1.288391387462616, "learning_rate": 8.59042848262334e-07, "loss": 0.0022, "reward": 1.7648874998092652, "reward_std": 0.29008678793907167, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.6340062260627747, "step": 1350, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.18540791552513838, "clip_ratio/high_mean": 0.030647353292442857, "clip_ratio/low_mean": 0.0048290589373209515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03547641267068684, "completion_length": 73.8150016784668, "epoch": 0.2612621265968687, "grad_norm": 24.974191665649414, "kl": 1.361786951869726, "learning_rate": 8.570625827281077e-07, "loss": -0.0015, "reward": 1.6352276086807251, "reward_std": 0.20483867302536965, "rewards/code_format_reward": 0.9712500095367431, "rewards/code_reward": 0.5748012781143188, "step": 1360, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.25138766765594484, "clip_ratio/high_mean": 0.043486443860456345, "clip_ratio/low_mean": 0.006613140180706978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05009958455339074, "completion_length": 85.41999969482421, "epoch": 0.2631831716453751, "grad_norm": 0.2826422452926636, "kl": 1.1484392315149308, "learning_rate": 8.550711266614774e-07, "loss": -0.0015, "reward": 1.5049166679382324, "reward_std": 0.17118329852819442, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.5090208292007447, "step": 1370, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.10418513733893633, "clip_ratio/high_mean": 0.017387184244580568, "clip_ratio/low_mean": 0.006483422458404675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023870606115087865, "completion_length": 78.00750274658203, "epoch": 0.26510421669388146, "grad_norm": 0.43826720118522644, "kl": 0.5077251173555851, "learning_rate": 8.530685526387023e-07, "loss": 0.0071, "reward": 1.5417476654052735, "reward_std": 0.2806018695235252, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5271238267421723, "step": 1380, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.12633447310654447, "clip_ratio/high_mean": 0.01944113611098146, "clip_ratio/low_mean": 0.02114583211950958, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04058696813735878, "completion_length": 69.89499969482422, "epoch": 0.26702526174238783, "grad_norm": 3.222648859024048, "kl": 0.8532382689416409, "learning_rate": 8.510549336412227e-07, "loss": 0.2832, "reward": 1.4325429320335388, "reward_std": 0.23379142954945564, "rewards/code_format_reward": 0.95625, "rewards/code_reward": 0.47720896899700166, "step": 1390, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.10978957340121269, "clip_ratio/high_mean": 0.015348212420940399, "clip_ratio/low_mean": 0.00886362442979589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024211836606264116, "completion_length": 74.32000198364258, "epoch": 0.26894630679089426, "grad_norm": 511.98333740234375, "kl": 6.762348529696465, "learning_rate": 8.490303430529996e-07, "loss": 0.0097, "reward": 1.5433219909667968, "reward_std": 0.3002948135137558, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.5269734919071197, "step": 1400, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.021737607452087103, "clip_ratio/high_mean": 0.004128801400656812, "clip_ratio/low_mean": 0.008135353482794016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012264154804870486, "completion_length": 70.88250122070312, "epoch": 0.27086735183940064, "grad_norm": 4.558300018310547, "kl": 1.0645984336733818, "learning_rate": 8.469948546578406e-07, "loss": -0.002, "reward": 1.711915636062622, "reward_std": 0.23479849100112915, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6090827941894531, "step": 1410, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.31115832179784775, "clip_ratio/high_mean": 0.04542893636971712, "clip_ratio/low_mean": 0.004167796808178537, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04959673266857863, "completion_length": 82.51750335693359, "epoch": 0.272788396887907, "grad_norm": 26.85635757446289, "kl": 0.6633755072951317, "learning_rate": 8.449485426367113e-07, "loss": -0.0044, "reward": 1.8086278200149537, "reward_std": 0.25109012275934217, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6577514052391052, "step": 1420, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.21433292645961047, "clip_ratio/high_mean": 0.027504962938837706, "clip_ratio/low_mean": 0.007746222103014589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03525118476245552, "completion_length": 69.12750015258788, "epoch": 0.2747094419364134, "grad_norm": 39.272727966308594, "kl": 2.1152508199214934, "learning_rate": 8.428914815650318e-07, "loss": 56.6465, "reward": 1.5950207233428955, "reward_std": 0.25626782774925233, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.5518853664398193, "step": 1430, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.14325670124962925, "clip_ratio/high_mean": 0.02268084152601659, "clip_ratio/low_mean": 0.006528474338119849, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029209316370543092, "completion_length": 67.30000076293945, "epoch": 0.2766304869849198, "grad_norm": 4.287910461425781, "kl": 1.2686308354139328, "learning_rate": 8.408237464099576e-07, "loss": 9.8201, "reward": 1.6364605188369752, "reward_std": 0.22813104093074799, "rewards/code_format_reward": 0.9749999880790711, "rewards/code_reward": 0.5744802415370941, "step": 1440, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.18851536950096487, "clip_ratio/high_mean": 0.024719347018981354, "clip_ratio/low_mean": 0.013444452191470191, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.038163799053290856, "completion_length": 82.13250274658203, "epoch": 0.2785515320334262, "grad_norm": 0.4786536991596222, "kl": 8.468844538927078, "learning_rate": 8.387454125276494e-07, "loss": 0.0456, "reward": 1.7758944988250733, "reward_std": 0.1511917643249035, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6410722196102142, "step": 1450, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.15984937213361264, "clip_ratio/high_mean": 0.025054804515093565, "clip_ratio/low_mean": 0.01257994698244147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03763475017622113, "completion_length": 79.66000213623047, "epoch": 0.2804725770819326, "grad_norm": 3.223284959793091, "kl": 1.7015444114804268, "learning_rate": 8.366565556605258e-07, "loss": 0.0276, "reward": 1.5976650953292846, "reward_std": 0.341750779747963, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.5566450238227845, "step": 1460, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.27182711616624144, "clip_ratio/high_mean": 0.040798351392732, "clip_ratio/low_mean": 0.002200227712455671, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04299857785226777, "completion_length": 79.22250213623047, "epoch": 0.28239362213043895, "grad_norm": 1.4845157861709595, "kl": 1.693036738038063, "learning_rate": 8.345572519345031e-07, "loss": -0.0017, "reward": 1.7161717653274535, "reward_std": 0.2422049015760422, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.612460857629776, "step": 1470, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.17126517184078693, "clip_ratio/high_mean": 0.025960111571475864, "clip_ratio/low_mean": 0.00444280517695006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030402917158789934, "completion_length": 83.31750183105468, "epoch": 0.2843146671789453, "grad_norm": 5.96829080581665, "kl": 0.574289733916521, "learning_rate": 8.324475778562209e-07, "loss": -0.0061, "reward": 1.7776363611221313, "reward_std": 0.2358689785003662, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6428806602954864, "step": 1480, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.2015096817165613, "clip_ratio/high_mean": 0.03324723746627569, "clip_ratio/low_mean": 0.00480144299363019, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.038048680778592824, "completion_length": 73.28000106811524, "epoch": 0.28623571222745176, "grad_norm": 6.496949672698975, "kl": 0.6653359919786453, "learning_rate": 8.30327610310254e-07, "loss": 0.0021, "reward": 1.6191941976547242, "reward_std": 0.31718442738056185, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.5639720797538758, "step": 1490, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.15795932533219456, "clip_ratio/high_mean": 0.02212390162749216, "clip_ratio/low_mean": 0.00480329486890696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026927196700125933, "completion_length": 73.78000106811524, "epoch": 0.28815675727595813, "grad_norm": 5.75892972946167, "kl": 0.46196936070919037, "learning_rate": 8.281974265563108e-07, "loss": -0.0045, "reward": 1.7829506158828736, "reward_std": 0.17953601479530334, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.642725282907486, "step": 1500, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.24582542856223882, "clip_ratio/high_mean": 0.030850262753665446, "clip_ratio/low_mean": 0.005616182333324104, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03646644388791174, "completion_length": 77.69500198364258, "epoch": 0.2900778023244645, "grad_norm": 326340576.0, "kl": 0.605505321919918, "learning_rate": 8.260571042264166e-07, "loss": 8518.9961, "reward": 1.7113344192504882, "reward_std": 0.18693218380212784, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6087921977043151, "step": 1510, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.22759323129430414, "clip_ratio/high_mean": 0.03405714362161234, "clip_ratio/low_mean": 0.0032101303557283247, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03726727362954989, "completion_length": 75.53250122070312, "epoch": 0.2919988473729709, "grad_norm": 2.2893807888031006, "kl": 0.5214515089988708, "learning_rate": 8.23906721322086e-07, "loss": 0.0027, "reward": 1.6311777591705323, "reward_std": 0.17696685791015626, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5690263509750366, "step": 1520, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.06725322343409061, "clip_ratio/high_mean": 0.010706762981135398, "clip_ratio/low_mean": 0.0018884234530560206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012595186498947442, "completion_length": 78.90999908447266, "epoch": 0.29391989242147726, "grad_norm": 2.6211440563201904, "kl": 0.5930808052420616, "learning_rate": 8.217463562114786e-07, "loss": -0.0035, "reward": 1.7637510299682617, "reward_std": 0.209340962767601, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.6365630030632019, "step": 1530, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.06629493543878198, "clip_ratio/high_mean": 0.012000571249518543, "clip_ratio/low_mean": 0.010053297760896385, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022053868882358073, "completion_length": 77.66250152587891, "epoch": 0.2958409374699837, "grad_norm": 0.5937472581863403, "kl": 0.6556157968938351, "learning_rate": 8.195760876265438e-07, "loss": 0.0023, "reward": 1.4144308805465697, "reward_std": 0.12647379338741302, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.461590439081192, "step": 1540, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.2611659773625433, "clip_ratio/high_mean": 0.05069012229796499, "clip_ratio/low_mean": 0.009917778367525897, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06060790033079684, "completion_length": 80.60250091552734, "epoch": 0.29776198251849006, "grad_norm": 7.297484874725342, "kl": 2.139972834289074, "learning_rate": 8.173959946601519e-07, "loss": 0.0662, "reward": 1.6416264057159424, "reward_std": 0.3118141442537308, "rewards/code_format_reward": 0.9749999880790711, "rewards/code_reward": 0.5770631790161133, "step": 1550, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.14441705606877803, "clip_ratio/high_mean": 0.023728324193507434, "clip_ratio/low_mean": 0.005098688977886923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028827012795954943, "completion_length": 77.17750244140625, "epoch": 0.29968302756699644, "grad_norm": 5.614815711975098, "kl": 0.5137595549225807, "learning_rate": 8.152061567632108e-07, "loss": -0.0057, "reward": 1.5097593545913697, "reward_std": 0.29559260606765747, "rewards/code_format_reward": 0.9575000047683716, "rewards/code_reward": 0.5155046641826629, "step": 1560, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.03715956890955567, "clip_ratio/high_mean": 0.006024846772197634, "clip_ratio/low_mean": 0.009319488028995692, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015344334836117923, "completion_length": 76.34000091552734, "epoch": 0.3016040726155028, "grad_norm": 5.059381008148193, "kl": 0.8711868159472942, "learning_rate": 8.130066537417707e-07, "loss": -0.0003, "reward": 1.4149085521697997, "reward_std": 0.19155050422996284, "rewards/code_format_reward": 0.9749999880790711, "rewards/code_reward": 0.463704252243042, "step": 1570, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.0935845285654068, "clip_ratio/high_mean": 0.013573423656634987, "clip_ratio/low_mean": 0.00990565216197865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023479075590148567, "completion_length": 83.97500152587891, "epoch": 0.30352511766400925, "grad_norm": 2.025956869125366, "kl": 0.9980318561196327, "learning_rate": 8.10797565754116e-07, "loss": -0.0041, "reward": 1.5444376945495606, "reward_std": 0.19510383605957032, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.525031316280365, "step": 1580, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.11659459788352251, "clip_ratio/high_mean": 0.016526972700376064, "clip_ratio/low_mean": 0.0030368489184184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019563821679912507, "completion_length": 90.33000335693359, "epoch": 0.3054461627125156, "grad_norm": 4.901747703552246, "kl": 0.6650052145123482, "learning_rate": 8.085789733078439e-07, "loss": 0.9063, "reward": 1.6000897407531738, "reward_std": 0.20618843138217927, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5556698679924011, "step": 1590, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.1246914654970169, "clip_ratio/high_mean": 0.018419789243489505, "clip_ratio/low_mean": 0.0033823222620412707, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021802110970020293, "completion_length": 82.78250122070312, "epoch": 0.307367207761022, "grad_norm": 16365.4453125, "kl": 83.84930176734925, "learning_rate": 8.063509572569303e-07, "loss": 0.4123, "reward": 1.8164207458496093, "reward_std": 0.25260339230298995, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6613353252410888, "step": 1600, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.18187000900506972, "clip_ratio/high_mean": 0.026620355295017363, "clip_ratio/low_mean": 0.011157544914749452, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03777789976447821, "completion_length": 72.65250244140626, "epoch": 0.3092882528095284, "grad_norm": 2.8136842250823975, "kl": 0.9565572030842304, "learning_rate": 8.041135987987831e-07, "loss": 0.0037, "reward": 1.7599462985992431, "reward_std": 0.26825075447559354, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6324730753898621, "step": 1610, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.03404317735694349, "clip_ratio/high_mean": 0.006068735342705622, "clip_ratio/low_mean": 0.010824382931605214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016893118200823665, "completion_length": 78.07500305175782, "epoch": 0.31120929785803475, "grad_norm": 31.179058074951172, "kl": 0.560398967564106, "learning_rate": 8.018669794712835e-07, "loss": -0.0011, "reward": 1.5130140781402588, "reward_std": 0.2716240629553795, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5115070700645447, "step": 1620, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.06218870538286865, "clip_ratio/high_mean": 0.008549430634593591, "clip_ratio/low_mean": 0.007052442076383158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01560187318827957, "completion_length": 83.56500091552735, "epoch": 0.3131303429065412, "grad_norm": 0.6899747252464294, "kl": 0.7204694971442223, "learning_rate": 7.996111811498138e-07, "loss": 0.0031, "reward": 1.687961721420288, "reward_std": 0.19512347355484963, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.5958558440208435, "step": 1630, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.17274489336414262, "clip_ratio/high_mean": 0.021967002666497138, "clip_ratio/low_mean": 0.009596503502689303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03156350784411188, "completion_length": 80.9175033569336, "epoch": 0.31505138795504756, "grad_norm": 2.105334758758545, "kl": 0.8054538488388061, "learning_rate": 7.97346286044274e-07, "loss": -0.0058, "reward": 1.3176400899887084, "reward_std": 0.20478213280439378, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.41350752413272857, "step": 1640, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.16534482885617763, "clip_ratio/high_mean": 0.02735080250131432, "clip_ratio/low_mean": 0.0035748321075516286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030925634037703275, "completion_length": 74.01250228881835, "epoch": 0.31697243300355393, "grad_norm": 184916.921875, "kl": 28.671802641451357, "learning_rate": 7.950723766960857e-07, "loss": 5.579, "reward": 1.6360910892486573, "reward_std": 0.2874180316925049, "rewards/code_format_reward": 0.9687500119209289, "rewards/code_reward": 0.5758580267429352, "step": 1650, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.10983106552157551, "clip_ratio/high_mean": 0.016536441215430388, "clip_ratio/low_mean": 0.011150279239518567, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027686719762277788, "completion_length": 84.17750244140625, "epoch": 0.3188934780520603, "grad_norm": 219305424.0, "kl": 106.82060827612877, "learning_rate": 7.927895359751835e-07, "loss": 5248.6121, "reward": 1.5329812049865723, "reward_std": 0.22349740117788314, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5221156060695649, "step": 1660, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.13622083119116724, "clip_ratio/high_mean": 0.01933064509066753, "clip_ratio/low_mean": 0.005038347843219526, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024368993006646633, "completion_length": 80.39500274658204, "epoch": 0.3208145231005667, "grad_norm": 9.519110679626465, "kl": 0.7214748501777649, "learning_rate": 7.904978470769959e-07, "loss": -0.0025, "reward": 1.6617871284484864, "reward_std": 0.27498180270195005, "rewards/code_format_reward": 0.95625, "rewards/code_reward": 0.5918310403823852, "step": 1670, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.09761472065001726, "clip_ratio/high_mean": 0.01911984165199101, "clip_ratio/low_mean": 0.010301339952275158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02942118220962584, "completion_length": 74.54750213623046, "epoch": 0.3227355681490731, "grad_norm": 6.143461227416992, "kl": 0.7205829441547393, "learning_rate": 7.881973935194124e-07, "loss": 0.0015, "reward": 1.4262179613113404, "reward_std": 0.26740061640739443, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.4696714758872986, "step": 1680, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.07396706650033594, "clip_ratio/high_mean": 0.011737752065528184, "clip_ratio/low_mean": 0.005250315659213811, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016988067945931107, "completion_length": 75.27500228881836, "epoch": 0.3246566131975795, "grad_norm": 2.337491989135742, "kl": 68.4789316162467, "learning_rate": 7.858882591397403e-07, "loss": 0.3045, "reward": 1.527750849723816, "reward_std": 0.26877219378948214, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5163754165172577, "step": 1690, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.28693441725336016, "clip_ratio/high_mean": 0.04205623795860447, "clip_ratio/low_mean": 0.009473194915335626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.051529432012466715, "completion_length": 84.14500274658204, "epoch": 0.32657765824608587, "grad_norm": 20.964569091796875, "kl": 0.5620399042963982, "learning_rate": 7.835705280916488e-07, "loss": -0.0051, "reward": 1.615627408027649, "reward_std": 0.2002291887998581, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.5590636849403381, "step": 1700, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.2242162274196744, "clip_ratio/high_mean": 0.036464582500047985, "clip_ratio/low_mean": 0.010222097241785378, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046686679660342636, "completion_length": 78.56000061035157, "epoch": 0.32849870329459224, "grad_norm": 3.2044875621795654, "kl": 0.7747909784317016, "learning_rate": 7.812442848421032e-07, "loss": -0.0006, "reward": 1.6169416427612304, "reward_std": 0.24999960064888, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5612833142280579, "step": 1710, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.10125078996643425, "clip_ratio/high_mean": 0.019883562461473048, "clip_ratio/low_mean": 0.014126901775307487, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034010464209131896, "completion_length": 73.05000152587891, "epoch": 0.3304197483430986, "grad_norm": 735.9865112304688, "kl": 2.3181345582008364, "learning_rate": 7.789096141682851e-07, "loss": 0.1213, "reward": 1.371981406211853, "reward_std": 0.17790164202451705, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.44317818284034727, "step": 1720, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.15698121464811265, "clip_ratio/high_mean": 0.026607585436431692, "clip_ratio/low_mean": 0.004372719774255529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030980303999967873, "completion_length": 78.5425033569336, "epoch": 0.33234079339160505, "grad_norm": 2.3281009197235107, "kl": 1.7815167903900146, "learning_rate": 7.765666011545045e-07, "loss": 0.4359, "reward": 1.669968068599701, "reward_std": 0.18121034651994705, "rewards/code_format_reward": 0.9737499833106995, "rewards/code_reward": 0.5915465235710144, "step": 1730, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.1189429596066475, "clip_ratio/high_mean": 0.021151045989245176, "clip_ratio/low_mean": 0.002452358941081911, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023603405337780714, "completion_length": 69.71000289916992, "epoch": 0.3342618384401114, "grad_norm": 1720.8326416015625, "kl": 0.7967777937650681, "learning_rate": 7.742153311890971e-07, "loss": 0.0982, "reward": 1.5440645456314086, "reward_std": 0.18595425188541412, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5292197823524475, "step": 1740, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.08902034647762776, "clip_ratio/high_mean": 0.012681722827255725, "clip_ratio/low_mean": 0.00311334275174886, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015795065369457007, "completion_length": 74.49249954223633, "epoch": 0.3361828834886178, "grad_norm": 0.09847641736268997, "kl": 0.8014414094388485, "learning_rate": 7.718558899613143e-07, "loss": 0.0099, "reward": 1.5567015647888183, "reward_std": 0.14754890371114016, "rewards/code_format_reward": 0.9649999976158142, "rewards/code_reward": 0.5371007978916168, "step": 1750, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.15779653917998077, "clip_ratio/high_mean": 0.030520046106539668, "clip_ratio/low_mean": 0.009007267560809851, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03952731299214065, "completion_length": 77.64000091552734, "epoch": 0.3381039285371242, "grad_norm": 16.5263729095459, "kl": 0.7359155111014843, "learning_rate": 7.69488363458199e-07, "loss": -0.0085, "reward": 1.477712869644165, "reward_std": 0.26145162880420686, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.49041891694068906, "step": 1760, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.17542534926906228, "clip_ratio/high_mean": 0.025472976046148687, "clip_ratio/low_mean": 0.005083448148798198, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030556425044778734, "completion_length": 78.76000061035157, "epoch": 0.3400249735856306, "grad_norm": 2.440377950668335, "kl": 1.2570879265666008, "learning_rate": 7.671128379614524e-07, "loss": -0.0029, "reward": 1.697490382194519, "reward_std": 0.21552397906780243, "rewards/code_format_reward": 0.9887499809265137, "rewards/code_reward": 0.6015576839447021, "step": 1770, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.03777246242389083, "clip_ratio/high_mean": 0.005805602658074349, "clip_ratio/low_mean": 0.006219673785381019, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012025276734493672, "completion_length": 78.01500091552734, "epoch": 0.341946018634137, "grad_norm": 3.58803129196167, "kl": 1.3505164757370949, "learning_rate": 7.647294000442899e-07, "loss": -0.0008, "reward": 1.3937680006027222, "reward_std": 0.1832626909017563, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.44907149076461794, "step": 1780, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.08561003021895885, "clip_ratio/high_mean": 0.011109948102966883, "clip_ratio/low_mean": 0.0035756964149186387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014685644480050542, "completion_length": 76.20749969482422, "epoch": 0.34386706368264336, "grad_norm": 10.503286361694336, "kl": 0.552098847925663, "learning_rate": 7.623381365682855e-07, "loss": -0.0015, "reward": 1.6644479036331177, "reward_std": 0.22849067896604539, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5847239375114441, "step": 1790, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.06243175007402897, "clip_ratio/high_mean": 0.009089326043613255, "clip_ratio/low_mean": 0.005161185140605084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014250511419959366, "completion_length": 69.70000076293945, "epoch": 0.34578810873114973, "grad_norm": 4.685351371765137, "kl": 0.3103115826845169, "learning_rate": 7.599391346802063e-07, "loss": -0.0003, "reward": 1.8390909910202027, "reward_std": 0.20207120031118392, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6726704835891724, "step": 1800, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.046840774989686904, "clip_ratio/high_mean": 0.007519985581166111, "clip_ratio/low_mean": 0.004676173024927266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01219615869631525, "completion_length": 80.15500183105469, "epoch": 0.3477091537796561, "grad_norm": 21886460.0, "kl": 0.48781016543507577, "learning_rate": 7.575324818088367e-07, "loss": 517.7405, "reward": 1.6558839797973632, "reward_std": 0.2796541228890419, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5845044732093811, "step": 1810, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.18512438922189176, "clip_ratio/high_mean": 0.0357341198658105, "clip_ratio/low_mean": 0.0033004880184307694, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03903460723813623, "completion_length": 78.84000091552734, "epoch": 0.34963019882816254, "grad_norm": 9.198795318603516, "kl": 4.244446061551571, "learning_rate": 7.551182656617924e-07, "loss": 0.0031, "reward": 1.5848650455474853, "reward_std": 0.17606763169169426, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.5458700299263001, "step": 1820, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.07551750033162534, "clip_ratio/high_mean": 0.013169253122759983, "clip_ratio/low_mean": 0.001537335959437769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014706589409615844, "completion_length": 82.8800033569336, "epoch": 0.3515512438766689, "grad_norm": 0.724766731262207, "kl": 0.9274087265133858, "learning_rate": 7.526965742223234e-07, "loss": 0.0013, "reward": 1.5606717586517334, "reward_std": 0.2877893716096878, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5343983888626098, "step": 1830, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.1388332260772586, "clip_ratio/high_mean": 0.021653852658346295, "clip_ratio/low_mean": 0.008576209528837354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03023006208240986, "completion_length": 74.26250076293945, "epoch": 0.3534722889251753, "grad_norm": 5.426670074462891, "kl": 0.7045004338026046, "learning_rate": 7.502674957461079e-07, "loss": -0.007, "reward": 1.5688656568527222, "reward_std": 0.30554552264511586, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5384953856468201, "step": 1840, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.07899991576559842, "clip_ratio/high_mean": 0.013301478006178513, "clip_ratio/low_mean": 0.01124582380289212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024547301628626884, "completion_length": 74.45500106811524, "epoch": 0.35539333397368167, "grad_norm": 2.5104761123657227, "kl": 0.6198086604475975, "learning_rate": 7.478311187580363e-07, "loss": -0.0071, "reward": 1.5550098896026612, "reward_std": 0.21109988391399384, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.52937992811203, "step": 1850, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.063601403683424, "clip_ratio/high_mean": 0.010639100335538387, "clip_ratio/low_mean": 0.00778028266504407, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018419382628053427, "completion_length": 71.92500152587891, "epoch": 0.35731437902218804, "grad_norm": 3.805928945541382, "kl": 1.6179959252476692, "learning_rate": 7.453875320489842e-07, "loss": 0.3, "reward": 1.4410953760147094, "reward_std": 0.19501519501209258, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.47523519992828367, "step": 1860, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.10860318611375988, "clip_ratio/high_mean": 0.018746975070098416, "clip_ratio/low_mean": 0.008747255423804745, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02749423016794026, "completion_length": 69.9375015258789, "epoch": 0.3592354240706945, "grad_norm": 2.388782501220703, "kl": 0.5952992506325245, "learning_rate": 7.429368246725772e-07, "loss": 0.0443, "reward": 1.6972971916198731, "reward_std": 0.17401356399059295, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6008361041545868, "step": 1870, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.08630747124552726, "clip_ratio/high_mean": 0.012746809562668205, "clip_ratio/low_mean": 0.010304910433478653, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02305172006599605, "completion_length": 70.83000183105469, "epoch": 0.36115646911920085, "grad_norm": 16.255178451538086, "kl": 0.8730347856879235, "learning_rate": 7.40479085941945e-07, "loss": 0.0036, "reward": 1.467816424369812, "reward_std": 0.17535984218120576, "rewards/code_format_reward": 0.9925000071525574, "rewards/code_reward": 0.48578319549560545, "step": 1880, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.26251301234588026, "clip_ratio/high_mean": 0.03827818045392632, "clip_ratio/low_mean": 0.005526873719645664, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04380505495937541, "completion_length": 64.74750213623047, "epoch": 0.3630775141677072, "grad_norm": 4.061140060424805, "kl": 0.8530658036470413, "learning_rate": 7.380144054264669e-07, "loss": 0.0197, "reward": 1.498781108856201, "reward_std": 0.17463037073612214, "rewards/code_format_reward": 0.9600000023841858, "rewards/code_reward": 0.509390527009964, "step": 1890, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.24850870491936802, "clip_ratio/high_mean": 0.04144583061570302, "clip_ratio/low_mean": 0.00702623330289498, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04847206423291937, "completion_length": 75.8375015258789, "epoch": 0.3649985592162136, "grad_norm": 3.4472062587738037, "kl": 1.6324397973716258, "learning_rate": 7.355428729485071e-07, "loss": -0.001, "reward": 1.6619214057922362, "reward_std": 0.18103656098246573, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.5840856909751893, "step": 1900, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.09173821061849594, "clip_ratio/high_mean": 0.014921509474515916, "clip_ratio/low_mean": 0.002157307107700035, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017078816797584294, "completion_length": 62.185000610351565, "epoch": 0.36691960426472003, "grad_norm": 2.0225422382354736, "kl": 184.02759787738324, "learning_rate": 7.330645785801417e-07, "loss": 2.9496, "reward": 1.7410502433776855, "reward_std": 0.10668236091732979, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6217751204967499, "step": 1910, "zero_std_ratio": 0.75 }, { "clip_ratio/high_max": 0.16933906488120556, "clip_ratio/high_mean": 0.02619449864141643, "clip_ratio/low_mean": 0.014137339405715465, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04033183753490448, "completion_length": 79.30000152587891, "epoch": 0.3688406493132264, "grad_norm": 2.6208443641662598, "kl": 1.235317513346672, "learning_rate": 7.305796126398758e-07, "loss": -0.0012, "reward": 1.5036948204040528, "reward_std": 0.20645264089107512, "rewards/code_format_reward": 0.9762499928474426, "rewards/code_reward": 0.5077848553657531, "step": 1920, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.2661599090555683, "clip_ratio/high_mean": 0.03600101897318382, "clip_ratio/low_mean": 0.009155643907433841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.045156662538647654, "completion_length": 78.10000152587891, "epoch": 0.3707616943617328, "grad_norm": 8.953734397888184, "kl": 0.6204134523868561, "learning_rate": 7.280880656893518e-07, "loss": 0.0025, "reward": 1.4915935516357421, "reward_std": 0.2376121073961258, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.501109266281128, "step": 1930, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.15203024838119744, "clip_ratio/high_mean": 0.023713350854814054, "clip_ratio/low_mean": 0.004282052081543952, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02799540273845196, "completion_length": 74.42500076293945, "epoch": 0.37268273941023916, "grad_norm": 11.845942497253418, "kl": 0.5031724810600281, "learning_rate": 7.255900285300496e-07, "loss": 0.5255, "reward": 1.6400779724121093, "reward_std": 0.22267285138368606, "rewards/code_format_reward": 0.9649999856948852, "rewards/code_reward": 0.5787889719009399, "step": 1940, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.09135808227583767, "clip_ratio/high_mean": 0.012801296508405358, "clip_ratio/low_mean": 0.01690869364247192, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02970999013632536, "completion_length": 69.52000198364257, "epoch": 0.37460378445874554, "grad_norm": 6.7441229820251465, "kl": 1.2024895504117012, "learning_rate": 7.230855921999769e-07, "loss": 44.3651, "reward": 1.6912511348724366, "reward_std": 0.17418113350868225, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5981255412101746, "step": 1950, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.07453169417567551, "clip_ratio/high_mean": 0.009913802641676739, "clip_ratio/low_mean": 0.003736039294744842, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013649841863662004, "completion_length": 74.01250228881835, "epoch": 0.37652482950725197, "grad_norm": 4.616723537445068, "kl": 0.6156632959842682, "learning_rate": 7.205748479703515e-07, "loss": -0.0005, "reward": 1.846400761604309, "reward_std": 0.17167636156082153, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6757004141807557, "step": 1960, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.09189570704475045, "clip_ratio/high_mean": 0.013587052945513278, "clip_ratio/low_mean": 0.004667519498616457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0182545724324882, "completion_length": 64.46750030517578, "epoch": 0.37844587455575834, "grad_norm": 0.17748567461967468, "kl": 0.4286219261586666, "learning_rate": 7.180578873422757e-07, "loss": -0.0046, "reward": 1.612094521522522, "reward_std": 0.10822201184928418, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.5576097548007966, "step": 1970, "zero_std_ratio": 0.725 }, { "clip_ratio/high_max": 0.2088342323899269, "clip_ratio/high_mean": 0.028434151923283933, "clip_ratio/low_mean": 0.005974846053868532, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034408997558057305, "completion_length": 69.26750106811524, "epoch": 0.3803669196042647, "grad_norm": 6.238914966583252, "kl": 0.7256933867931366, "learning_rate": 7.155348020434001e-07, "loss": -0.0046, "reward": 1.469704508781433, "reward_std": 0.24035734832286834, "rewards/code_format_reward": 0.9799999833106995, "rewards/code_reward": 0.4898522675037384, "step": 1980, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.05789460870437324, "clip_ratio/high_mean": 0.007717460609273985, "clip_ratio/low_mean": 0.003460834617726505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011178295104764402, "completion_length": 70.19000244140625, "epoch": 0.3822879646527711, "grad_norm": 8.066108703613281, "kl": 1.1788517452776432, "learning_rate": 7.130056840245824e-07, "loss": -0.0005, "reward": 1.5026792764663697, "reward_std": 0.2312860034406185, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5022771418094635, "step": 1990, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.07602796133141965, "clip_ratio/high_mean": 0.012856367122731171, "clip_ratio/low_mean": 0.0035519548939191735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016408322553616017, "completion_length": 66.4625015258789, "epoch": 0.38420900970127747, "grad_norm": 3.559206962585449, "kl": 1.225260878354311, "learning_rate": 7.104706254565358e-07, "loss": -0.003, "reward": 1.742388916015625, "reward_std": 0.12480423972010612, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.623069453239441, "step": 2000, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.11271043051965535, "clip_ratio/high_mean": 0.017727556044701488, "clip_ratio/low_mean": 0.005613272835034877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02334082857705653, "completion_length": 77.1650001525879, "epoch": 0.3861300547497839, "grad_norm": 3.4077274799346924, "kl": 0.8489379599690438, "learning_rate": 7.07929718726469e-07, "loss": 0.0403, "reward": 1.5602745056152343, "reward_std": 0.2609230324625969, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.5338872492313385, "step": 2010, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.2993799396790564, "clip_ratio/high_mean": 0.041865267558023334, "clip_ratio/low_mean": 0.006948894041124731, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04881416228599846, "completion_length": 74.0150016784668, "epoch": 0.3880510997982903, "grad_norm": 3.2043685913085938, "kl": 6.086115422844887, "learning_rate": 7.053830564347206e-07, "loss": 2.2989, "reward": 1.5310536623001099, "reward_std": 0.19302123934030532, "rewards/code_format_reward": 0.9837500095367432, "rewards/code_reward": 0.5195893287658692, "step": 2020, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.06591402762569487, "clip_ratio/high_mean": 0.009311116795288399, "clip_ratio/low_mean": 0.0017412514251191169, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105236830189824, "completion_length": 73.44250106811523, "epoch": 0.38997214484679665, "grad_norm": 2.137256622314453, "kl": 3.9139866441488267, "learning_rate": 7.028307313913838e-07, "loss": 0.0061, "reward": 1.8796703815460205, "reward_std": 0.12868851274251938, "rewards/code_format_reward": 0.9974999904632569, "rewards/code_reward": 0.6904601573944091, "step": 2030, "zero_std_ratio": 0.775 }, { "clip_ratio/high_max": 0.24738994101062417, "clip_ratio/high_mean": 0.03705689987400547, "clip_ratio/low_mean": 0.007423648721305654, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04448054819367826, "completion_length": 67.09500198364258, "epoch": 0.39189318989530303, "grad_norm": 5.504507541656494, "kl": 1.4878595262765884, "learning_rate": 7.002728366129242e-07, "loss": 0.0166, "reward": 1.8640715599060058, "reward_std": 0.22610510736703873, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.6870357990264893, "step": 2040, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.09283696161583066, "clip_ratio/high_mean": 0.014592013147193938, "clip_ratio/low_mean": 0.0040809189551509915, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01867293259128928, "completion_length": 72.88500137329102, "epoch": 0.3938142349438094, "grad_norm": 1.877032995223999, "kl": 2.3534633785486223, "learning_rate": 6.977094653187891e-07, "loss": 0.3364, "reward": 1.5182712078094482, "reward_std": 0.19934598058462144, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5163230776786805, "step": 2050, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.047755516692996026, "clip_ratio/high_mean": 0.007312651420943439, "clip_ratio/low_mean": 0.0007527987050707452, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008065450168214739, "completion_length": 67.76500091552734, "epoch": 0.39573527999231584, "grad_norm": 1.7954281568527222, "kl": 2.4329017847776413, "learning_rate": 6.95140710928012e-07, "loss": 206.5648, "reward": 1.3761554956436157, "reward_std": 0.21033956706523896, "rewards/code_format_reward": 0.9762499928474426, "rewards/code_reward": 0.44401525855064394, "step": 2060, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.07075442476198077, "clip_ratio/high_mean": 0.009443573304452002, "clip_ratio/low_mean": 0.003901358728762716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013344932324253022, "completion_length": 68.6150016784668, "epoch": 0.3976563250408222, "grad_norm": 1.3921815156936646, "kl": 0.6283935949206352, "learning_rate": 6.925666670558062e-07, "loss": 1.5274, "reward": 1.4756604433059692, "reward_std": 0.2542987480759621, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.49189271330833434, "step": 2070, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.09334485791623592, "clip_ratio/high_mean": 0.015712386509403587, "clip_ratio/low_mean": 0.005205962993204594, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02091834945604205, "completion_length": 75.06750183105468, "epoch": 0.3995773700893286, "grad_norm": 1.3997697830200195, "kl": 0.5330163806676864, "learning_rate": 6.899874275101538e-07, "loss": -0.0031, "reward": 1.7522424459457397, "reward_std": 0.1803124487400055, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.6286212205886841, "step": 2080, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.08916364926844836, "clip_ratio/high_mean": 0.014017748599871992, "clip_ratio/low_mean": 0.003948131998186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01796588017605245, "completion_length": 78.19750213623047, "epoch": 0.40149841513783496, "grad_norm": 2296.336669921875, "kl": 1.0256180852651595, "learning_rate": 6.874030862883879e-07, "loss": 0.0318, "reward": 1.2450440883636475, "reward_std": 0.22890471369028093, "rewards/code_format_reward": 0.9775000095367432, "rewards/code_reward": 0.3781470343470573, "step": 2090, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.22327043637633323, "clip_ratio/high_mean": 0.04789549903944135, "clip_ratio/low_mean": 0.00559167112223804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.053487171232700345, "completion_length": 70.61250152587891, "epoch": 0.4034194601863414, "grad_norm": 3.2615253925323486, "kl": 8.218332803249359, "learning_rate": 6.848137375737652e-07, "loss": 0.0058, "reward": 1.6430699110031128, "reward_std": 0.21420457661151887, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.5793474376201629, "step": 2100, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.1533806946128607, "clip_ratio/high_mean": 0.02256658235564828, "clip_ratio/low_mean": 0.002787484592408873, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025354066491127016, "completion_length": 74.33999938964844, "epoch": 0.40534050523484777, "grad_norm": 4.315516471862793, "kl": 1.0426696628332137, "learning_rate": 6.822194757320354e-07, "loss": 0.0019, "reward": 1.6090970516204834, "reward_std": 0.1758709292858839, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.5561110019683838, "step": 2110, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.1336930485442281, "clip_ratio/high_mean": 0.021989132883027195, "clip_ratio/low_mean": 0.0070218192064203325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0290109527297318, "completion_length": 73.0250015258789, "epoch": 0.40726155028335415, "grad_norm": 18.143117904663086, "kl": 0.4288759011775255, "learning_rate": 6.796203953080007e-07, "loss": 0.0005, "reward": 1.72017080783844, "reward_std": 0.22243313789367675, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6144603788852692, "step": 2120, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.08061643750406802, "clip_ratio/high_mean": 0.011467291257577016, "clip_ratio/low_mean": 0.011395246715983376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022862538695335388, "completion_length": 68.66250152587891, "epoch": 0.4091825953318605, "grad_norm": 1.0005404949188232, "kl": 0.47304695919156076, "learning_rate": 6.770165910220709e-07, "loss": 0.0006, "reward": 1.4831626653671264, "reward_std": 0.1916220799088478, "rewards/code_format_reward": 0.9837499856948853, "rewards/code_reward": 0.4956438183784485, "step": 2130, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.06393485199660062, "clip_ratio/high_mean": 0.011905963439494372, "clip_ratio/low_mean": 0.0023792986408807336, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01428526220843196, "completion_length": 74.32250137329102, "epoch": 0.4111036403803669, "grad_norm": 2.491830825805664, "kl": 2.213325946778059, "learning_rate": 6.744081577668115e-07, "loss": 0.1532, "reward": 1.7680244207382203, "reward_std": 0.18317916095256806, "rewards/code_format_reward": 0.9687499880790711, "rewards/code_reward": 0.6418246865272522, "step": 2140, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.03965856842696667, "clip_ratio/high_mean": 0.00730013819411397, "clip_ratio/low_mean": 0.0031650666729547083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010465204739011824, "completion_length": 73.1050018310547, "epoch": 0.41302468542887333, "grad_norm": 0.353427916765213, "kl": 0.2898652456700802, "learning_rate": 6.717951906034856e-07, "loss": -0.0015, "reward": 1.6113624095916748, "reward_std": 0.09930019937455654, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5591186702251434, "step": 2150, "zero_std_ratio": 0.725 }, { "clip_ratio/high_max": 0.03382167350500822, "clip_ratio/high_mean": 0.005409902473911643, "clip_ratio/low_mean": 0.0024156818573828785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007825584337115287, "completion_length": 68.12750091552735, "epoch": 0.4149457304773797, "grad_norm": 3.9950575828552246, "kl": 0.789361334592104, "learning_rate": 6.691777847585883e-07, "loss": 0.048, "reward": 1.5698497295379639, "reward_std": 0.1552659712731838, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.5417998552322387, "step": 2160, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.026565033989027143, "clip_ratio/high_mean": 0.004212364956038073, "clip_ratio/low_mean": 0.0013839059392921627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005596270889509469, "completion_length": 70.80999984741212, "epoch": 0.4168667755258861, "grad_norm": 1.3910998106002808, "kl": 1.4257395297288895, "learning_rate": 6.665560356203784e-07, "loss": 0.8731, "reward": 1.4512264728546143, "reward_std": 0.14117379933595658, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.47748821377754214, "step": 2170, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.09546168451197445, "clip_ratio/high_mean": 0.01459201174438931, "clip_ratio/low_mean": 0.006060798710677773, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020652810629690065, "completion_length": 67.89000091552734, "epoch": 0.41878782057439246, "grad_norm": 0.6732813715934753, "kl": 1.1321026906371117, "learning_rate": 6.639300387353999e-07, "loss": -0.0002, "reward": 1.3501636981964111, "reward_std": 0.21670444533228875, "rewards/code_format_reward": 0.9924999833106994, "rewards/code_reward": 0.42695685029029845, "step": 2180, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.11407874876167626, "clip_ratio/high_mean": 0.01725804756570142, "clip_ratio/low_mean": 0.0015681478500482627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018826195126166567, "completion_length": 68.7525016784668, "epoch": 0.42070886562289883, "grad_norm": 1.5759879350662231, "kl": 0.4211964398622513, "learning_rate": 6.612998898050014e-07, "loss": -0.0021, "reward": 1.7485667228698731, "reward_std": 0.16526954025030136, "rewards/code_format_reward": 0.9612500071525574, "rewards/code_reward": 0.6339708626270294, "step": 2190, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.10751554854214192, "clip_ratio/high_mean": 0.013745604571886361, "clip_ratio/low_mean": 0.010064921525190585, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023810526612214743, "completion_length": 62.88500137329102, "epoch": 0.42262991067140526, "grad_norm": 2.4066975116729736, "kl": 0.7549011036753654, "learning_rate": 6.586656846818477e-07, "loss": 0.2999, "reward": 1.6932018756866456, "reward_std": 0.1608109436929226, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5991009473800659, "step": 2200, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.019488381547853352, "clip_ratio/high_mean": 0.003436583065195009, "clip_ratio/low_mean": 0.002801175639615394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062377589056268334, "completion_length": 72.55250244140625, "epoch": 0.42455095571991164, "grad_norm": 2.0696611404418945, "kl": 5.306586292386055, "learning_rate": 6.56027519366427e-07, "loss": 0.011, "reward": 1.611876368522644, "reward_std": 0.1603232156485319, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.5596881568431854, "step": 2210, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.05246827639639377, "clip_ratio/high_mean": 0.00732308179140091, "clip_ratio/low_mean": 0.0034836977836675944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010806779406266287, "completion_length": 64.31750183105468, "epoch": 0.426472000768418, "grad_norm": 0.12577353417873383, "kl": 0.5850224502384662, "learning_rate": 6.533854900035516e-07, "loss": -0.0015, "reward": 1.7735862731933594, "reward_std": 0.13040905613452197, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6399181246757507, "step": 2220, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.24315445288084447, "clip_ratio/high_mean": 0.031706276966724546, "clip_ratio/low_mean": 0.011593326600268484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04329960328759626, "completion_length": 72.80000152587891, "epoch": 0.4283930458169244, "grad_norm": 4.765016078948975, "kl": 1.5887107208371163, "learning_rate": 6.507396928788548e-07, "loss": 0.0023, "reward": 1.6477301597595215, "reward_std": 0.12887158915400504, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5801151037216187, "step": 2230, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.044854282308369874, "clip_ratio/high_mean": 0.007485381804872304, "clip_ratio/low_mean": 0.0028356918206554837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01032107327482663, "completion_length": 66.63000183105468, "epoch": 0.4303140908654308, "grad_norm": 1.5923104286193848, "kl": 0.9431760296225548, "learning_rate": 6.480902244152813e-07, "loss": -0.0021, "reward": 1.4723083972930908, "reward_std": 0.13776133116334677, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.48865418434143065, "step": 2240, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.08558401605114341, "clip_ratio/high_mean": 0.01418596402509138, "clip_ratio/low_mean": 0.005716345021210145, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019902308681048454, "completion_length": 67.75250015258788, "epoch": 0.4322351359139372, "grad_norm": 4.213563442230225, "kl": 0.7182839468121529, "learning_rate": 6.454371811695732e-07, "loss": -0.0032, "reward": 1.5263491868972778, "reward_std": 0.215225650370121, "rewards/code_format_reward": 0.975000011920929, "rewards/code_reward": 0.51942458152771, "step": 2250, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.17924602022394537, "clip_ratio/high_mean": 0.02314122476382181, "clip_ratio/low_mean": 0.006780697987414897, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029921922995708884, "completion_length": 67.31500091552735, "epoch": 0.43415618096244357, "grad_norm": 2.018653392791748, "kl": 0.644180704653263, "learning_rate": 6.427806598287522e-07, "loss": -0.0031, "reward": 1.8284268617630004, "reward_std": 0.1590463936328888, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.6657759308815002, "step": 2260, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.26562999933958054, "clip_ratio/high_mean": 0.04080731603316963, "clip_ratio/low_mean": 0.002605196795775555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043412512401118875, "completion_length": 64.91250076293946, "epoch": 0.43607722601094995, "grad_norm": 2.8014633655548096, "kl": 1.4193657219409943, "learning_rate": 6.401207572065942e-07, "loss": 0.0075, "reward": 1.6795406818389893, "reward_std": 0.1340640414506197, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.5913328170776367, "step": 2270, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.15035496577620505, "clip_ratio/high_mean": 0.021555275144055485, "clip_ratio/low_mean": 0.007308500797080342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028863775311037898, "completion_length": 83.20750122070312, "epoch": 0.4379982710594563, "grad_norm": 5.3116655349731445, "kl": 1.7165004715323449, "learning_rate": 6.374575702401019e-07, "loss": -0.0031, "reward": 1.694450354576111, "reward_std": 0.2935485541820526, "rewards/code_format_reward": 0.9650000095367431, "rewards/code_reward": 0.6059751749038697, "step": 2280, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.05178634703624994, "clip_ratio/high_mean": 0.007199086344917305, "clip_ratio/low_mean": 0.004959188599605114, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012158275028923526, "completion_length": 68.35500106811523, "epoch": 0.43991931610796275, "grad_norm": 11.67419719696045, "kl": 0.8460408747196198, "learning_rate": 6.347911959859725e-07, "loss": -0.0013, "reward": 1.6080287456512452, "reward_std": 0.2270718976855278, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.5615143775939941, "step": 2290, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.07622572851832957, "clip_ratio/high_mean": 0.011604995708330535, "clip_ratio/low_mean": 0.0013341609621420503, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012939156647189521, "completion_length": 68.30750274658203, "epoch": 0.44184036115646913, "grad_norm": 332.7762451171875, "kl": 0.7540152728557586, "learning_rate": 6.321217316170599e-07, "loss": 0.1015, "reward": 1.4850183725357056, "reward_std": 0.1393202841281891, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.49469670057296755, "step": 2300, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.16746083926409483, "clip_ratio/high_mean": 0.02103413282893598, "clip_ratio/low_mean": 0.0068577720652683635, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027891904639545828, "completion_length": 64.55000152587891, "epoch": 0.4437614062049755, "grad_norm": 0.36056017875671387, "kl": 0.4329931303858757, "learning_rate": 6.294492744188335e-07, "loss": 0.0002, "reward": 1.4963040232658387, "reward_std": 0.07247132882475853, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.502214539051056, "step": 2310, "zero_std_ratio": 0.725 }, { "clip_ratio/high_max": 0.05429213300812989, "clip_ratio/high_mean": 0.007803994990536012, "clip_ratio/low_mean": 0.008226435555843636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01603043078503106, "completion_length": 69.78750228881836, "epoch": 0.4456824512534819, "grad_norm": 0.1676941215991974, "kl": 0.276796979829669, "learning_rate": 6.267739217858329e-07, "loss": -0.0028, "reward": 1.7269956827163697, "reward_std": 0.1742506742477417, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6156853199005127, "step": 2320, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.03961263382807374, "clip_ratio/high_mean": 0.00831791803939268, "clip_ratio/low_mean": 0.008615480939624831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016933399019762874, "completion_length": 70.70000228881835, "epoch": 0.44760349630198826, "grad_norm": 6.724217891693115, "kl": 0.544577070325613, "learning_rate": 6.240957712181186e-07, "loss": -0.0041, "reward": 1.3949034690856934, "reward_std": 0.21950918734073638, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.45307670831680297, "step": 2330, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.2168802363506984, "clip_ratio/high_mean": 0.03705684195374488, "clip_ratio/low_mean": 0.0028890643618069587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03994590537622571, "completion_length": 74.0625, "epoch": 0.4495245413504947, "grad_norm": 3.073554277420044, "kl": 0.6307030320167542, "learning_rate": 6.214149203177182e-07, "loss": -0.0002, "reward": 1.679004979133606, "reward_std": 0.1860196329653263, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.5916899800300598, "step": 2340, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.0932310588657856, "clip_ratio/high_mean": 0.014429462677799165, "clip_ratio/low_mean": 0.0065534046734683216, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020982867432758213, "completion_length": 67.18000183105468, "epoch": 0.45144558639900106, "grad_norm": 3595.65576171875, "kl": 1.140541896224022, "learning_rate": 6.187314667850697e-07, "loss": 0.1447, "reward": 1.4676954984664916, "reward_std": 0.20568167939782142, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.4875977456569672, "step": 2350, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.036702672578394414, "clip_ratio/high_mean": 0.006752843782305717, "clip_ratio/low_mean": 0.008269340678816661, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015022184286499396, "completion_length": 80.57000122070312, "epoch": 0.45336663144750744, "grad_norm": 2.759171724319458, "kl": 10.568821829557418, "learning_rate": 6.160455084154613e-07, "loss": 1.8532, "reward": 1.4545687198638917, "reward_std": 0.23069845288991928, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.4813468337059021, "step": 2360, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.08144733654335141, "clip_ratio/high_mean": 0.014263840962667019, "clip_ratio/low_mean": 0.0019872021744959056, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01625104369595647, "completion_length": 71.55750045776367, "epoch": 0.4552876764960138, "grad_norm": 1.9088038206100464, "kl": 1.3571255028247833, "learning_rate": 6.133571430954667e-07, "loss": 0.0026, "reward": 1.5344175338745116, "reward_std": 0.16607576459646226, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.5237712502479553, "step": 2370, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.10026950519531966, "clip_ratio/high_mean": 0.01315019663888961, "clip_ratio/low_mean": 0.00221524270309601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015365439187735318, "completion_length": 72.40750122070312, "epoch": 0.4572087215445202, "grad_norm": 4.301158428192139, "kl": 0.6290791854262352, "learning_rate": 6.106664687993782e-07, "loss": -0.0032, "reward": 1.5749263525009156, "reward_std": 0.16429235637187958, "rewards/code_format_reward": 0.9724999785423278, "rewards/code_reward": 0.5443381488323211, "step": 2380, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.10308512919582427, "clip_ratio/high_mean": 0.016338009486207738, "clip_ratio/low_mean": 0.0017032683303114028, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018041277857264504, "completion_length": 76.43750228881837, "epoch": 0.4591297665930266, "grad_norm": 6.198258876800537, "kl": 408884378.2116049, "learning_rate": 6.079735835856362e-07, "loss": 1157747.0, "reward": 1.5280384778976441, "reward_std": 0.19424125757068395, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.517456728219986, "step": 2390, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.24319633510895072, "clip_ratio/high_mean": 0.037530579004669565, "clip_ratio/low_mean": 0.004886501970031531, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04241708111949265, "completion_length": 74.5425018310547, "epoch": 0.461050811641533, "grad_norm": 5.885474681854248, "kl": 1.4351533338427545, "learning_rate": 6.052785855932548e-07, "loss": 0.123, "reward": 1.4949720859527589, "reward_std": 0.20392217636108398, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5002985119819641, "step": 2400, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.23103972100652753, "clip_ratio/high_mean": 0.0305588347138837, "clip_ratio/low_mean": 0.002339675696566701, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03289851036388427, "completion_length": 70.01750106811524, "epoch": 0.4629718566900394, "grad_norm": 0.8806352615356445, "kl": 1.6503019407391548, "learning_rate": 6.025815730382463e-07, "loss": 0.8832, "reward": 1.6588483333587647, "reward_std": 0.19124363958835602, "rewards/code_format_reward": 0.9725000143051148, "rewards/code_reward": 0.5862991452217102, "step": 2410, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.02757756725186482, "clip_ratio/high_mean": 0.005332520017691422, "clip_ratio/low_mean": 0.019763218611478804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025095738274103496, "completion_length": 71.59250183105469, "epoch": 0.46489290173854575, "grad_norm": 1.2440141439437866, "kl": 2.751401698589325, "learning_rate": 5.998826442100412e-07, "loss": 362174.725, "reward": 1.5159764885902405, "reward_std": 0.1902527991682291, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5129882216453552, "step": 2420, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.2530499072512612, "clip_ratio/high_mean": 0.03376921496528666, "clip_ratio/low_mean": 0.0062858725665137175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04005508716509212, "completion_length": 76.0425018310547, "epoch": 0.4668139467870522, "grad_norm": 66.4449234008789, "kl": 2164149.3861157326, "learning_rate": 5.971818974679065e-07, "loss": 2449736.0, "reward": 1.6650853157043457, "reward_std": 0.24712301939725875, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.585355132818222, "step": 2430, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.2950001623481512, "clip_ratio/high_mean": 0.042542998865246776, "clip_ratio/low_mean": 0.0068845050991512835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.049427504313644025, "completion_length": 75.27000198364257, "epoch": 0.46873499183555856, "grad_norm": 2.206911563873291, "kl": 11.237105096876622, "learning_rate": 5.944794312373607e-07, "loss": 0.0298, "reward": 1.7914002895355225, "reward_std": 0.22826257348060608, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.649450159072876, "step": 2440, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.07424517879262567, "clip_ratio/high_mean": 0.010772422759328038, "clip_ratio/low_mean": 0.010833968574297614, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02160639046342112, "completion_length": 71.40500183105469, "epoch": 0.47065603688406493, "grad_norm": 76503500980224.0, "kl": 393.06428125053645, "learning_rate": 5.917753440065869e-07, "loss": 909725593.6, "reward": 1.4975883960723877, "reward_std": 0.28928079828619957, "rewards/code_format_reward": 0.9612499833106994, "rewards/code_reward": 0.5084816813468933, "step": 2450, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.14294563261792065, "clip_ratio/high_mean": 0.019953654275741427, "clip_ratio/low_mean": 0.004493102640844881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024446757195983083, "completion_length": 79.72250061035156, "epoch": 0.4725770819325713, "grad_norm": 0.778223991394043, "kl": 2.2069298341870307, "learning_rate": 5.89069734322844e-07, "loss": -0.0085, "reward": 1.5203648328781127, "reward_std": 0.1896197520196438, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5173698782920837, "step": 2460, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.0473501511849463, "clip_ratio/high_mean": 0.006591684772865846, "clip_ratio/low_mean": 0.0004718510695965961, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007063536025816575, "completion_length": 76.70500183105469, "epoch": 0.4744981269810777, "grad_norm": 0.5978448390960693, "kl": 0.6427325546741486, "learning_rate": 5.863627007888745e-07, "loss": 0.0007, "reward": 1.7259918212890626, "reward_std": 0.1515914086252451, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.618620878458023, "step": 2470, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.048674007039517166, "clip_ratio/high_mean": 0.010258768184576184, "clip_ratio/low_mean": 0.012727768435433972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022986536961980164, "completion_length": 78.06500244140625, "epoch": 0.4764191720295841, "grad_norm": 4.168500900268555, "kl": 0.5699560895562172, "learning_rate": 5.836543420593119e-07, "loss": -0.0011, "reward": 1.6060274362564086, "reward_std": 0.2864475339651108, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.557388699054718, "step": 2480, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.014699839102104307, "clip_ratio/high_mean": 0.0019118846452329309, "clip_ratio/low_mean": 0.0005179177765967325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024298023723531514, "completion_length": 85.17000122070313, "epoch": 0.4783402170780905, "grad_norm": 4.149423599243164, "kl": 1.3347756370902062, "learning_rate": 5.809447568370843e-07, "loss": 0.0102, "reward": 1.621114158630371, "reward_std": 0.21484595835208892, "rewards/code_format_reward": 0.9774999856948853, "rewards/code_reward": 0.5661820948123932, "step": 2490, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.029943293944234027, "clip_ratio/high_mean": 0.006927184848609613, "clip_ratio/low_mean": 0.0035072380007477475, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010434422721300508, "completion_length": 83.86250228881836, "epoch": 0.48026126212659687, "grad_norm": 5.97049617767334, "kl": 4.178053397685289, "learning_rate": 5.782340438698185e-07, "loss": -0.0063, "reward": 1.6789068222045898, "reward_std": 0.25779220163822175, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5903908908367157, "step": 2500, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.08102847400587052, "clip_ratio/high_mean": 0.01392527524731122, "clip_ratio/low_mean": 0.0045509199095249645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018476195022230968, "completion_length": 83.11250152587891, "epoch": 0.48218230717510324, "grad_norm": 5.283038139343262, "kl": 1.111867392808199, "learning_rate": 5.755223019462401e-07, "loss": 17.941, "reward": 1.577300524711609, "reward_std": 0.22725088596343995, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5442752420902253, "step": 2510, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.06463829884305597, "clip_ratio/high_mean": 0.008858744835015387, "clip_ratio/low_mean": 0.0054666692391037944, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01432541401591152, "completion_length": 85.7625015258789, "epoch": 0.4841033522236096, "grad_norm": 8.200135231018066, "kl": 0.4475974731147289, "learning_rate": 5.728096298925745e-07, "loss": -0.0057, "reward": 1.5549763917922974, "reward_std": 0.23400793820619584, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5331131994724274, "step": 2520, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.031378739466890695, "clip_ratio/high_mean": 0.00507326218066737, "clip_ratio/low_mean": 0.010504274675622583, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015577536821365357, "completion_length": 79.96750030517578, "epoch": 0.48602439727211605, "grad_norm": 2.6766583919525146, "kl": 0.4622874528169632, "learning_rate": 5.700961265689434e-07, "loss": -0.0011, "reward": 1.8167934179306031, "reward_std": 0.30146218538284303, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.6621467113494873, "step": 2530, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.07196793179027736, "clip_ratio/high_mean": 0.013576928357360884, "clip_ratio/low_mean": 0.0018521397636504845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015429067742661572, "completion_length": 87.03000030517578, "epoch": 0.4879454423206224, "grad_norm": 1.347899317741394, "kl": 0.7047492057085037, "learning_rate": 5.673818908657644e-07, "loss": -0.0079, "reward": 1.6893932342529296, "reward_std": 0.24144218415021895, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.5981341004371643, "step": 2540, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.03861275149974972, "clip_ratio/high_mean": 0.005072360605117865, "clip_ratio/low_mean": 0.0013027720240643248, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0063751326058991255, "completion_length": 78.80750122070313, "epoch": 0.4898664873691288, "grad_norm": 1.7946380376815796, "kl": 0.7765734851360321, "learning_rate": 5.646670217001451e-07, "loss": 0.004, "reward": 1.8638887882232666, "reward_std": 0.1732952728867531, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6831943988800049, "step": 2550, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.044772564456798135, "clip_ratio/high_mean": 0.008199371959199198, "clip_ratio/low_mean": 0.007188984929234721, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015388356836047024, "completion_length": 92.03750305175781, "epoch": 0.4917875324176352, "grad_norm": 8241.9619140625, "kl": 3.7090125039219854, "learning_rate": 5.619516180122789e-07, "loss": 0.2194, "reward": 1.346347188949585, "reward_std": 0.3114967554807663, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.4303610801696777, "step": 2560, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.136108908476308, "clip_ratio/high_mean": 0.01777363264700398, "clip_ratio/low_mean": 0.0005986301795928739, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018372263037599625, "completion_length": 77.66500091552734, "epoch": 0.4937085774661416, "grad_norm": 2.8724048137664795, "kl": 0.30402788892388344, "learning_rate": 5.592357787618398e-07, "loss": -0.0095, "reward": 1.235116672515869, "reward_std": 0.16121466904878617, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.3706833332777023, "step": 2570, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.21411730786785482, "clip_ratio/high_mean": 0.02751181152416393, "clip_ratio/low_mean": 0.005141469169757329, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03265328073175624, "completion_length": 77.39250030517579, "epoch": 0.495629622514648, "grad_norm": 3.1119463443756104, "kl": 0.516096468269825, "learning_rate": 5.565196029243746e-07, "loss": -0.0097, "reward": 1.7056148529052735, "reward_std": 0.26717675626277926, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.6065573751926422, "step": 2580, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.06655758274719119, "clip_ratio/high_mean": 0.00869752592407167, "clip_ratio/low_mean": 0.0007154849590733647, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009413010929711163, "completion_length": 78.71000061035156, "epoch": 0.49755066756315436, "grad_norm": 9.566883087158203, "kl": 6.985853771865368, "learning_rate": 5.538031894876971e-07, "loss": 0.0154, "reward": 1.8047074317932128, "reward_std": 0.2406391829252243, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.655791187286377, "step": 2590, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.04153572088107467, "clip_ratio/high_mean": 0.0076960999285802245, "clip_ratio/low_mean": 0.00053562533139484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008231725194491446, "completion_length": 87.35249938964844, "epoch": 0.49947171261166073, "grad_norm": 4.163487911224365, "kl": 3.02228729724884, "learning_rate": 5.510866374482799e-07, "loss": 0.0014, "reward": 1.7271404266357422, "reward_std": 0.20059744864702225, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6170076906681061, "step": 2600, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.09242721796035766, "clip_ratio/high_mean": 0.01333312913775444, "clip_ratio/low_mean": 0.0022988114287727512, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01563194077461958, "completion_length": 86.66250152587891, "epoch": 0.5013927576601671, "grad_norm": 1.7816847562789917, "kl": 2.135231140255928, "learning_rate": 5.48370045807647e-07, "loss": -0.0043, "reward": 1.5687429666519166, "reward_std": 0.22490316033363342, "rewards/code_format_reward": 0.9524999976158142, "rewards/code_reward": 0.5462464988231659, "step": 2610, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.10432031177915632, "clip_ratio/high_mean": 0.01704162026871927, "clip_ratio/low_mean": 0.0019667694039526397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01900838967994787, "completion_length": 98.58000183105469, "epoch": 0.5033138027086735, "grad_norm": 2.1069369316101074, "kl": 2.131927290558815, "learning_rate": 5.456535135687656e-07, "loss": -0.0069, "reward": 1.6628828048706055, "reward_std": 0.23133169412612914, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.5858163475990296, "step": 2620, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.04196612173691392, "clip_ratio/high_mean": 0.0064837948535569016, "clip_ratio/low_mean": 0.0025595034239813685, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009043298207689076, "completion_length": 86.6, "epoch": 0.5052348477571799, "grad_norm": 15.670801162719727, "kl": 2.1330361180007458, "learning_rate": 5.429371397324378e-07, "loss": -0.0054, "reward": 1.4884859561920165, "reward_std": 0.3388957381248474, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.497055447101593, "step": 2630, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.036724881688132885, "clip_ratio/high_mean": 0.005309284973191097, "clip_ratio/low_mean": 0.003665669827023521, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00897495478275232, "completion_length": 84.73250122070313, "epoch": 0.5071558928056863, "grad_norm": 6.480928421020508, "kl": 0.9241176024079323, "learning_rate": 5.402210232936934e-07, "loss": -0.0009, "reward": 1.792254877090454, "reward_std": 0.29597480297088624, "rewards/code_format_reward": 0.9974999904632569, "rewards/code_reward": 0.646752405166626, "step": 2640, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.109601711621508, "clip_ratio/high_mean": 0.015215938963228837, "clip_ratio/low_mean": 0.0034228902019094675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018638829072006046, "completion_length": 88.82750244140625, "epoch": 0.5090769378541927, "grad_norm": 5.080334186553955, "kl": 0.6404796183109284, "learning_rate": 5.37505263238181e-07, "loss": -0.0032, "reward": 1.7266260623931884, "reward_std": 0.27733459770679475, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.6148755311965942, "step": 2650, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.0589832967845723, "clip_ratio/high_mean": 0.009531341239926406, "clip_ratio/low_mean": 0.00046608211705461143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00999742336862255, "completion_length": 88.69250183105468, "epoch": 0.510997982902699, "grad_norm": 7.949027061462402, "kl": 0.6428510576486588, "learning_rate": 5.347899585385619e-07, "loss": -0.0028, "reward": 1.8208046436309815, "reward_std": 0.32592435777187345, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6647772669792176, "step": 2660, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.15465617645531893, "clip_ratio/high_mean": 0.022943795099854468, "clip_ratio/low_mean": 0.0016213681577937678, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024565163621446118, "completion_length": 87.13250274658203, "epoch": 0.5129190279512055, "grad_norm": 34.059959411621094, "kl": 0.5654895901679993, "learning_rate": 5.320752081509019e-07, "loss": -0.0048, "reward": 1.7013320207595826, "reward_std": 0.27322621941566466, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6037909984588623, "step": 2670, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.04982744911685586, "clip_ratio/high_mean": 0.007483145385049283, "clip_ratio/low_mean": 0.0010201202865573577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008503265725448728, "completion_length": 91.09750213623047, "epoch": 0.5148400729997118, "grad_norm": 3.5055086612701416, "kl": 0.5736653476953506, "learning_rate": 5.293611110110661e-07, "loss": -0.0032, "reward": 1.672940969467163, "reward_std": 0.24722242057323457, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.5902204990386963, "step": 2680, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.06388061246834695, "clip_ratio/high_mean": 0.008427212300011888, "clip_ratio/low_mean": 0.000504276818537619, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008931488974485546, "completion_length": 84.06750183105468, "epoch": 0.5167611180482182, "grad_norm": 1.1749032735824585, "kl": 0.6257337100803853, "learning_rate": 5.266477660311123e-07, "loss": -0.0049, "reward": 1.883350706100464, "reward_std": 0.1923319399356842, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6929253697395324, "step": 2690, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.032716090651229025, "clip_ratio/high_mean": 0.004722256149398163, "clip_ratio/low_mean": 0.00025295682498835956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0049752129940316085, "completion_length": 101.05250244140625, "epoch": 0.5186821630967247, "grad_norm": 2.250870704650879, "kl": 0.3336128618568182, "learning_rate": 5.239352720956869e-07, "loss": -0.0014, "reward": 1.803996729850769, "reward_std": 0.3182943195104599, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.6548108696937561, "step": 2700, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.057230279734358194, "clip_ratio/high_mean": 0.01016470161266625, "clip_ratio/low_mean": 0.001601585964090191, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01176628761459142, "completion_length": 92.2125015258789, "epoch": 0.520603208145231, "grad_norm": 1.7528822422027588, "kl": 0.30482072457671167, "learning_rate": 5.212237280584214e-07, "loss": -0.0012, "reward": 1.6862072706222535, "reward_std": 0.2419889122247696, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5971661269664764, "step": 2710, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.060608417179901154, "clip_ratio/high_mean": 0.00894762706157053, "clip_ratio/low_mean": 0.0007068538383464329, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009654480942117516, "completion_length": 92.53000183105469, "epoch": 0.5225242531937374, "grad_norm": 274.7859802246094, "kl": 1.1552282243967056, "learning_rate": 5.185132327383284e-07, "loss": 0.1157, "reward": 1.7673757076263428, "reward_std": 0.3102965742349625, "rewards/code_format_reward": 0.9887499809265137, "rewards/code_reward": 0.6365003228187561, "step": 2720, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.15797494337894022, "clip_ratio/high_mean": 0.02083307456341572, "clip_ratio/low_mean": 0.009061275536078028, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02989435092313215, "completion_length": 88.4500015258789, "epoch": 0.5244452982422437, "grad_norm": 4.456059455871582, "kl": 1.3563814774155616, "learning_rate": 5.158038849162024e-07, "loss": 0.0014, "reward": 1.5090751886367797, "reward_std": 0.23531495928764343, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.5098500728607178, "step": 2730, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.054899439518339935, "clip_ratio/high_mean": 0.008722224002121947, "clip_ratio/low_mean": 0.0002903619286371395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009012586006429046, "completion_length": 85.53750305175781, "epoch": 0.5263663432907502, "grad_norm": 1.951745867729187, "kl": 0.5144835211336612, "learning_rate": 5.130957833310177e-07, "loss": -0.0017, "reward": 1.7648489713668822, "reward_std": 0.1646851196885109, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6355494737625123, "step": 2740, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.10675688227638602, "clip_ratio/high_mean": 0.016114802553784103, "clip_ratio/low_mean": 0.0011672286826069466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0172820313135162, "completion_length": 97.54000244140624, "epoch": 0.5282873883392566, "grad_norm": 2.7782888412475586, "kl": 0.484642443805933, "learning_rate": 5.103890266763317e-07, "loss": -0.0017, "reward": 1.7005881071090698, "reward_std": 0.17179570347070694, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6046690165996551, "step": 2750, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.04718098001321778, "clip_ratio/high_mean": 0.0069286267200368455, "clip_ratio/low_mean": 0.0022616338639636522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009190260457398836, "completion_length": 91.71500091552734, "epoch": 0.5302084333877629, "grad_norm": 1.6982417106628418, "kl": 0.40430613309144975, "learning_rate": 5.076837135966868e-07, "loss": -0.0001, "reward": 1.7166170120239257, "reward_std": 0.12425057031214237, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.6111209750175476, "step": 2760, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.01563614197075367, "clip_ratio/high_mean": 0.0026440696616191416, "clip_ratio/low_mean": 0.0005518664722330869, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031959360814653335, "completion_length": 93.39000091552734, "epoch": 0.5321294784362693, "grad_norm": 0.12160471081733704, "kl": 0.3728202864527702, "learning_rate": 5.049799426840166e-07, "loss": -0.0008, "reward": 1.8690509557724, "reward_std": 0.20764816105365752, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6889004349708557, "step": 2770, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.08729656506329775, "clip_ratio/high_mean": 0.013787648268043995, "clip_ratio/low_mean": 0.0016946192088653333, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015482267551124095, "completion_length": 83.49000091552735, "epoch": 0.5340505234847757, "grad_norm": 2.061514377593994, "kl": 0.2805942878127098, "learning_rate": 5.02277812474052e-07, "loss": -0.0005, "reward": 1.5558062076568604, "reward_std": 0.18851915150880813, "rewards/code_format_reward": 0.9924999833106994, "rewards/code_reward": 0.529778128862381, "step": 2780, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.047503374773077665, "clip_ratio/high_mean": 0.007121381178149022, "clip_ratio/low_mean": 0.003943280148087069, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011064661786076613, "completion_length": 90.33500213623047, "epoch": 0.5359715685332821, "grad_norm": 2.8578014373779297, "kl": 0.9348004341125489, "learning_rate": 4.995774214427299e-07, "loss": -0.0083, "reward": 1.5787676095962524, "reward_std": 0.24208036959171295, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.5412587821483612, "step": 2790, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.0682177669601515, "clip_ratio/high_mean": 0.010770523789688013, "clip_ratio/low_mean": 0.0030192029429599644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01378972665697802, "completion_length": 97.35500030517578, "epoch": 0.5378926135817885, "grad_norm": 3.7523272037506104, "kl": 0.49133365601301193, "learning_rate": 4.968788680026062e-07, "loss": 0.0019, "reward": 1.8675085306167603, "reward_std": 0.3084888607263565, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.6887542605400085, "step": 2800, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.051895615691319105, "clip_ratio/high_mean": 0.007092137623112648, "clip_ratio/low_mean": 0.0009049189888173714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007997056643944234, "completion_length": 84.84000244140626, "epoch": 0.5398136586302948, "grad_norm": 6879.29296875, "kl": 41.48030465692282, "learning_rate": 4.941822504992665e-07, "loss": 0.3058, "reward": 1.8456867456436157, "reward_std": 0.17148398756980895, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.6750308394432067, "step": 2810, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.055018832255154845, "clip_ratio/high_mean": 0.009382021031342447, "clip_ratio/low_mean": 0.0012206103128846735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0106026312103495, "completion_length": 92.16250152587891, "epoch": 0.5417347036788013, "grad_norm": 1.7369046211242676, "kl": 39.203208688646555, "learning_rate": 4.914876672077444e-07, "loss": 0.0739, "reward": 1.7605399131774901, "reward_std": 0.22667703181505203, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6337074398994446, "step": 2820, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.07783090919256211, "clip_ratio/high_mean": 0.013414820143952965, "clip_ratio/low_mean": 0.004346576618263498, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017761396686546506, "completion_length": 86.75750122070312, "epoch": 0.5436557487273077, "grad_norm": 1.3669426441192627, "kl": 0.6254852950572968, "learning_rate": 4.887952163289387e-07, "loss": -0.0037, "reward": 1.7524815320968627, "reward_std": 0.18003067299723624, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.6271782517433167, "step": 2830, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.032230034773238006, "clip_ratio/high_mean": 0.005299290179391391, "clip_ratio/low_mean": 0.002357826306251809, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007657116468180902, "completion_length": 92.63250122070312, "epoch": 0.545576793775814, "grad_norm": 6.607495307922363, "kl": 0.6308505192399025, "learning_rate": 4.861049959860352e-07, "loss": -0.0026, "reward": 1.879476284980774, "reward_std": 0.21936110258102418, "rewards/code_format_reward": 0.9787499785423279, "rewards/code_reward": 0.6950506567955017, "step": 2840, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.07762792855501174, "clip_ratio/high_mean": 0.012513539101928473, "clip_ratio/low_mean": 0.0019065381304244511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014420077262911946, "completion_length": 79.06750183105468, "epoch": 0.5474978388243205, "grad_norm": 2.1427297592163086, "kl": 0.7649303644895553, "learning_rate": 4.834171042209299e-07, "loss": -0.0016, "reward": 1.7679643869400024, "reward_std": 0.2242477983236313, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.640857207775116, "step": 2850, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.10583647433668375, "clip_ratio/high_mean": 0.015110303135588764, "clip_ratio/low_mean": 0.0026529163093073293, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017763219110202046, "completion_length": 89.09499969482422, "epoch": 0.5494188838728268, "grad_norm": 5.39391565322876, "kl": 1.1404950305819512, "learning_rate": 4.807316389906573e-07, "loss": 0.0011, "reward": 1.6588359355926514, "reward_std": 0.23765334486961365, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5822304427623749, "step": 2860, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.13843556838110088, "clip_ratio/high_mean": 0.022050847904756664, "clip_ratio/low_mean": 0.006549120438285172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028599968180060387, "completion_length": 85.02750091552734, "epoch": 0.5513399289213332, "grad_norm": 6.328859329223633, "kl": 1.3457766875624657, "learning_rate": 4.780486981638194e-07, "loss": 0.004, "reward": 1.4554174661636352, "reward_std": 0.291735103726387, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.4839587390422821, "step": 2870, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.03891797037795186, "clip_ratio/high_mean": 0.005045192840043455, "clip_ratio/low_mean": 0.0029750549525488167, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008020247751846909, "completion_length": 85.00500183105468, "epoch": 0.5532609739698396, "grad_norm": 3.746497392654419, "kl": 1.5130328834056854, "learning_rate": 4.75368379517019e-07, "loss": -0.0033, "reward": 1.8564167737960815, "reward_std": 0.14603331089019775, "rewards/code_format_reward": 0.9987499952316284, "rewards/code_reward": 0.6785208344459533, "step": 2880, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.23034826815128326, "clip_ratio/high_mean": 0.037423617928288876, "clip_ratio/low_mean": 0.0014674156729597599, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03889103210531175, "completion_length": 81.40750122070312, "epoch": 0.555182019018346, "grad_norm": 7.282290458679199, "kl": 0.5326755799353122, "learning_rate": 4.7269078073129696e-07, "loss": 0.0032, "reward": 1.700506567955017, "reward_std": 0.3424434006214142, "rewards/code_format_reward": 0.9700000047683716, "rewards/code_reward": 0.6077533006668091, "step": 2890, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.06711816978640854, "clip_ratio/high_mean": 0.008929741784231737, "clip_ratio/low_mean": 0.0021304662863258273, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011060208058916032, "completion_length": 75.07750244140625, "epoch": 0.5571030640668524, "grad_norm": 3.718710422515869, "kl": 0.3807241953909397, "learning_rate": 4.7001599938857204e-07, "loss": -0.0016, "reward": 1.6593467235565185, "reward_std": 0.2742844566702843, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.5821733415126801, "step": 2900, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.10134089784696698, "clip_ratio/high_mean": 0.014033923938404769, "clip_ratio/low_mean": 0.0036910680413711817, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01772499195067212, "completion_length": 74.57250061035157, "epoch": 0.5590241091153587, "grad_norm": 18.590866088867188, "kl": 0.9125221639871597, "learning_rate": 4.673441329680844e-07, "loss": 0.0044, "reward": 1.6198436498641968, "reward_std": 0.1470041409134865, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.5621092915534973, "step": 2910, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.042256729071959855, "clip_ratio/high_mean": 0.007948629459133372, "clip_ratio/low_mean": 0.001496748169302009, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009445377223892137, "completion_length": 77.4625015258789, "epoch": 0.5609451541638651, "grad_norm": 0.18645010888576508, "kl": 0.4780749522149563, "learning_rate": 4.6467527884284365e-07, "loss": 0.0006, "reward": 1.8204985857009888, "reward_std": 0.19856311585754155, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.6649368166923523, "step": 2920, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.1271044396329671, "clip_ratio/high_mean": 0.016186495171859862, "clip_ratio/low_mean": 0.0011034044640837238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01728989938274026, "completion_length": 82.72500305175781, "epoch": 0.5628661992123716, "grad_norm": 6.4396162033081055, "kl": 0.30610462203621863, "learning_rate": 4.6200953427607927e-07, "loss": -0.0021, "reward": 1.7915108680725098, "reward_std": 0.22729050666093825, "rewards/code_format_reward": 0.9700000047683716, "rewards/code_reward": 0.6532554149627685, "step": 2930, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.23429102210793645, "clip_ratio/high_mean": 0.03006269016477745, "clip_ratio/low_mean": 0.001874277341994457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0319369669421576, "completion_length": 88.97500152587891, "epoch": 0.5647872442608779, "grad_norm": 43.83531951904297, "kl": 0.5952823750674725, "learning_rate": 4.5934699641769747e-07, "loss": -0.0032, "reward": 1.837431001663208, "reward_std": 0.3392215400934219, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.6730904817581177, "step": 2940, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.06733583421446383, "clip_ratio/high_mean": 0.00863017894444056, "clip_ratio/low_mean": 0.005618994176620618, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014249173022108153, "completion_length": 77.96000061035156, "epoch": 0.5667082893093843, "grad_norm": 2.642043352127075, "kl": 0.56968834400177, "learning_rate": 4.566877623007389e-07, "loss": 0.0049, "reward": 1.7328413248062133, "reward_std": 0.21620932817459107, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.6229831516742707, "step": 2950, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.04462944087572396, "clip_ratio/high_mean": 0.007475414098007604, "clip_ratio/low_mean": 0.002004683316772571, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009480097430059686, "completion_length": 85.0875015258789, "epoch": 0.5686293343578906, "grad_norm": 3.8512065410614014, "kl": 0.33709155321121215, "learning_rate": 4.540319288378439e-07, "loss": -0.0057, "reward": 1.6900140762329101, "reward_std": 0.21961634010076522, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.5978195071220398, "step": 2960, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.06556220971979201, "clip_ratio/high_mean": 0.01001431758631952, "clip_ratio/low_mean": 0.003507485325098969, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01352180291141849, "completion_length": 92.67500152587891, "epoch": 0.5705503794063971, "grad_norm": 2.966658592224121, "kl": 0.5968067184090614, "learning_rate": 4.513795928177193e-07, "loss": 0.0007, "reward": 1.4343469619750977, "reward_std": 0.16000542044639587, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.4681109845638275, "step": 2970, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.006220952537842095, "clip_ratio/high_mean": 0.0009992636245442555, "clip_ratio/low_mean": 0.0029718225210672244, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0039710860582999885, "completion_length": 92.65750274658203, "epoch": 0.5724714244549035, "grad_norm": 9.493338584899902, "kl": 0.5875692501664161, "learning_rate": 4.4873085090161266e-07, "loss": -0.0009, "reward": 1.4061829090118407, "reward_std": 0.20027331858873368, "rewards/code_format_reward": 0.9762499928474426, "rewards/code_reward": 0.45902894139289857, "step": 2980, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.0330589919583872, "clip_ratio/high_mean": 0.004416179939289578, "clip_ratio/low_mean": 0.002115111546299886, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006531291425926611, "completion_length": 79.80750274658203, "epoch": 0.5743924695034098, "grad_norm": 1.592423915863037, "kl": 0.6846940219402313, "learning_rate": 4.460857996197879e-07, "loss": -0.0088, "reward": 1.8656628370285033, "reward_std": 0.24907293021678925, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6850189208984375, "step": 2990, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.10685318629257382, "clip_ratio/high_mean": 0.014238242123974487, "clip_ratio/low_mean": 0.0005060926268924959, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014744334877468646, "completion_length": 75.28500213623047, "epoch": 0.5763135145519163, "grad_norm": 11.023285865783691, "kl": 1.773244822025299, "learning_rate": 4.434445353680084e-07, "loss": -0.0004, "reward": 1.6719849348068236, "reward_std": 0.23447352051734924, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5888049364089966, "step": 3000, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.09909297195263207, "clip_ratio/high_mean": 0.014624686987372116, "clip_ratio/low_mean": 0.0008105992455966771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015435286390129477, "completion_length": 80.88750228881835, "epoch": 0.5782345596004226, "grad_norm": 3.5932817459106445, "kl": 1.2866470351815225, "learning_rate": 4.4080715440402417e-07, "loss": 0.0028, "reward": 1.7477641582489014, "reward_std": 0.27256832718849183, "rewards/code_format_reward": 0.9800000071525574, "rewards/code_reward": 0.628882086277008, "step": 3010, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.061281491769477725, "clip_ratio/high_mean": 0.008347922342363746, "clip_ratio/low_mean": 0.00354889674927108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011896818911191076, "completion_length": 75.85250091552734, "epoch": 0.580155604648929, "grad_norm": 4.849332809448242, "kl": 0.476963010430336, "learning_rate": 4.381737528440624e-07, "loss": -0.0002, "reward": 1.5080678462982178, "reward_std": 0.1984383262693882, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.5099714159965515, "step": 3020, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.02014698493294418, "clip_ratio/high_mean": 0.0029024946445133535, "clip_ratio/low_mean": 0.001273224765463965, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0041757194034289565, "completion_length": 86.35750122070313, "epoch": 0.5820766496974354, "grad_norm": 5.408311367034912, "kl": 1.1033611692488194, "learning_rate": 4.3554442665932664e-07, "loss": -0.0044, "reward": 1.7480007410049438, "reward_std": 0.20548871904611588, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.6321253478527069, "step": 3030, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.07207567039877176, "clip_ratio/high_mean": 0.010315603285562247, "clip_ratio/low_mean": 0.0024311804067110644, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012746783741749822, "completion_length": 87.45250091552734, "epoch": 0.5839976947459418, "grad_norm": 5.45907735824585, "kl": 0.7388446770608426, "learning_rate": 4.329192716724974e-07, "loss": -0.0134, "reward": 1.617799663543701, "reward_std": 0.28184359073638915, "rewards/code_format_reward": 0.9900000095367432, "rewards/code_reward": 0.5613998055458069, "step": 3040, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.03178482772782445, "clip_ratio/high_mean": 0.00484326797304675, "clip_ratio/low_mean": 0.0010359384352341295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005879206501413136, "completion_length": 83.70250091552734, "epoch": 0.5859187397944482, "grad_norm": 6.244964122772217, "kl": 0.8223805136978626, "learning_rate": 4.3029838355424165e-07, "loss": -0.0028, "reward": 1.5551699638366698, "reward_std": 0.23868169337511064, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5300849676132202, "step": 3050, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.031123768421821296, "clip_ratio/high_mean": 0.0042093763331649825, "clip_ratio/low_mean": 0.00023920949752209708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0044485858496045695, "completion_length": 90.6500015258789, "epoch": 0.5878397848429545, "grad_norm": 1.844166874885559, "kl": 0.9453303083777428, "learning_rate": 4.2768185781972433e-07, "loss": 0.0038, "reward": 1.7277095794677735, "reward_std": 0.22161270976066588, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.6176047682762146, "step": 3060, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.06112067271023989, "clip_ratio/high_mean": 0.008206171146593989, "clip_ratio/low_mean": 0.0006491162814199925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008855287660844624, "completion_length": 81.27750091552734, "epoch": 0.589760829891461, "grad_norm": 3.0321500301361084, "kl": 0.4705409877002239, "learning_rate": 4.2506978982512964e-07, "loss": -0.0002, "reward": 1.9011548519134522, "reward_std": 0.2363950289785862, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.7037024021148681, "step": 3070, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.028480308945290744, "clip_ratio/high_mean": 0.00514335140469484, "clip_ratio/low_mean": 0.0035089968900138047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008652348211035133, "completion_length": 88.05750274658203, "epoch": 0.5916818749399674, "grad_norm": 4.498425483703613, "kl": 0.9383749194443226, "learning_rate": 4.224622747641835e-07, "loss": -0.0068, "reward": 1.2419449806213378, "reward_std": 0.1959183931350708, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.37597247362136843, "step": 3080, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.02165755571331829, "clip_ratio/high_mean": 0.003493850605445914, "clip_ratio/low_mean": 0.0001163623295724392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003610212981584482, "completion_length": 83.10500030517578, "epoch": 0.5936029199884737, "grad_norm": 1.0221151113510132, "kl": 1.614695566892624, "learning_rate": 4.1985940766468663e-07, "loss": 0.1048, "reward": 1.8437815666198731, "reward_std": 0.12033854126930237, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6731407642364502, "step": 3090, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.05757335813250393, "clip_ratio/high_mean": 0.0107182093168376, "clip_ratio/low_mean": 0.004042259410198312, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014760468708118424, "completion_length": 86.6625, "epoch": 0.5955239650369801, "grad_norm": 3.0221967697143555, "kl": 0.4662696644663811, "learning_rate": 4.1726128338504997e-07, "loss": 0.0059, "reward": 1.6797678232192994, "reward_std": 0.23598156571388246, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.5930089056491852, "step": 3100, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.1632944119395688, "clip_ratio/high_mean": 0.02386658971372526, "clip_ratio/low_mean": 0.000367270597780589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02423386004229542, "completion_length": 87.04000244140624, "epoch": 0.5974450100854864, "grad_norm": 3124.911865234375, "kl": 1.4825018651783466, "learning_rate": 4.146679966108374e-07, "loss": 0.109, "reward": 1.7368038177490235, "reward_std": 0.2290027320384979, "rewards/code_format_reward": 0.9912499785423279, "rewards/code_reward": 0.620589405298233, "step": 3110, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.011806602030992508, "clip_ratio/high_mean": 0.00222149578621611, "clip_ratio/low_mean": 0.001782867594738491, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004004363450803794, "completion_length": 76.50500183105468, "epoch": 0.5993660551339929, "grad_norm": 5.609122276306152, "kl": 1.2381610602140427, "learning_rate": 4.120796418513165e-07, "loss": 0.0687, "reward": 1.6538613319396973, "reward_std": 0.2478315144777298, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.5813056170940399, "step": 3120, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.04111457797698677, "clip_ratio/high_mean": 0.006102612579707056, "clip_ratio/low_mean": 0.0006678692123387008, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006770481838611886, "completion_length": 90.63500213623047, "epoch": 0.6012871001824993, "grad_norm": 1.7537983655929565, "kl": 0.8379382207989693, "learning_rate": 4.094963134360129e-07, "loss": 3.0713, "reward": 1.8111864566802978, "reward_std": 0.23444892466068268, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6599682211875916, "step": 3130, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.07487462717108428, "clip_ratio/high_mean": 0.009757341054501012, "clip_ratio/low_mean": 0.002470593445468694, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012227934325346723, "completion_length": 84.99250183105468, "epoch": 0.6032081452310056, "grad_norm": 7.498387336730957, "kl": 0.5894037500023842, "learning_rate": 4.0691810551127327e-07, "loss": 0.0462, "reward": 1.6221882104873657, "reward_std": 0.25462802946567537, "rewards/code_format_reward": 0.9975000023841858, "rewards/code_reward": 0.5617190957069397, "step": 3140, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.015196678857319058, "clip_ratio/high_mean": 0.0022096226894063875, "clip_ratio/low_mean": 0.002686911600176245, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004896534324507229, "completion_length": 88.44750213623047, "epoch": 0.6051291902795121, "grad_norm": 0.7371006011962891, "kl": 1.5165767412632705, "learning_rate": 4.0434511203683386e-07, "loss": 0.0113, "reward": 1.958918571472168, "reward_std": 0.17050198167562486, "rewards/code_format_reward": 0.9962499856948852, "rewards/code_reward": 0.7303967714309693, "step": 3150, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.047509356867522, "clip_ratio/high_mean": 0.0060635729460045695, "clip_ratio/low_mean": 0.0037405278504593297, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009804100578185171, "completion_length": 93.71500091552734, "epoch": 0.6070502353280185, "grad_norm": 4.062532424926758, "kl": 164.7577206812799, "learning_rate": 4.017774267823967e-07, "loss": 0.3479, "reward": 1.8433427095413208, "reward_std": 0.20897280275821686, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6760463416576385, "step": 3160, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.007103513111360371, "clip_ratio/high_mean": 0.0009442454349482432, "clip_ratio/low_mean": 0.0005656339257257059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015098793461220338, "completion_length": 97.03500061035156, "epoch": 0.6089712803765248, "grad_norm": 0.3194718658924103, "kl": 19.38877977654338, "learning_rate": 3.9921514332421193e-07, "loss": 0.1279, "reward": 1.3801440358161927, "reward_std": 0.26880781557410954, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.44757200181484225, "step": 3170, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.027826181857381015, "clip_ratio/high_mean": 0.004423137854610104, "clip_ratio/low_mean": 0.000519216748944018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004942354625381995, "completion_length": 99.6375015258789, "epoch": 0.6108923254250312, "grad_norm": 133.27151489257812, "kl": 91.96420569866896, "learning_rate": 3.966583550416676e-07, "loss": 284.3821, "reward": 1.6065278768539428, "reward_std": 0.2671674907207489, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.5598264217376709, "step": 3180, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.03810381339862943, "clip_ratio/high_mean": 0.005511091940570622, "clip_ratio/low_mean": 0.00701818183879368, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01252927360474132, "completion_length": 90.77000122070312, "epoch": 0.6128133704735376, "grad_norm": 2.931155204772949, "kl": 4.587994083762169, "learning_rate": 3.9410715511388647e-07, "loss": 28143.1688, "reward": 1.7186223268508911, "reward_std": 0.2031429558992386, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.6139986515045166, "step": 3190, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.18900979291647674, "clip_ratio/high_mean": 0.025313075329177082, "clip_ratio/low_mean": 0.00013794690457871183, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02545102240983397, "completion_length": 88.60750122070313, "epoch": 0.614734415522044, "grad_norm": 3.9914708137512207, "kl": 0.678571529686451, "learning_rate": 3.915616365163304e-07, "loss": 0.0002, "reward": 1.818918228149414, "reward_std": 0.24608779847621917, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.6653966069221496, "step": 3200, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.05510081194806844, "clip_ratio/high_mean": 0.008429678474203683, "clip_ratio/low_mean": 0.0015389235399197788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0099686019733781, "completion_length": 85.05250091552735, "epoch": 0.6166554605705504, "grad_norm": 2.0297534465789795, "kl": 0.5190044179558754, "learning_rate": 3.890218920174122e-07, "loss": -0.0056, "reward": 1.938026785850525, "reward_std": 0.2829041987657547, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.7218258857727051, "step": 3210, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.04617450258228928, "clip_ratio/high_mean": 0.007303895291988738, "clip_ratio/low_mean": 0.002542783234093804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009846678806934506, "completion_length": 92.52000122070312, "epoch": 0.6185765056190567, "grad_norm": 3.2283730506896973, "kl": 0.5362374372780323, "learning_rate": 3.86488014175114e-07, "loss": 0.0003, "reward": 1.7741312742233277, "reward_std": 0.20447308868169783, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6395656108856201, "step": 3220, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.059750709845684466, "clip_ratio/high_mean": 0.00790787541482132, "clip_ratio/low_mean": 0.0012954409321537241, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009203316466300748, "completion_length": 90.4375, "epoch": 0.6204975506675632, "grad_norm": 2.409045934677124, "kl": 0.553566773980856, "learning_rate": 3.8396009533361486e-07, "loss": -0.0, "reward": 1.6513851642608643, "reward_std": 0.24081393480300903, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.580692571401596, "step": 3230, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.03564520282670856, "clip_ratio/high_mean": 0.004964679945260286, "clip_ratio/low_mean": 0.004444090686592972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009408770385198295, "completion_length": 79.08000183105469, "epoch": 0.6224185957160695, "grad_norm": 7.759763717651367, "kl": 1.2998816877603532, "learning_rate": 3.814382276199251e-07, "loss": -0.0006, "reward": 1.6336610555648803, "reward_std": 0.1691926121711731, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.5680804908275604, "step": 3240, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.011579358880408109, "clip_ratio/high_mean": 0.002202258622855879, "clip_ratio/low_mean": 0.0003946456956327893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025969043519580735, "completion_length": 88.7375, "epoch": 0.6243396407645759, "grad_norm": 9.489768981933594, "kl": 4.286054483056068, "learning_rate": 3.7892250294052853e-07, "loss": 31.2761, "reward": 1.8622464895248414, "reward_std": 0.2547990471124649, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.6858106970787048, "step": 3250, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.07659143296186813, "clip_ratio/high_mean": 0.010122461079299682, "clip_ratio/low_mean": 0.0019954566974774933, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012117917880095775, "completion_length": 99.80750274658203, "epoch": 0.6262606858130824, "grad_norm": 2.884183168411255, "kl": 1.2840011775493623, "learning_rate": 3.764130129780341e-07, "loss": 0.0383, "reward": 1.6670962572097778, "reward_std": 0.34920003414154055, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5907356142997742, "step": 3260, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.029844516195589678, "clip_ratio/high_mean": 0.004244843772175955, "clip_ratio/low_mean": 0.0002169124811189249, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004461756230011815, "completion_length": 100.70250091552734, "epoch": 0.6281817308615887, "grad_norm": 4.036985397338867, "kl": 2.1118960954248904, "learning_rate": 3.7390984918783286e-07, "loss": 0.9419, "reward": 1.6084105730056764, "reward_std": 0.17128639966249465, "rewards/code_format_reward": 0.9712500095367431, "rewards/code_reward": 0.5613927602767944, "step": 3270, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.07067356873303651, "clip_ratio/high_mean": 0.00971948360092938, "clip_ratio/low_mean": 0.0006290240438829642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010348507657181472, "completion_length": 88.9000015258789, "epoch": 0.6301027759100951, "grad_norm": 1.543152928352356, "kl": 0.5742107287049294, "learning_rate": 3.714131027947669e-07, "loss": 0.0006, "reward": 1.808586883544922, "reward_std": 0.20984979271888732, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6564809083938599, "step": 3280, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.060896387742832306, "clip_ratio/high_mean": 0.00765781793743372, "clip_ratio/low_mean": 0.01029690281720832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017954721208661796, "completion_length": 80.08500213623047, "epoch": 0.6320238209586014, "grad_norm": 2.127617359161377, "kl": 0.6725200928747654, "learning_rate": 3.689228647898034e-07, "loss": 0.1143, "reward": 1.678031039237976, "reward_std": 0.19750893712043763, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.5921404898166657, "step": 3290, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.05414980174973607, "clip_ratio/high_mean": 0.007520435960032046, "clip_ratio/low_mean": 0.00011696565634338185, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007637401651300025, "completion_length": 92.725, "epoch": 0.6339448660071079, "grad_norm": 8.315914154052734, "kl": 0.30459046363830566, "learning_rate": 3.6643922592671904e-07, "loss": -0.0066, "reward": 1.5898099780082702, "reward_std": 0.1832955375313759, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.5464674949645996, "step": 3300, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.02745349882170558, "clip_ratio/high_mean": 0.004275670822244138, "clip_ratio/low_mean": 0.001036624335392844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005312295141629874, "completion_length": 86.80250091552735, "epoch": 0.6358659110556143, "grad_norm": 4.2797441482543945, "kl": 2.398578557372093, "learning_rate": 3.6396227671879267e-07, "loss": 0.028, "reward": 1.7730424404144287, "reward_std": 0.3175764262676239, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.6387087047100067, "step": 3310, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.03558065614197403, "clip_ratio/high_mean": 0.004970578508800827, "clip_ratio/low_mean": 0.0008951228694058955, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0058657014247728515, "completion_length": 91.0875015258789, "epoch": 0.6377869561041206, "grad_norm": 5.376333713531494, "kl": 1.4305558323860168, "learning_rate": 3.614921074355067e-07, "loss": 0.0034, "reward": 1.7029305696487427, "reward_std": 0.34837333858013153, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.6052152514457703, "step": 3320, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.04346057323855348, "clip_ratio/high_mean": 0.005737839776702458, "clip_ratio/low_mean": 0.001675139949657023, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00741297956337803, "completion_length": 88.75250396728515, "epoch": 0.639708001152627, "grad_norm": 2.969228744506836, "kl": 0.7607076019048691, "learning_rate": 3.5902880809925704e-07, "loss": -0.0001, "reward": 1.6762405157089233, "reward_std": 0.2515918217599392, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.5909327387809753, "step": 3330, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.05080293011851609, "clip_ratio/high_mean": 0.006427765643456951, "clip_ratio/low_mean": 0.00040708604792598634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006834851735038683, "completion_length": 88.0750015258789, "epoch": 0.6416290462011334, "grad_norm": 12.137472152709961, "kl": 0.31881698705255984, "learning_rate": 3.565724684820727e-07, "loss": 3.6118, "reward": 1.8916306495666504, "reward_std": 0.1850387692451477, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.6973778128623962, "step": 3340, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.14287711144424975, "clip_ratio/high_mean": 0.019231261435197666, "clip_ratio/low_mean": 0.0020263168029487134, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021257578127551822, "completion_length": 94.19000091552735, "epoch": 0.6435500912496398, "grad_norm": 6.10810661315918, "kl": 0.8296034529805183, "learning_rate": 3.541231781023436e-07, "loss": -0.0004, "reward": 1.6248144626617431, "reward_std": 0.2219874605536461, "rewards/code_format_reward": 0.9887499809265137, "rewards/code_reward": 0.5652197122573852, "step": 3350, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.07329725201707334, "clip_ratio/high_mean": 0.009671362905646675, "clip_ratio/low_mean": 0.005342914546781685, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015014277724549174, "completion_length": 97.74500274658203, "epoch": 0.6454711362981462, "grad_norm": 2.801866054534912, "kl": 0.5770246163010597, "learning_rate": 3.5168102622155894e-07, "loss": 0.0, "reward": 1.6838999271392823, "reward_std": 0.2707583636045456, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.594137442111969, "step": 3360, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.005975091701839119, "clip_ratio/high_mean": 0.0011488659016322344, "clip_ratio/low_mean": 0.001098146109143272, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022470120195066555, "completion_length": 89.36750030517578, "epoch": 0.6473921813466526, "grad_norm": 34.13050079345703, "kl": 2.2693535044789312, "learning_rate": 3.492461018410535e-07, "loss": 0.0028, "reward": 1.8977232933044434, "reward_std": 0.2937870219349861, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.7022991299629211, "step": 3370, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.04565345844021067, "clip_ratio/high_mean": 0.009023689541209023, "clip_ratio/low_mean": 0.00031946374219842254, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009343153254303616, "completion_length": 84.38249969482422, "epoch": 0.649313226395159, "grad_norm": 0.9170461893081665, "kl": 108.80695619434118, "learning_rate": 3.468184936987645e-07, "loss": 920.5057, "reward": 1.7496967315673828, "reward_std": 0.2916144669055939, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.626410859823227, "step": 3380, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.04991705315187574, "clip_ratio/high_mean": 0.007881995162460954, "clip_ratio/low_mean": 0.00031314246589317916, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008195137570146472, "completion_length": 88.86000061035156, "epoch": 0.6512342714436653, "grad_norm": 3.084516763687134, "kl": 1331.8980419039726, "learning_rate": 3.4439829026599765e-07, "loss": 2.6994, "reward": 1.7110779523849486, "reward_std": 0.22298349142074586, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.6071014523506164, "step": 3390, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.03301922780228779, "clip_ratio/high_mean": 0.005802097530977335, "clip_ratio/low_mean": 0.002479181956732646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00828127931599738, "completion_length": 77.6500015258789, "epoch": 0.6531553164921717, "grad_norm": 3643.742919921875, "kl": 629.580971956253, "learning_rate": 3.4198557974420236e-07, "loss": 1.3601, "reward": 1.9027020692825318, "reward_std": 0.23692196756601333, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.7038509964942932, "step": 3400, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.026931732892990112, "clip_ratio/high_mean": 0.004060871619731188, "clip_ratio/low_mean": 0.0013453641964588313, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00540623576962389, "completion_length": 81.77250213623047, "epoch": 0.6550763615406782, "grad_norm": 3.2221176624298096, "kl": 17.398655989021062, "learning_rate": 3.3958045006175804e-07, "loss": 0.0552, "reward": 1.7479909420013429, "reward_std": 0.22741918563842772, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.6308704853057862, "step": 3410, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.06737710665911437, "clip_ratio/high_mean": 0.008767830353463069, "clip_ratio/low_mean": 0.0005067014892119915, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009274532069684937, "completion_length": 92.88500061035157, "epoch": 0.6569974065891845, "grad_norm": 4.063995838165283, "kl": 2.0011128395795823, "learning_rate": 3.3718298887077003e-07, "loss": 0.0159, "reward": 1.7235053777694702, "reward_std": 0.2168472334742546, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.616440212726593, "step": 3420, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.019622295489534737, "clip_ratio/high_mean": 0.003191170998616144, "clip_ratio/low_mean": 0.0015002752974396572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046914463368011635, "completion_length": 80.16000213623047, "epoch": 0.6589184516376909, "grad_norm": 1.253300428390503, "kl": 0.48067781031131745, "learning_rate": 3.3479328354387286e-07, "loss": 0.0008, "reward": 1.7450715541839599, "reward_std": 0.1590050458908081, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6244107484817505, "step": 3430, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.03181373123079538, "clip_ratio/high_mean": 0.0046242739539593455, "clip_ratio/low_mean": 0.012107005770667456, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016731279762461783, "completion_length": 84.32750244140625, "epoch": 0.6608394966861972, "grad_norm": 1.5854672193527222, "kl": 0.42518851198256014, "learning_rate": 3.324114211710498e-07, "loss": 0.0, "reward": 1.6541699171066284, "reward_std": 0.1113172210752964, "rewards/code_format_reward": 0.9962499856948852, "rewards/code_reward": 0.5780224561691284, "step": 3440, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.13535575959831475, "clip_ratio/high_mean": 0.018421862670220435, "clip_ratio/low_mean": 0.0012572539155371488, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019679116318002343, "completion_length": 91.63250122070312, "epoch": 0.6627605417347037, "grad_norm": 4.593750476837158, "kl": 0.7388513803482055, "learning_rate": 3.300374885564553e-07, "loss": -0.0, "reward": 1.5408308625221252, "reward_std": 0.29571940898895266, "rewards/code_format_reward": 0.9749999880790711, "rewards/code_reward": 0.5266654074192048, "step": 3450, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.04671933995559811, "clip_ratio/high_mean": 0.0062255718978121875, "clip_ratio/low_mean": 0.003391482085862663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009617053843976464, "completion_length": 78.45250091552734, "epoch": 0.6646815867832101, "grad_norm": 2.5849409103393555, "kl": 10.90581871420145, "learning_rate": 3.2767157221525437e-07, "loss": 0.0178, "reward": 1.5087457418441772, "reward_std": 0.19353876560926436, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.5074978828430176, "step": 3460, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.0318772604689002, "clip_ratio/high_mean": 0.004644899175036699, "clip_ratio/low_mean": 0.0032211030862526967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007866002165246754, "completion_length": 75.36500091552735, "epoch": 0.6666026318317164, "grad_norm": 1.8117616176605225, "kl": 187030.33910432606, "learning_rate": 3.253137583704673e-07, "loss": 374.1458, "reward": 1.6825225114822389, "reward_std": 0.2058879092335701, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5921987533569336, "step": 3470, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.09497236199676991, "clip_ratio/high_mean": 0.015650217607617378, "clip_ratio/low_mean": 0.0006928690614586231, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016343086725100875, "completion_length": 89.13250198364258, "epoch": 0.6685236768802229, "grad_norm": 5.850868225097656, "kl": 0.5080707125365734, "learning_rate": 3.229641329498296e-07, "loss": 0.0463, "reward": 1.6678599119186401, "reward_std": 0.2830047011375427, "rewards/code_format_reward": 0.9724999904632569, "rewards/code_reward": 0.5908049464225769, "step": 3480, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.050425410037860274, "clip_ratio/high_mean": 0.006465365196345374, "clip_ratio/low_mean": 0.0006157927738968283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0070811579586006704, "completion_length": 81.40749969482422, "epoch": 0.6704447219287293, "grad_norm": 10.526844024658203, "kl": 1.5019532606005668, "learning_rate": 3.2062278158265866e-07, "loss": -0.0021, "reward": 1.7323597908020019, "reward_std": 0.15349715426564217, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6202423751354218, "step": 3490, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.07051093662157655, "clip_ratio/high_mean": 0.009594869159627706, "clip_ratio/low_mean": 0.001997971232049167, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011592840391676873, "completion_length": 96.4625, "epoch": 0.6723657669772356, "grad_norm": 12.833992958068848, "kl": 0.37389371246099473, "learning_rate": 3.182897895967338e-07, "loss": 0.0008, "reward": 1.6037346363067626, "reward_std": 0.329493448138237, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.556554788351059, "step": 3500, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.017038265755400062, "clip_ratio/high_mean": 0.0027443476661574095, "clip_ratio/low_mean": 0.000347714369854657, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003092062100768089, "completion_length": 84.05, "epoch": 0.674286812025742, "grad_norm": 6.119350910186768, "kl": 0.4559432238340378, "learning_rate": 3.15965242015187e-07, "loss": 0.0298, "reward": 1.6935490131378175, "reward_std": 0.26772548258304596, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6002120196819305, "step": 3510, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.03435098186600953, "clip_ratio/high_mean": 0.005973302901838906, "clip_ratio/low_mean": 0.0006568559459992684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006630158764892258, "completion_length": 95.0, "epoch": 0.6762078570742484, "grad_norm": 4.796656608581543, "kl": 0.3851431407034397, "learning_rate": 3.1364922355340346e-07, "loss": 0.0214, "reward": 1.8059131860733033, "reward_std": 0.18592590391635894, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6554565787315368, "step": 3520, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.024073917022906243, "clip_ratio/high_mean": 0.0035194387339288367, "clip_ratio/low_mean": 0.0002464063392835669, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037658450222807006, "completion_length": 86.9800018310547, "epoch": 0.6781289021227548, "grad_norm": 7.799978256225586, "kl": 0.2617302156984806, "learning_rate": 3.113418186159349e-07, "loss": -0.0088, "reward": 1.515157699584961, "reward_std": 0.2593328535556793, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5138288617134095, "step": 3530, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.014256173744797707, "clip_ratio/high_mean": 0.002001363394083455, "clip_ratio/low_mean": 0.001286455297667999, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032878186670131982, "completion_length": 93.04500274658203, "epoch": 0.6800499471712612, "grad_norm": 1.323721170425415, "kl": 0.32287237197160723, "learning_rate": 3.090431112934235e-07, "loss": -0.0056, "reward": 1.8219903230667114, "reward_std": 0.28862411081790923, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6641201436519623, "step": 3540, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.0389601900940761, "clip_ratio/high_mean": 0.005975415915600024, "clip_ratio/low_mean": 0.0006638197373831645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006639235676266253, "completion_length": 95.84250183105469, "epoch": 0.6819709922197675, "grad_norm": 4.850042819976807, "kl": 1.8627108559012413, "learning_rate": 3.067531853595369e-07, "loss": 1.6968, "reward": 1.8796481132507323, "reward_std": 0.13976119682192803, "rewards/code_format_reward": 0.9837500095367432, "rewards/code_reward": 0.6938865780830383, "step": 3550, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.01971529610455036, "clip_ratio/high_mean": 0.002580236754147336, "clip_ratio/low_mean": 0.0005060694311396219, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030863061954732986, "completion_length": 85.99000091552735, "epoch": 0.683892037268274, "grad_norm": 499.5800476074219, "kl": 3.8079170405864717, "learning_rate": 3.0447212426791546e-07, "loss": 0.0153, "reward": 1.73906729221344, "reward_std": 0.21255102157592773, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.622658634185791, "step": 3560, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.03546578506939113, "clip_ratio/high_mean": 0.005373837990919128, "clip_ratio/low_mean": 0.0011442803021054714, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0065181183628737925, "completion_length": 93.75249938964843, "epoch": 0.6858130823167803, "grad_norm": 3.144973039627075, "kl": 0.7828342400491237, "learning_rate": 3.022000111491309e-07, "loss": 0.0001, "reward": 1.8471190690994264, "reward_std": 0.27725095450878146, "rewards/code_format_reward": 0.9487499952316284, "rewards/code_reward": 0.686372023820877, "step": 3570, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.01367393396794796, "clip_ratio/high_mean": 0.001831050164764747, "clip_ratio/low_mean": 0.0008013980732357595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002632448251824826, "completion_length": 96.29500122070313, "epoch": 0.6877341273652867, "grad_norm": 3.9027657508850098, "kl": 0.8669951900839805, "learning_rate": 2.99936928807657e-07, "loss": -0.0007, "reward": 1.6410433769226074, "reward_std": 0.25681858956813813, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.5736466705799103, "step": 3580, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.028569919406436384, "clip_ratio/high_mean": 0.0037314103537937626, "clip_ratio/low_mean": 0.0012738955876557157, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005005306130624376, "completion_length": 84.3875015258789, "epoch": 0.6896551724137931, "grad_norm": 1.8412340879440308, "kl": 0.6606554225087166, "learning_rate": 2.976829597188506e-07, "loss": -0.0007, "reward": 1.6131571292877198, "reward_std": 0.15807003602385522, "rewards/code_format_reward": 0.9950000047683716, "rewards/code_reward": 0.5578285574913024, "step": 3590, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.045477401558309795, "clip_ratio/high_mean": 0.007051247591152787, "clip_ratio/low_mean": 0.00021358822996262461, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007264835678506642, "completion_length": 92.69250030517578, "epoch": 0.6915762174622995, "grad_norm": 4.787570953369141, "kl": 0.2786871612071991, "learning_rate": 2.9543818602594826e-07, "loss": 0.0001, "reward": 1.6197675943374634, "reward_std": 0.2863120764493942, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.5651962697505951, "step": 3600, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.047238136362284425, "clip_ratio/high_mean": 0.006483453582040966, "clip_ratio/low_mean": 0.0015064548759255558, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007989908382296563, "completion_length": 83.11750183105468, "epoch": 0.6934972625108059, "grad_norm": 1.5795401334762573, "kl": 0.512858135998249, "learning_rate": 2.932026895370697e-07, "loss": 0.0021, "reward": 1.6763751983642579, "reward_std": 0.12559455148875714, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.5903751432895661, "step": 3610, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.031613614642992616, "clip_ratio/high_mean": 0.00453935784753412, "clip_ratio/low_mean": 0.0026672417909139766, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007206599647179246, "completion_length": 89.80250244140625, "epoch": 0.6954183075593122, "grad_norm": 0.9828081130981445, "kl": 2.053369848430157, "learning_rate": 2.909765517222392e-07, "loss": -0.0015, "reward": 1.6560463190078736, "reward_std": 0.2526627391576767, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5820856630802155, "step": 3620, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.00962083850754425, "clip_ratio/high_mean": 0.0013845860186847859, "clip_ratio/low_mean": 0.00100141861839802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023860046290792524, "completion_length": 93.0125015258789, "epoch": 0.6973393526078187, "grad_norm": 1.4326051473617554, "kl": 0.7425350762903691, "learning_rate": 2.887598537104141e-07, "loss": 0.017, "reward": 1.608488416671753, "reward_std": 0.18181688338518143, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.557056725025177, "step": 3630, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.10694800971541554, "clip_ratio/high_mean": 0.016866487907827833, "clip_ratio/low_mean": 0.000146488708560355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017012976511614396, "completion_length": 86.96750183105469, "epoch": 0.6992603976563251, "grad_norm": 5.3714776039123535, "kl": 0.5909165881574154, "learning_rate": 2.8655267628653044e-07, "loss": 0.0005, "reward": 1.6461472749710082, "reward_std": 0.22788509875535964, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.5765111327171326, "step": 3640, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.015923467138782142, "clip_ratio/high_mean": 0.0022047571546863765, "clip_ratio/low_mean": 0.0014544774603564292, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036592346790712328, "completion_length": 91.53250122070312, "epoch": 0.7011814427048314, "grad_norm": 7.581000328063965, "kl": 3.2652564592659474, "learning_rate": 2.8435509988855683e-07, "loss": -0.0019, "reward": 1.6843700885772706, "reward_std": 0.20299706608057022, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.5937475442886353, "step": 3650, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.04569347179494798, "clip_ratio/high_mean": 0.00580012007849291, "clip_ratio/low_mean": 0.003195645064988639, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008995765156578272, "completion_length": 82.49500122070313, "epoch": 0.7031024877533378, "grad_norm": 10.031012535095215, "kl": 0.3446802504360676, "learning_rate": 2.821672046045642e-07, "loss": -0.003, "reward": 1.9148546934127808, "reward_std": 0.15906044691801072, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.7089898109436035, "step": 3660, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.01652005296200514, "clip_ratio/high_mean": 0.0032194001134485005, "clip_ratio/low_mean": 0.0004348491333075799, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003654249230748974, "completion_length": 87.48000030517578, "epoch": 0.7050235328018442, "grad_norm": 4.5817551612854, "kl": 0.5381794683635235, "learning_rate": 2.799890701698068e-07, "loss": -0.0018, "reward": 1.4432553768157959, "reward_std": 0.19258553311228752, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.4744401514530182, "step": 3670, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.03230769606307149, "clip_ratio/high_mean": 0.004254726751241833, "clip_ratio/low_mean": 0.0003341716161230579, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0045888983644545075, "completion_length": 91.96000366210937, "epoch": 0.7069445778503506, "grad_norm": 3.1825077533721924, "kl": 0.5493438571691514, "learning_rate": 2.7782077596381596e-07, "loss": 0.0032, "reward": 1.8943065643310546, "reward_std": 0.22485891729593277, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6990282416343689, "step": 3680, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.017006529681384563, "clip_ratio/high_mean": 0.0026059710187837483, "clip_ratio/low_mean": 0.00022266755404416473, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028286385582759976, "completion_length": 92.6875015258789, "epoch": 0.708865622898857, "grad_norm": 3.126534938812256, "kl": 2.302929486706853, "learning_rate": 2.7566240100750794e-07, "loss": 0.0024, "reward": 1.6279277324676513, "reward_std": 0.3058730036020279, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.568026351928711, "step": 3690, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.020053896540775894, "clip_ratio/high_mean": 0.0029980215302202852, "clip_ratio/low_mean": 0.0004860887274844572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003484110155841336, "completion_length": 97.92250061035156, "epoch": 0.7107866679473633, "grad_norm": 4.224461555480957, "kl": 4.42233342602849, "learning_rate": 2.735140239603034e-07, "loss": -0.0003, "reward": 1.960454559326172, "reward_std": 0.24239360094070433, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.7349147796630859, "step": 3700, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.02752018291503191, "clip_ratio/high_mean": 0.005125764373224229, "clip_ratio/low_mean": 0.00023403638042509555, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005359800613950938, "completion_length": 101.37250061035157, "epoch": 0.7127077129958698, "grad_norm": 4.285885334014893, "kl": 0.952894814312458, "learning_rate": 2.713757231172611e-07, "loss": -0.0013, "reward": 1.6773537874221802, "reward_std": 0.2778655707836151, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5927394092082977, "step": 3710, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.021348989009857176, "clip_ratio/high_mean": 0.0030060237273573875, "clip_ratio/low_mean": 0.0013181588088627904, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004324182611890137, "completion_length": 95.14250183105469, "epoch": 0.7146287580443761, "grad_norm": 2.7202091217041016, "kl": 2.8931914918124675, "learning_rate": 2.692475764062245e-07, "loss": -0.0021, "reward": 1.8867613315582275, "reward_std": 0.18746355026960373, "rewards/code_format_reward": 0.9987499952316284, "rewards/code_reward": 0.6936931371688843, "step": 3720, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.007143327506491914, "clip_ratio/high_mean": 0.0009208801442582626, "clip_ratio/low_mean": 0.00037053466949146243, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001291414822480874, "completion_length": 94.1875015258789, "epoch": 0.7165498030928825, "grad_norm": 2.7853496074676514, "kl": 0.6755535811185837, "learning_rate": 2.6712966138498174e-07, "loss": -0.003, "reward": 1.723927640914917, "reward_std": 0.2750594407320023, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.6163387894630432, "step": 3730, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.019027433777227997, "clip_ratio/high_mean": 0.002618219889700413, "clip_ratio/low_mean": 0.0018478537182090803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004466073628282175, "completion_length": 102.04000091552734, "epoch": 0.718470848141389, "grad_norm": 5.998534202575684, "kl": 0.9062080264091492, "learning_rate": 2.650220552384391e-07, "loss": 0.0289, "reward": 1.8737354516983031, "reward_std": 0.34540517926216124, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6912427186965943, "step": 3740, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.056439303827937694, "clip_ratio/high_mean": 0.007310985976073425, "clip_ratio/low_mean": 0.0005420514833531342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007853037484164816, "completion_length": 92.48250122070313, "epoch": 0.7203918931898953, "grad_norm": 5.3343424797058105, "kl": 0.3819971337914467, "learning_rate": 2.6292483477580816e-07, "loss": -0.011, "reward": 1.672910475730896, "reward_std": 0.2516419067978859, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5920802116394043, "step": 3750, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.13834233868401496, "clip_ratio/high_mean": 0.018591971611022017, "clip_ratio/low_mean": 0.0006771487518562935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019269120390526952, "completion_length": 99.33000030517579, "epoch": 0.7223129382384017, "grad_norm": 1.4892189502716064, "kl": 0.9441468060016632, "learning_rate": 2.6083807642780644e-07, "loss": -0.0084, "reward": 1.5579908847808839, "reward_std": 0.272139647603035, "rewards/code_format_reward": 0.9787500023841857, "rewards/code_reward": 0.5343079507350922, "step": 3760, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.023900310718454422, "clip_ratio/high_mean": 0.005545906673069112, "clip_ratio/low_mean": 0.0007872088695876301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006333115603774786, "completion_length": 90.84000396728516, "epoch": 0.724233983286908, "grad_norm": 12.181316375732422, "kl": 8.179486125707626, "learning_rate": 2.5876185624387225e-07, "loss": 0.0398, "reward": 1.743166995048523, "reward_std": 0.3216101437807083, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6259585380554199, "step": 3770, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.00846199265215546, "clip_ratio/high_mean": 0.0012625553936231881, "clip_ratio/low_mean": 0.00030621195983258074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001568767352728173, "completion_length": 118.35750122070313, "epoch": 0.7261550283354145, "grad_norm": 1.6517783403396606, "kl": 0.968211068212986, "learning_rate": 2.5669624988939287e-07, "loss": 0.1551, "reward": 1.7871047019958497, "reward_std": 0.21420088410377502, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.644802349805832, "step": 3780, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.02817701958119869, "clip_ratio/high_mean": 0.0037564294645562766, "clip_ratio/low_mean": 0.011859719056519679, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015616148672415875, "completion_length": 93.14750213623047, "epoch": 0.7280760733839209, "grad_norm": 11.322369575500488, "kl": 0.45075275003910065, "learning_rate": 2.5464133264294705e-07, "loss": -0.0008, "reward": 1.662767267227173, "reward_std": 0.24967537969350814, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.5848211228847504, "step": 3790, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.05006286900024861, "clip_ratio/high_mean": 0.007249254969065077, "clip_ratio/low_mean": 0.00040258544613607227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007651840391918086, "completion_length": 110.32750396728515, "epoch": 0.7299971184324272, "grad_norm": 16.862590789794922, "kl": 0.3901309326291084, "learning_rate": 2.5259717939356175e-07, "loss": -0.0019, "reward": 1.7777814149856568, "reward_std": 0.25982470586895945, "rewards/code_format_reward": 0.987500011920929, "rewards/code_reward": 0.6420157194137573, "step": 3800, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.007159786019474268, "clip_ratio/high_mean": 0.0011859470629133283, "clip_ratio/low_mean": 0.0021440873795654626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033300343551672996, "completion_length": 96.07000122070312, "epoch": 0.7319181634809336, "grad_norm": 2.4953460693359375, "kl": 0.3146058402955532, "learning_rate": 2.505638646379831e-07, "loss": -0.0042, "reward": 1.7296765804290772, "reward_std": 0.3011175274848938, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6189007639884949, "step": 3810, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.03548359724227339, "clip_ratio/high_mean": 0.004679994014441036, "clip_ratio/low_mean": 0.00017329893162241206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004853292935877107, "completion_length": 101.32000122070312, "epoch": 0.7338392085294401, "grad_norm": 3.954063892364502, "kl": 0.34448319524526594, "learning_rate": 2.485414624779603e-07, "loss": -0.0051, "reward": 1.690654444694519, "reward_std": 0.24299487322568894, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.6000146985054016, "step": 3820, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.00636189088691026, "clip_ratio/high_mean": 0.0008543322241166606, "clip_ratio/low_mean": 0.00028777473780792204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011421069371863267, "completion_length": 94.74000244140625, "epoch": 0.7357602535779464, "grad_norm": 1.0420587062835693, "kl": 0.28902386128902435, "learning_rate": 2.4653004661754703e-07, "loss": 0.0021, "reward": 1.929768443107605, "reward_std": 0.19695264101028442, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.7173841595649719, "step": 3830, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.0385974693344906, "clip_ratio/high_mean": 0.0054892279236810285, "clip_ratio/low_mean": 0.0004371934803202748, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005926421421463601, "completion_length": 100.23250122070313, "epoch": 0.7376812986264528, "grad_norm": 6.22709846496582, "kl": 0.39053357392549515, "learning_rate": 2.445296903604131e-07, "loss": -0.0123, "reward": 1.7683161497116089, "reward_std": 0.4236398935317993, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.6413455486297608, "step": 3840, "zero_std_ratio": 0.3 }, { "clip_ratio/high_max": 0.013776408764533699, "clip_ratio/high_mean": 0.0019065461441641674, "clip_ratio/low_mean": 0.0035487653221935034, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005455311315017752, "completion_length": 91.36000213623046, "epoch": 0.7396023436749591, "grad_norm": 3.84639573097229, "kl": 9.267435324192046, "learning_rate": 2.4254046660717555e-07, "loss": 0.0107, "reward": 1.7194789409637452, "reward_std": 0.23012096285820008, "rewards/code_format_reward": 0.98125, "rewards/code_reward": 0.6144269347190857, "step": 3850, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.019276025268482044, "clip_ratio/high_mean": 0.0034578723403683397, "clip_ratio/low_mean": 0.0028569042566232382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006314776389626786, "completion_length": 96.3250015258789, "epoch": 0.7415233887234656, "grad_norm": 4.765519142150879, "kl": 0.5375766545534134, "learning_rate": 2.4056244785273895e-07, "loss": -0.0038, "reward": 1.713827419281006, "reward_std": 0.28884910941123965, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6112887144088746, "step": 3860, "zero_std_ratio": 0.35 }, { "clip_ratio/high_max": 0.06692883024225012, "clip_ratio/high_mean": 0.008779459849756676, "clip_ratio/low_mean": 0.0002708235711907037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009050283460237552, "completion_length": 103.41250152587891, "epoch": 0.743444433771972, "grad_norm": 2.68007493019104, "kl": 0.34222877621650694, "learning_rate": 2.3859570618365614e-07, "loss": -0.0009, "reward": 1.74418466091156, "reward_std": 0.20953620076179505, "rewards/code_format_reward": 0.9912499785423279, "rewards/code_reward": 0.6242798089981079, "step": 3870, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.01482260066550225, "clip_ratio/high_mean": 0.0023997865355340764, "clip_ratio/low_mean": 0.00038790585967944934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002787692387937568, "completion_length": 98.42250061035156, "epoch": 0.7453654788204783, "grad_norm": 4.816893100738525, "kl": 0.4661983668804169, "learning_rate": 2.366403132754995e-07, "loss": -0.0019, "reward": 1.6338875532150268, "reward_std": 0.21452725008130075, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5697562634944916, "step": 3880, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.02494101980701089, "clip_ratio/high_mean": 0.003492716047912836, "clip_ratio/low_mean": 0.00024301124794874341, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037357273045927285, "completion_length": 97.44000091552735, "epoch": 0.7472865238689848, "grad_norm": 82.46282958984375, "kl": 0.5981974095106125, "learning_rate": 2.3469634039024927e-07, "loss": 0.0024, "reward": 1.8161945581436156, "reward_std": 0.17759706005454062, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6621597528457641, "step": 3890, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.0019452353473752737, "clip_ratio/high_mean": 0.00039936143439263106, "clip_ratio/low_mean": 0.0002315789126441814, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006309403397608548, "completion_length": 94.07750091552734, "epoch": 0.7492075689174911, "grad_norm": 6.090396404266357, "kl": 0.8421477146446705, "learning_rate": 2.3276385837369632e-07, "loss": 0.014, "reward": 1.4471250534057618, "reward_std": 0.25895166750997306, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.4773125171661377, "step": 3900, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.02143551183398813, "clip_ratio/high_mean": 0.002903820894425735, "clip_ratio/low_mean": 0.00011704202042892576, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030208629119442775, "completion_length": 89.32750091552734, "epoch": 0.7511286139659975, "grad_norm": 7.675207614898682, "kl": 4.630686198174954, "learning_rate": 2.3084293765286074e-07, "loss": 0.0109, "reward": 1.7639801740646361, "reward_std": 0.32505679726600645, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6360525727272034, "step": 3910, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.007216949050780385, "clip_ratio/high_mean": 0.0012314463703660295, "clip_ratio/low_mean": 0.000596191274235025, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018276376475114375, "completion_length": 93.16250152587891, "epoch": 0.7530496590145039, "grad_norm": 3.4967644214630127, "kl": 0.9979558669030666, "learning_rate": 2.2893364823342454e-07, "loss": 0.0016, "reward": 1.5569410085678101, "reward_std": 0.2807903170585632, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.5365955173969269, "step": 3920, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.01850514723919332, "clip_ratio/high_mean": 0.003044746146770194, "clip_ratio/low_mean": 0.0006967324326978997, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037414785125292837, "completion_length": 95.95500183105469, "epoch": 0.7549707040630103, "grad_norm": 2.8742544651031494, "kl": 0.44021010398864746, "learning_rate": 2.270360596971809e-07, "loss": -0.0037, "reward": 1.823073673248291, "reward_std": 0.24968771934509276, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.663411819934845, "step": 3930, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.015126590803265571, "clip_ratio/high_mean": 0.0023361636558547616, "clip_ratio/low_mean": 0.00015787699958309532, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002494040655437857, "completion_length": 91.19500122070312, "epoch": 0.7568917491115167, "grad_norm": 3.40413236618042, "kl": 0.386103405430913, "learning_rate": 2.2515024119949826e-07, "loss": -0.011, "reward": 1.5718731164932251, "reward_std": 0.2807211749255657, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.5415615499019623, "step": 3940, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.015597179555334151, "clip_ratio/high_mean": 0.0027747701620683073, "clip_ratio/low_mean": 0.00042166481143794953, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003196435049176216, "completion_length": 98.425, "epoch": 0.758812794160023, "grad_norm": 4.560734272003174, "kl": 0.4831135801970959, "learning_rate": 2.2327626146679974e-07, "loss": -0.0022, "reward": 1.7759766340255738, "reward_std": 0.2547271862626076, "rewards/code_format_reward": 0.9625, "rewards/code_reward": 0.6473633050918579, "step": 3950, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.008683394081890583, "clip_ratio/high_mean": 0.0011145618045702577, "clip_ratio/low_mean": 0.0009394719265401364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020540336496196686, "completion_length": 102.31250305175782, "epoch": 0.7607338392085294, "grad_norm": 0.1577247530221939, "kl": 1.2770531885325909, "learning_rate": 2.2141418879405855e-07, "loss": 0.0032, "reward": 1.7324957370758056, "reward_std": 0.19914634823799132, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.6199978470802308, "step": 3960, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.004086668835952878, "clip_ratio/high_mean": 0.0005708287237212062, "clip_ratio/low_mean": 2.821670495904982e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005990454228594899, "completion_length": 95.21750335693359, "epoch": 0.7626548842570359, "grad_norm": 268.0164794921875, "kl": 3.985953611135483, "learning_rate": 2.1956409104230986e-07, "loss": 0.0127, "reward": 1.7277408480644225, "reward_std": 0.19516595900058747, "rewards/code_format_reward": 0.9737500071525573, "rewards/code_reward": 0.6204329133033752, "step": 3970, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.02115430913399905, "clip_ratio/high_mean": 0.003100222998182289, "clip_ratio/low_mean": 0.00045731081045232715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035575337868067438, "completion_length": 99.47250213623047, "epoch": 0.7645759293055422, "grad_norm": 4.087578773498535, "kl": 0.2619202695786953, "learning_rate": 2.1772603563617603e-07, "loss": -0.0024, "reward": 1.6976868152618407, "reward_std": 0.31094631999731065, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.6041558861732483, "step": 3980, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.022111613873858006, "clip_ratio/high_mean": 0.0033171431292430497, "clip_ratio/low_mean": 0.00019350402581039817, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003510647150687873, "completion_length": 93.09000091552734, "epoch": 0.7664969743540486, "grad_norm": 2.557553291320801, "kl": 0.4590821463614702, "learning_rate": 2.1590008956141137e-07, "loss": -0.0014, "reward": 1.7825278520584107, "reward_std": 0.26515288949012755, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.6440764307975769, "step": 3990, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.03076116186566651, "clip_ratio/high_mean": 0.004437833256088197, "clip_ratio/low_mean": 0.0004819675668841228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004919800782226957, "completion_length": 89.73500061035156, "epoch": 0.7684180194025549, "grad_norm": 2.5422067642211914, "kl": 0.26607592329382895, "learning_rate": 2.1408631936245908e-07, "loss": 0.0026, "reward": 1.8288384914398192, "reward_std": 0.2508297085762024, "rewards/code_format_reward": 0.9837499856948853, "rewards/code_reward": 0.6684817314147949, "step": 4000, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.020826040930114687, "clip_ratio/high_mean": 0.0040985049330629405, "clip_ratio/low_mean": 0.000369196553947404, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004467701492831111, "completion_length": 97.69500122070312, "epoch": 0.7703390644510614, "grad_norm": 2.079371929168701, "kl": 0.3304180882871151, "learning_rate": 2.122847911400278e-07, "loss": 0.0019, "reward": 1.693557620048523, "reward_std": 0.21333991810679437, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5977162718772888, "step": 4010, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.009364375309087337, "clip_ratio/high_mean": 0.0013745424774242565, "clip_ratio/low_mean": 0.0020853754234849476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034599179547512905, "completion_length": 94.00750274658203, "epoch": 0.7722601094995678, "grad_norm": 3.2660512924194336, "kl": 0.6432372182607651, "learning_rate": 2.1049557054868082e-07, "loss": 0.0073, "reward": 1.8483120203018188, "reward_std": 0.316910046339035, "rewards/code_format_reward": 0.9649999856948852, "rewards/code_reward": 0.6829060018062592, "step": 4020, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.08980275879148394, "clip_ratio/high_mean": 0.011746273408061825, "clip_ratio/low_mean": 0.000331929670937825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012078202966949902, "completion_length": 92.7925018310547, "epoch": 0.7741811545480741, "grad_norm": 3.004549503326416, "kl": 0.74478175714612, "learning_rate": 2.0871872279444554e-07, "loss": -0.0021, "reward": 1.7010861873626708, "reward_std": 0.25111902356147764, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.6071055889129638, "step": 4030, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.0778543038177304, "clip_ratio/high_mean": 0.00988141688721953, "clip_ratio/low_mean": 0.0002543082577176392, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01013572499359725, "completion_length": 105.63250122070312, "epoch": 0.7761021995965806, "grad_norm": 6.268821716308594, "kl": 0.32837071269750595, "learning_rate": 2.0695431263243512e-07, "loss": -0.0003, "reward": 1.716653084754944, "reward_std": 0.2870768278837204, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6108265280723572, "step": 4040, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.07360692555084825, "clip_ratio/high_mean": 0.009302017895970493, "clip_ratio/low_mean": 0.0003425976261496544, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009644615522120148, "completion_length": 91.73750152587891, "epoch": 0.7780232446450869, "grad_norm": 4.801341533660889, "kl": 13.291237189993263, "learning_rate": 2.052024043644897e-07, "loss": 0.0294, "reward": 1.7232446193695068, "reward_std": 0.24269133806228638, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6134972870349884, "step": 4050, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.012731208954937756, "clip_ratio/high_mean": 0.00180651948612649, "clip_ratio/low_mean": 0.00015854310477152466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019650626258226112, "completion_length": 92.22000274658203, "epoch": 0.7799442896935933, "grad_norm": 0.6561126112937927, "kl": 0.4966626279056072, "learning_rate": 2.0346306183683254e-07, "loss": 0.0001, "reward": 1.8969059467315674, "reward_std": 0.33292114436626435, "rewards/code_format_reward": 0.9800000071525574, "rewards/code_reward": 0.7034529447555542, "step": 4060, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.015003547444939614, "clip_ratio/high_mean": 0.002088976529194042, "clip_ratio/low_mean": 0.0003269152017310262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002415891730925068, "completion_length": 88.61250152587891, "epoch": 0.7818653347420997, "grad_norm": 3.062511920928955, "kl": 27.40203034952283, "learning_rate": 2.0173634843774363e-07, "loss": 0.0554, "reward": 1.7011754512786865, "reward_std": 0.3188599109649658, "rewards/code_format_reward": 0.981250011920929, "rewards/code_reward": 0.6052752196788788, "step": 4070, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.006837105128215626, "clip_ratio/high_mean": 0.0008925169277063105, "clip_ratio/low_mean": 0.0005512935545993969, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014438104728469626, "completion_length": 91.84750213623047, "epoch": 0.7837863797906061, "grad_norm": 3.0254440307617188, "kl": 1.3981286019086838, "learning_rate": 2.0002232709524897e-07, "loss": 0.0033, "reward": 1.6401101350784302, "reward_std": 0.26853239685297015, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.5738050699234009, "step": 4080, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.03983018643921241, "clip_ratio/high_mean": 0.005185264609463047, "clip_ratio/low_mean": 0.0019072047754889355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007092469278723002, "completion_length": 88.79250030517578, "epoch": 0.7857074248391125, "grad_norm": 2.8119072914123535, "kl": 0.41205914914608, "learning_rate": 1.983210602748279e-07, "loss": -0.0029, "reward": 1.9083050966262818, "reward_std": 0.29446094632148745, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.7085274815559387, "step": 4090, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.013765955006238072, "clip_ratio/high_mean": 0.0018926289907540196, "clip_ratio/low_mean": 0.0033484802523162218, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00524110905098496, "completion_length": 85.72500305175781, "epoch": 0.7876284698876188, "grad_norm": 9.436022758483887, "kl": 0.5864221028983593, "learning_rate": 1.966326099771361e-07, "loss": -0.0013, "reward": 1.8478533029556274, "reward_std": 0.2244624227285385, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6770516157150268, "step": 4100, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.008409230364486575, "clip_ratio/high_mean": 0.0011749810015317052, "clip_ratio/low_mean": 0.00043775633239420133, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001612737326649949, "completion_length": 91.16000213623047, "epoch": 0.7895495149361252, "grad_norm": 6.15724515914917, "kl": 19.288723162561656, "learning_rate": 1.9495703773574628e-07, "loss": 0.0383, "reward": 1.6099607944488525, "reward_std": 0.30300846993923186, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5599803984165191, "step": 4110, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.009142859559506177, "clip_ratio/high_mean": 0.001581054090638645, "clip_ratio/low_mean": 0.0003552554393536411, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019363095459993928, "completion_length": 91.99500122070313, "epoch": 0.7914705599846317, "grad_norm": 6.634824752807617, "kl": 6.53539779484272, "learning_rate": 1.9329440461490576e-07, "loss": 0.0342, "reward": 1.647179627418518, "reward_std": 0.2863168239593506, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5770273089408875, "step": 4120, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.002549535338766873, "clip_ratio/high_mean": 0.0003399143257411197, "clip_ratio/low_mean": 0.00017536718805786223, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000515281516709365, "completion_length": 90.89250183105469, "epoch": 0.793391605033138, "grad_norm": 2.817605972290039, "kl": 2.417458937317133, "learning_rate": 1.9164477120731066e-07, "loss": 0.0066, "reward": 1.7660948038101196, "reward_std": 0.2769928514957428, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.640859854221344, "step": 4130, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.037738511635689066, "clip_ratio/high_mean": 0.0050403060296957845, "clip_ratio/low_mean": 0.0007518758837250061, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005792181929427898, "completion_length": 96.35750122070313, "epoch": 0.7953126500816444, "grad_norm": 4.240172386169434, "kl": 0.28425633125007155, "learning_rate": 1.900081976318983e-07, "loss": 0.002, "reward": 1.6942025184631349, "reward_std": 0.3146607309579849, "rewards/code_format_reward": 0.9737499833106995, "rewards/code_reward": 0.6036637306213379, "step": 4140, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.005694918753579259, "clip_ratio/high_mean": 0.0007544978521764279, "clip_ratio/low_mean": 0.000571403895446565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001325901737436652, "completion_length": 91.79500122070313, "epoch": 0.7972336951301509, "grad_norm": 3.9649434089660645, "kl": 0.5314306125044823, "learning_rate": 1.8838474353165547e-07, "loss": -0.0054, "reward": 1.7638010501861572, "reward_std": 0.2793388396501541, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.6362755179405213, "step": 4150, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.07577090607956052, "clip_ratio/high_mean": 0.009897856542374938, "clip_ratio/low_mean": 0.00011142043076688424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010009276978962589, "completion_length": 94.04000244140624, "epoch": 0.7991547401786572, "grad_norm": 2.2340188026428223, "kl": 0.524626237899065, "learning_rate": 1.8677446807144554e-07, "loss": -0.0045, "reward": 1.7472325563430786, "reward_std": 0.3027869775891304, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.6289287328720092, "step": 4160, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.012572024948894978, "clip_ratio/high_mean": 0.0020916348788887263, "clip_ratio/low_mean": 0.00022255638323258607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023141912854043765, "completion_length": 94.53000183105469, "epoch": 0.8010757852271636, "grad_norm": 10.561907768249512, "kl": 2.102495136484504, "learning_rate": 1.8517742993585178e-07, "loss": 0.0137, "reward": 1.7456205368041993, "reward_std": 0.2167625606060028, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.6262477397918701, "step": 4170, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.06855184989399277, "clip_ratio/high_mean": 0.008778795686521335, "clip_ratio/low_mean": 5.122950533404946e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008830025191855384, "completion_length": 101.0425018310547, "epoch": 0.8029968302756699, "grad_norm": 5.673184871673584, "kl": 0.428597304970026, "learning_rate": 1.835936873270389e-07, "loss": -0.0078, "reward": 1.818405318260193, "reward_std": 0.23994216322898865, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6626401782035828, "step": 4180, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.002845590282231569, "clip_ratio/high_mean": 0.0004983038117643446, "clip_ratio/low_mean": 0.00037282529519870876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008711291156942025, "completion_length": 92.31250305175782, "epoch": 0.8049178753241764, "grad_norm": 6.281589508056641, "kl": 0.4346353754401207, "learning_rate": 1.8202329796263172e-07, "loss": -0.0009, "reward": 1.8768694639205932, "reward_std": 0.21767425537109375, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6918722629547119, "step": 4190, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.003876271191984415, "clip_ratio/high_mean": 0.0004845338989980519, "clip_ratio/low_mean": 0.0001560977878398262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006406316824723035, "completion_length": 75.7275016784668, "epoch": 0.8068389203726828, "grad_norm": 1.0423272848129272, "kl": 0.9193772681057453, "learning_rate": 1.8046631907361226e-07, "loss": 0.0041, "reward": 1.8756553649902343, "reward_std": 0.18836807161569596, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.6893901348114013, "step": 4200, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.004609162057749927, "clip_ratio/high_mean": 0.0007380300055956468, "clip_ratio/low_mean": 0.00015018127305665985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000888211271376349, "completion_length": 86.45750122070312, "epoch": 0.8087599654211891, "grad_norm": 4.096966743469238, "kl": 0.45643181502819063, "learning_rate": 1.7892280740223303e-07, "loss": -0.004, "reward": 1.5836501359939574, "reward_std": 0.2258547842502594, "rewards/code_format_reward": 0.9799999952316284, "rewards/code_reward": 0.5468250632286071, "step": 4210, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.007562826108187437, "clip_ratio/high_mean": 0.0010166528285481037, "clip_ratio/low_mean": 0.000701455632224679, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017181085073389112, "completion_length": 90.34250335693359, "epoch": 0.8106810104696955, "grad_norm": 0.29423439502716064, "kl": 0.2636001568287611, "learning_rate": 1.7739281919995045e-07, "loss": 0.0161, "reward": 1.5646157741546631, "reward_std": 0.12648468129336835, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5363703727722168, "step": 4220, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.02073557274416089, "clip_ratio/high_mean": 0.002738419675733894, "clip_ratio/low_mean": 0.001565844019933138, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004304263507947326, "completion_length": 85.92750091552735, "epoch": 0.8126020555182019, "grad_norm": 3.8796801567077637, "kl": 0.6587013073265553, "learning_rate": 1.7587641022537335e-07, "loss": -0.0031, "reward": 1.598485040664673, "reward_std": 0.23664331436157227, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5526800036430359, "step": 4230, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.0027086240705102684, "clip_ratio/high_mean": 0.0003605463745770976, "clip_ratio/low_mean": 0.0004666288397856988, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008271752245491371, "completion_length": 86.6875, "epoch": 0.8145231005667083, "grad_norm": 6.270168781280518, "kl": 3.9651204235851765, "learning_rate": 1.7437363574223244e-07, "loss": 0.0141, "reward": 1.8213656187057494, "reward_std": 0.2221561223268509, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.66474529504776, "step": 4240, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.009644670388661325, "clip_ratio/high_mean": 0.0013658979878528044, "clip_ratio/low_mean": 0.0006750999338692055, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002040997930453159, "completion_length": 86.42250061035156, "epoch": 0.8164441456152147, "grad_norm": 4.402440071105957, "kl": 0.27487861886620524, "learning_rate": 1.7288455051736474e-07, "loss": -0.0005, "reward": 1.6581492662429809, "reward_std": 0.14444592781364918, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5825121104717255, "step": 4250, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.017426467640325426, "clip_ratio/high_mean": 0.0023936200188472865, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023936200188472865, "completion_length": 88.66250152587891, "epoch": 0.818365190663721, "grad_norm": 15.625293731689453, "kl": 0.5453658372163772, "learning_rate": 1.7140920881871927e-07, "loss": 0.0001, "reward": 1.9025921821594238, "reward_std": 0.1951783686876297, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.7037960886955261, "step": 4260, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.024276501801796257, "clip_ratio/high_mean": 0.0037041545001557097, "clip_ratio/low_mean": 0.0004929742426611483, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004197128777741454, "completion_length": 94.525, "epoch": 0.8202862357122275, "grad_norm": 19.728607177734375, "kl": 3.983573118597269, "learning_rate": 1.699476644133778e-07, "loss": 0.0122, "reward": 1.7488954544067383, "reward_std": 0.2558127373456955, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6269477069377899, "step": 4270, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.00913497168221511, "clip_ratio/high_mean": 0.0011987716374278535, "clip_ratio/low_mean": 0.0007998564062290825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019986280538432767, "completion_length": 87.73999938964843, "epoch": 0.8222072807607338, "grad_norm": 4.567457675933838, "kl": 0.6975361555814743, "learning_rate": 1.6849997056559662e-07, "loss": -0.0116, "reward": 1.7202219009399413, "reward_std": 0.27057143300771713, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.6169859290122985, "step": 4280, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.016106344643048942, "clip_ratio/high_mean": 0.002376156343962066, "clip_ratio/low_mean": 5.540161509998143e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024315579648828134, "completion_length": 94.21750030517578, "epoch": 0.8241283258092402, "grad_norm": 17.505773544311523, "kl": 1.1381098613142968, "learning_rate": 1.670661800348644e-07, "loss": -0.0006, "reward": 1.7664429664611816, "reward_std": 0.283676877617836, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.6369715094566345, "step": 4290, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.05613061334006488, "clip_ratio/high_mean": 0.0073514855874236675, "clip_ratio/low_mean": 0.00012296391359996052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0074744494573678825, "completion_length": 94.24500122070313, "epoch": 0.8260493708577467, "grad_norm": 36.729576110839844, "kl": 2.2444246262311935, "learning_rate": 1.656463450739801e-07, "loss": 0.0024, "reward": 1.7431164741516114, "reward_std": 0.29661422967910767, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.6268707036972045, "step": 4300, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.003962649451568723, "clip_ratio/high_mean": 0.0005655559070874006, "clip_ratio/low_mean": 0.00023454214970115573, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008000980655197054, "completion_length": 91.94250030517578, "epoch": 0.827970415906253, "grad_norm": 5.331088066101074, "kl": 0.6558065637946129, "learning_rate": 1.6424051742714851e-07, "loss": 0.0002, "reward": 1.76786208152771, "reward_std": 0.17127570807933806, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.6361185550689697, "step": 4310, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.007816581195220352, "clip_ratio/high_mean": 0.001516599569004029, "clip_ratio/low_mean": 4.7630388871766625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015642299549654126, "completion_length": 82.7000015258789, "epoch": 0.8298914609547594, "grad_norm": 9.622750282287598, "kl": 0.9509772717952728, "learning_rate": 1.6284874832809436e-07, "loss": 0.0023, "reward": 1.9346927881240845, "reward_std": 0.3074748650193214, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.7189089298248291, "step": 4320, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.02933923137607053, "clip_ratio/high_mean": 0.004640504893905018, "clip_ratio/low_mean": 0.00011013215407729149, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004750637047982309, "completion_length": 88.08000183105469, "epoch": 0.8318125060032657, "grad_norm": 1.8961539268493652, "kl": 1.2761327236890794, "learning_rate": 1.614710884981951e-07, "loss": 0.0002, "reward": 1.5815791606903076, "reward_std": 0.24661691784858703, "rewards/code_format_reward": 0.9862499833106995, "rewards/code_reward": 0.5442270636558533, "step": 4330, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.035589413810521366, "clip_ratio/high_mean": 0.005689902242738754, "clip_ratio/low_mean": 0.00015636042662663384, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005846262606792152, "completion_length": 89.16500244140624, "epoch": 0.8337335510517722, "grad_norm": 1.6006284952163696, "kl": 0.6420656457543373, "learning_rate": 1.6010758814463287e-07, "loss": 0.0027, "reward": 1.643228530883789, "reward_std": 0.2129346549510956, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.5741142451763153, "step": 4340, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.007347659638617188, "clip_ratio/high_mean": 0.001005946182704065, "clip_ratio/low_mean": 0.00028780620195902886, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012937523904838599, "completion_length": 98.85000152587891, "epoch": 0.8356545961002786, "grad_norm": 5.479083061218262, "kl": 0.3409851986914873, "learning_rate": 1.5875829695856406e-07, "loss": -0.0007, "reward": 1.882705855369568, "reward_std": 0.22037020921707154, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.6938528895378113, "step": 4350, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.032100778096355496, "clip_ratio/high_mean": 0.004409284892608412, "clip_ratio/low_mean": 4.9924499762710184e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004459209358901717, "completion_length": 90.73500213623046, "epoch": 0.8375756411487849, "grad_norm": 56.100852966308594, "kl": 0.22566271349787712, "learning_rate": 1.5742326411330942e-07, "loss": 0.0011, "reward": 1.8064903020858765, "reward_std": 0.1691088706254959, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.65418261885643, "step": 4360, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.005500979837961495, "clip_ratio/high_mean": 0.0008934643206885085, "clip_ratio/low_mean": 0.0005694760067854077, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001462940318742767, "completion_length": 93.03750305175781, "epoch": 0.8394966861972913, "grad_norm": 7.828958034515381, "kl": 0.6565275602042675, "learning_rate": 1.5610253826256036e-07, "loss": 0.003, "reward": 1.7732144832611083, "reward_std": 0.33924323320388794, "rewards/code_format_reward": 0.9825000047683716, "rewards/code_reward": 0.6409822225570678, "step": 4370, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.0038046793546527625, "clip_ratio/high_mean": 0.0004755849193315953, "clip_ratio/low_mean": 0.0006387827248545364, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011143676441861317, "completion_length": 85.95250244140625, "epoch": 0.8414177312457977, "grad_norm": 3.0064802169799805, "kl": 9.46174124404788, "learning_rate": 1.5479616753860792e-07, "loss": 0.0195, "reward": 1.8130270481109618, "reward_std": 0.1679749459028244, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6583885312080383, "step": 4380, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.020394568890333177, "clip_ratio/high_mean": 0.002549321111291647, "clip_ratio/low_mean": 0.0012285682838410138, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003777889395132661, "completion_length": 94.72500305175781, "epoch": 0.8433387762943041, "grad_norm": 8.23426342010498, "kl": 0.3538756832480431, "learning_rate": 1.5350419955058645e-07, "loss": -0.0046, "reward": 1.6075192928314208, "reward_std": 0.16927714347839357, "rewards/code_format_reward": 0.9962499976158142, "rewards/code_reward": 0.5546970963478088, "step": 4390, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.06588131491444074, "clip_ratio/high_mean": 0.009102540424646578, "clip_ratio/low_mean": 0.0007730728961178101, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009875613666372374, "completion_length": 90.04500274658203, "epoch": 0.8452598213428105, "grad_norm": 7.7215776443481445, "kl": 0.2362464390695095, "learning_rate": 1.522266813827407e-07, "loss": 0.0036, "reward": 1.8586368560791016, "reward_std": 0.2194239765405655, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6805683970451355, "step": 4400, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.004174966411665082, "clip_ratio/high_mean": 0.0007423789938911796, "clip_ratio/low_mean": 7.375134955509566e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008161303412634879, "completion_length": 88.90750122070312, "epoch": 0.8471808663913168, "grad_norm": 2.829716920852661, "kl": 1.5581397600471973, "learning_rate": 1.509636595927078e-07, "loss": 0.003, "reward": 1.9052275657653808, "reward_std": 0.256375952064991, "rewards/code_format_reward": 0.9787499904632568, "rewards/code_reward": 0.7079262495040893, "step": 4410, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.07640773041639477, "clip_ratio/high_mean": 0.009898414360941387, "clip_ratio/low_mean": 9.467430354561656e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00999308866157662, "completion_length": 95.14500122070312, "epoch": 0.8491019114398233, "grad_norm": 0.3017069101333618, "kl": 0.8497596487402916, "learning_rate": 1.4971518020982232e-07, "loss": -0.0017, "reward": 1.5574845552444458, "reward_std": 0.1220448928885162, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5321797609329224, "step": 4420, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.02955477687064558, "clip_ratio/high_mean": 0.00414732932113111, "clip_ratio/low_mean": 4.42216987721622e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00419155100826174, "completion_length": 99.7, "epoch": 0.8510229564883296, "grad_norm": 5.942404747009277, "kl": 0.5168043114244938, "learning_rate": 1.4848128873343773e-07, "loss": -0.0003, "reward": 1.6633994817733764, "reward_std": 0.2619109332561493, "rewards/code_format_reward": 0.9762499928474426, "rewards/code_reward": 0.5876372039318085, "step": 4430, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.014663098810706288, "clip_ratio/high_mean": 0.0024809099428239278, "clip_ratio/low_mean": 3.1672295881435276e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002512582238705363, "completion_length": 100.20750122070312, "epoch": 0.852944001536836, "grad_norm": 3.1078836917877197, "kl": 0.39873379915952684, "learning_rate": 1.4726203013126844e-07, "loss": 0.006, "reward": 1.7631917238235473, "reward_std": 0.22433922737836837, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.6350333511829376, "step": 4440, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.014706605696119368, "clip_ratio/high_mean": 0.00257694432802964, "clip_ratio/low_mean": 0.00032811136916279795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002905055697192438, "completion_length": 99.6875, "epoch": 0.8548650465853425, "grad_norm": 8.708415985107422, "kl": 0.4677444875240326, "learning_rate": 1.4605744883775122e-07, "loss": -0.0036, "reward": 1.8840698957443238, "reward_std": 0.2510286644101143, "rewards/code_format_reward": 0.9850000143051147, "rewards/code_reward": 0.6957849264144897, "step": 4450, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.008664844953455032, "clip_ratio/high_mean": 0.0019024941400857642, "clip_ratio/low_mean": 0.002259151160251349, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004161645277054049, "completion_length": 89.9000015258789, "epoch": 0.8567860916338488, "grad_norm": 7.514847755432129, "kl": 0.3879747323691845, "learning_rate": 1.4486758875242557e-07, "loss": -0.0046, "reward": 1.9147763013839723, "reward_std": 0.2857444554567337, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.7102006316184998, "step": 4460, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.01213214877061546, "clip_ratio/high_mean": 0.0017360628451569937, "clip_ratio/low_mean": 0.0008027118048630655, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002538774654385634, "completion_length": 100.34250183105469, "epoch": 0.8587071366823552, "grad_norm": 4.4957380294799805, "kl": 0.7120470233261585, "learning_rate": 1.436924932383341e-07, "loss": -0.0029, "reward": 1.7210463523864745, "reward_std": 0.348609185218811, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.6164606809616089, "step": 4470, "zero_std_ratio": 0.375 }, { "clip_ratio/high_max": 0.04909939672797918, "clip_ratio/high_mean": 0.006639666750561446, "clip_ratio/low_mean": 0.0001961415633559227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006835808313917368, "completion_length": 89.65500030517578, "epoch": 0.8606281817308616, "grad_norm": 0.6291245818138123, "kl": 0.914973171055317, "learning_rate": 1.4253220512044194e-07, "loss": 0.0052, "reward": 1.5310453414916991, "reward_std": 0.2040669571608305, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.519272655248642, "step": 4480, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.023789329756982624, "clip_ratio/high_mean": 0.0034216867323266344, "clip_ratio/low_mean": 6.479026051238179e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003486476981197484, "completion_length": 94.68500061035157, "epoch": 0.862549226779368, "grad_norm": 3.6435203552246094, "kl": 0.24871882200241088, "learning_rate": 1.4138676668407637e-07, "loss": -0.004, "reward": 1.7728254079818726, "reward_std": 0.21846108362078667, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6386001646518707, "step": 4490, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.02858473571250215, "clip_ratio/high_mean": 0.004445229801058303, "clip_ratio/low_mean": 0.005347848287783563, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009793078135407996, "completion_length": 94.14000091552734, "epoch": 0.8644702718278744, "grad_norm": 7.250815391540527, "kl": 1.268965845555067, "learning_rate": 1.402562196733855e-07, "loss": 0.1222, "reward": 1.6482325553894044, "reward_std": 0.321136474609375, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5813037693500519, "step": 4500, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.0016861034324392675, "clip_ratio/high_mean": 0.00025714511721162123, "clip_ratio/low_mean": 8.047257215366699e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003376176857273094, "completion_length": 89.82750244140625, "epoch": 0.8663913168763807, "grad_norm": 1.5697243213653564, "kl": 0.3187939524650574, "learning_rate": 1.3914060528981713e-07, "loss": -0.0008, "reward": 1.6549904108047486, "reward_std": 0.15924324840307236, "rewards/code_format_reward": 0.9912499785423279, "rewards/code_reward": 0.5796826839447021, "step": 4510, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.005441831634379923, "clip_ratio/high_mean": 0.0007462791429134086, "clip_ratio/low_mean": 0.0008039395906962454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015502187496167607, "completion_length": 97.60250091552734, "epoch": 0.8683123619248871, "grad_norm": 2.864607334136963, "kl": 0.36184127181768416, "learning_rate": 1.38039964190617e-07, "loss": -0.0068, "reward": 1.5000358819961548, "reward_std": 0.22264644205570222, "rewards/code_format_reward": 0.9850000023841858, "rewards/code_reward": 0.5037679553031922, "step": 4520, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.045888486225157975, "clip_ratio/high_mean": 0.006488210440147668, "clip_ratio/low_mean": 4.380840982776135e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006532018849975429, "completion_length": 107.06000061035157, "epoch": 0.8702334069733936, "grad_norm": 3.5723934173583984, "kl": 0.21280892938375473, "learning_rate": 1.369543364873474e-07, "loss": 0.0008, "reward": 1.8976154088974, "reward_std": 0.22375442534685136, "rewards/code_format_reward": 0.9737499952316284, "rewards/code_reward": 0.7053701996803283, "step": 4530, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.021734172268770634, "clip_ratio/high_mean": 0.00285033899708651, "clip_ratio/low_mean": 0.00015430593703058548, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003004644898464903, "completion_length": 90.5125, "epoch": 0.8721544520218999, "grad_norm": 26.33332633972168, "kl": 16.64756402745843, "learning_rate": 1.3588376174442495e-07, "loss": 0.0407, "reward": 1.8465018033981324, "reward_std": 0.26863393038511274, "rewards/code_format_reward": 0.9900000095367432, "rewards/code_reward": 0.6757509171962738, "step": 4540, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.01540006476570852, "clip_ratio/high_mean": 0.00195431642132462, "clip_ratio/low_mean": 0.0003294220077805221, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022837384400190785, "completion_length": 91.96750183105469, "epoch": 0.8740754970704063, "grad_norm": 5.637061595916748, "kl": 0.5615961387753486, "learning_rate": 1.348282789776792e-07, "loss": 0.0006, "reward": 1.7335857629776001, "reward_std": 0.16677757501602172, "rewards/code_format_reward": 0.9712500095367431, "rewards/code_reward": 0.6239803791046142, "step": 4550, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.012607228197157382, "clip_ratio/high_mean": 0.0017772652208805084, "clip_ratio/low_mean": 0.00020470973395276816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019819749519228934, "completion_length": 90.42000122070313, "epoch": 0.8759965421189126, "grad_norm": 4.87404727935791, "kl": 0.5051522366702557, "learning_rate": 1.3378792665293032e-07, "loss": -0.0007, "reward": 1.8114176988601685, "reward_std": 0.27143858969211576, "rewards/code_format_reward": 0.9687499880790711, "rewards/code_reward": 0.6635213375091553, "step": 4560, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.003521555650513619, "clip_ratio/high_mean": 0.0005311336179147474, "clip_ratio/low_mean": 0.00039719248161418363, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009283260951633565, "completion_length": 96.31000061035157, "epoch": 0.8779175871674191, "grad_norm": 3.5294971466064453, "kl": 0.44891551434993743, "learning_rate": 1.3276274268458749e-07, "loss": -0.0011, "reward": 1.8015916109085084, "reward_std": 0.23535949736833572, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.65454580783844, "step": 4570, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.016813984792679548, "clip_ratio/high_mean": 0.0026305554260034115, "clip_ratio/low_mean": 0.00013278115511639044, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027633365796646105, "completion_length": 92.12000122070313, "epoch": 0.8798386322159255, "grad_norm": 3.3064281940460205, "kl": 147.6638460204005, "learning_rate": 1.3175276443426704e-07, "loss": 0.3018, "reward": 1.8557111263275146, "reward_std": 0.21927002370357512, "rewards/code_format_reward": 0.9924999833106994, "rewards/code_reward": 0.6797305464744567, "step": 4580, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.004338507051579654, "clip_ratio/high_mean": 0.0005835221760207787, "clip_ratio/low_mean": 9.975638386094943e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006832785555161535, "completion_length": 96.14250183105469, "epoch": 0.8817596772644318, "grad_norm": 5.933443546295166, "kl": 0.7469953082501888, "learning_rate": 1.3075802870943102e-07, "loss": -0.0005, "reward": 1.7140401601791382, "reward_std": 0.32567469477653505, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.6145200908184052, "step": 4590, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.008221420878544449, "clip_ratio/high_mean": 0.0010466745734447613, "clip_ratio/low_mean": 0.00021989296365063638, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012665675370953978, "completion_length": 96.91000213623047, "epoch": 0.8836807223129383, "grad_norm": 3.6585068702697754, "kl": 0.2884219281375408, "learning_rate": 1.2977857176204554e-07, "loss": -0.0014, "reward": 1.745366358757019, "reward_std": 0.28437634110450744, "rewards/code_format_reward": 0.9612499952316285, "rewards/code_reward": 0.6323706865310669, "step": 4600, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.013106013461947442, "clip_ratio/high_mean": 0.001983167743310332, "clip_ratio/low_mean": 0.0010545071098022162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030376748647540806, "completion_length": 95.46000366210937, "epoch": 0.8856017673614446, "grad_norm": 3.166572332382202, "kl": 0.7999920375645161, "learning_rate": 1.2881442928725997e-07, "loss": 0.0024, "reward": 1.7604058027267455, "reward_std": 0.1588110476732254, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6342653870582581, "step": 4610, "zero_std_ratio": 0.725 }, { "clip_ratio/high_max": 0.03971324802841991, "clip_ratio/high_mean": 0.005240282195154577, "clip_ratio/low_mean": 0.00012449334171833472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005364775560155977, "completion_length": 91.37250213623047, "epoch": 0.887522812409951, "grad_norm": 1.2688926458358765, "kl": 52.058202140033245, "learning_rate": 1.2786563642210536e-07, "loss": 0.1059, "reward": 1.6578764081001283, "reward_std": 0.1922210179269314, "rewards/code_format_reward": 0.9724999904632569, "rewards/code_reward": 0.5858131945133209, "step": 4620, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.014312215382233262, "clip_ratio/high_mean": 0.002295189391588792, "clip_ratio/low_mean": 0.0010927254450507462, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033879148249980062, "completion_length": 92.44750061035157, "epoch": 0.8894438574584574, "grad_norm": 1.0473991632461548, "kl": 0.48051133900880816, "learning_rate": 1.269322277442151e-07, "loss": 0.0015, "reward": 1.8454564094543457, "reward_std": 0.23949076235294342, "rewards/code_format_reward": 0.9824999809265137, "rewards/code_reward": 0.6771032094955445, "step": 4630, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.040262592025101185, "clip_ratio/high_mean": 0.005372756696306169, "clip_ratio/low_mean": 0.0007898360927356407, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006162592757027597, "completion_length": 84.6050018310547, "epoch": 0.8913649025069638, "grad_norm": 6.553028106689453, "kl": 0.6895815744996071, "learning_rate": 1.2601423727056346e-07, "loss": -0.0001, "reward": 1.6561978340148926, "reward_std": 0.36703028678894045, "rewards/code_format_reward": 0.975, "rewards/code_reward": 0.5843489110469818, "step": 4640, "zero_std_ratio": 0.325 }, { "clip_ratio/high_max": 0.06538669131696224, "clip_ratio/high_mean": 0.009138646663632243, "clip_ratio/low_mean": 0.0017342485502013006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010872895480133593, "completion_length": 88.39750061035156, "epoch": 0.8932859475554702, "grad_norm": 4.167427062988281, "kl": 1.728559673577547, "learning_rate": 1.2511169845622699e-07, "loss": 0.0019, "reward": 1.6277015209197998, "reward_std": 0.21625073552131652, "rewards/code_format_reward": 0.975000011920929, "rewards/code_reward": 0.5701007604598999, "step": 4650, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.042488472175318745, "clip_ratio/high_mean": 0.005791870540997479, "clip_ratio/low_mean": 2.0525451691355557e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005812396005785559, "completion_length": 92.57250213623047, "epoch": 0.8952069926039765, "grad_norm": 6.026858806610107, "kl": 0.7588046140968799, "learning_rate": 1.2422464419316432e-07, "loss": 0.0034, "reward": 1.7008742094039917, "reward_std": 0.27438378930091856, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.6079370617866516, "step": 4660, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.013316971366293728, "clip_ratio/high_mean": 0.0019765587523579596, "clip_ratio/low_mean": 8.563735173083842e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002062196109909564, "completion_length": 93.35000305175781, "epoch": 0.897128037652483, "grad_norm": 4.863064765930176, "kl": 7.6627843722701074, "learning_rate": 1.233531068090184e-07, "loss": 0.011, "reward": 1.8806322813034058, "reward_std": 0.28162118047475815, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.6931286633014679, "step": 4670, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.004586372757330537, "clip_ratio/high_mean": 0.0006299379543634132, "clip_ratio/low_mean": 1.7313018906861545e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006472509849118069, "completion_length": 92.36500091552735, "epoch": 0.8990490827009894, "grad_norm": 2.1931703090667725, "kl": 0.2519014351069927, "learning_rate": 1.2249711806593762e-07, "loss": 0.0034, "reward": 1.8040930509567261, "reward_std": 0.24223610311746596, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.6561090111732483, "step": 4680, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.006883417209610343, "clip_ratio/high_mean": 0.0009808192204218357, "clip_ratio/low_mean": 0.00029269491278682835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001273514133208664, "completion_length": 91.60500030517578, "epoch": 0.9009701277494957, "grad_norm": 21.0294132232666, "kl": 0.25964570268988607, "learning_rate": 1.2165670915941866e-07, "loss": -0.0043, "reward": 1.9244711637496947, "reward_std": 0.1629927098751068, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.7156730651855469, "step": 4690, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.010230390657670795, "clip_ratio/high_mean": 0.0014585633180104196, "clip_ratio/low_mean": 3.0266345129348336e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001488829671870917, "completion_length": 87.84500274658203, "epoch": 0.9028911727980021, "grad_norm": 1.7218002080917358, "kl": 16.447690600901844, "learning_rate": 1.2083191071716937e-07, "loss": 0.0339, "reward": 1.940086579322815, "reward_std": 0.16455088555812836, "rewards/code_format_reward": 0.993749988079071, "rewards/code_reward": 0.7216057777404785, "step": 4700, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.025706328079104425, "clip_ratio/high_mean": 0.003573437442537397, "clip_ratio/low_mean": 3.392130311112851e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036073587427381424, "completion_length": 81.84500274658203, "epoch": 0.9048122178465084, "grad_norm": 0.22543705999851227, "kl": 0.31741214692592623, "learning_rate": 1.2002275279799288e-07, "loss": -0.0056, "reward": 1.8292718410491944, "reward_std": 0.12828939855098725, "rewards/code_format_reward": 0.9987499952316284, "rewards/code_reward": 0.6649484276771546, "step": 4710, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.009241180948447437, "clip_ratio/high_mean": 0.0013442957555525937, "clip_ratio/low_mean": 4.643963038688526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013907353932154365, "completion_length": 95.63500213623047, "epoch": 0.9067332628950149, "grad_norm": 5.23514986038208, "kl": 0.804936108738184, "learning_rate": 1.192292648906918e-07, "loss": 0.0031, "reward": 1.925449275970459, "reward_std": 0.2213977299630642, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.7152246475219727, "step": 4720, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.021669640118489042, "clip_ratio/high_mean": 0.003889294656983111, "clip_ratio/low_mean": 0.00044275675172684715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004332051414530724, "completion_length": 92.25250244140625, "epoch": 0.9086543079435213, "grad_norm": 66.00515747070312, "kl": 2.1086502872407435, "learning_rate": 1.1845147591299378e-07, "loss": 0.0162, "reward": 1.5327723979949952, "reward_std": 0.2872114762663841, "rewards/code_format_reward": 0.9725000023841858, "rewards/code_reward": 0.5232611894607544, "step": 4730, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.006079713994404301, "clip_ratio/high_mean": 0.0010439059922646265, "clip_ratio/low_mean": 0.0013928397551353556, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002436745767045068, "completion_length": 97.43000030517578, "epoch": 0.9105753529920276, "grad_norm": 2.8770546913146973, "kl": 3.1866038836538793, "learning_rate": 1.1768941421049768e-07, "loss": 0.0069, "reward": 1.7776832818984984, "reward_std": 0.29561240673065187, "rewards/code_format_reward": 0.9949999928474427, "rewards/code_reward": 0.6400915861129761, "step": 4740, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.005499497149139642, "clip_ratio/high_mean": 0.0006874371436424553, "clip_ratio/low_mean": 0.000482194940559566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011696320783812554, "completion_length": 88.9500015258789, "epoch": 0.9124963980405341, "grad_norm": 8.540057182312012, "kl": 0.9072364956140518, "learning_rate": 1.1694310755564014e-07, "loss": -0.0021, "reward": 1.6791202545166015, "reward_std": 0.326928648352623, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.595185148715973, "step": 4750, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.008684736292343587, "clip_ratio/high_mean": 0.001148225087672472, "clip_ratio/low_mean": 0.0005504319502506405, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016986570524750277, "completion_length": 95.68250122070313, "epoch": 0.9144174430890404, "grad_norm": 4.539205551147461, "kl": 0.860013198107481, "learning_rate": 1.1621258314668402e-07, "loss": 0.0, "reward": 1.7214089155197143, "reward_std": 0.1847836285829544, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.6188294351100921, "step": 4760, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.012012088089250028, "clip_ratio/high_mean": 0.0020424059097422288, "clip_ratio/low_mean": 7.006222731433808e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021124681399669496, "completion_length": 93.21750183105469, "epoch": 0.9163384881375468, "grad_norm": 6.60590124130249, "kl": 0.45315413996577264, "learning_rate": 1.1549786760672676e-07, "loss": -0.0013, "reward": 1.7664082288742065, "reward_std": 0.24015129953622819, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.6369540929794312, "step": 4770, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.028257530624978246, "clip_ratio/high_mean": 0.00573968501703348, "clip_ratio/low_mean": 0.00028595919138751924, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006025644272449426, "completion_length": 93.59000244140626, "epoch": 0.9182595331860532, "grad_norm": 3.693448781967163, "kl": 0.5863466400653123, "learning_rate": 1.1479898698273037e-07, "loss": 0.0001, "reward": 1.7522862911224366, "reward_std": 0.24038469642400742, "rewards/code_format_reward": 0.9762500047683715, "rewards/code_reward": 0.6320806205272674, "step": 4780, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.006044295988976956, "clip_ratio/high_mean": 0.0008545084856450558, "clip_ratio/low_mean": 0.0004956311546266079, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013501396053470672, "completion_length": 100.22000122070312, "epoch": 0.9201805782345596, "grad_norm": 17.894886016845703, "kl": 0.33935268595814705, "learning_rate": 1.1411596674457193e-07, "loss": -0.0019, "reward": 1.697510004043579, "reward_std": 0.16087576895952224, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6018799901008606, "step": 4790, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.005959878279827535, "clip_ratio/high_mean": 0.0009170519857434556, "clip_ratio/low_mean": 7.972503372002392e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009967770281946286, "completion_length": 98.54750213623046, "epoch": 0.922101623283066, "grad_norm": 3.242460250854492, "kl": 0.46977903619408606, "learning_rate": 1.1344883178411565e-07, "loss": -0.0036, "reward": 1.7927821159362793, "reward_std": 0.24044746458530425, "rewards/code_format_reward": 0.9699999809265136, "rewards/code_reward": 0.6538910627365112, "step": 4800, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.00756604690104723, "clip_ratio/high_mean": 0.0010185762541368604, "clip_ratio/low_mean": 0.00016034738000598737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001178923639236018, "completion_length": 99.49000091552735, "epoch": 0.9240226683315724, "grad_norm": 7.225472927093506, "kl": 0.2285786397755146, "learning_rate": 1.1279760641430568e-07, "loss": 0.0001, "reward": 1.7233760595321654, "reward_std": 0.22306990921497344, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6148130118846893, "step": 4810, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.011060118256136776, "clip_ratio/high_mean": 0.0017047788191121072, "clip_ratio/low_mean": 0.00028281604463700207, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019875948812114073, "completion_length": 92.36500091552735, "epoch": 0.9259437133800787, "grad_norm": 4.390386581420898, "kl": 0.8032988727092742, "learning_rate": 1.1216231436827974e-07, "loss": 0.0005, "reward": 1.7829072952270508, "reward_std": 0.21434771865606309, "rewards/code_format_reward": 0.9862500071525574, "rewards/code_reward": 0.6448911607265473, "step": 4820, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.013947398256277665, "clip_ratio/high_mean": 0.0018392194229818414, "clip_ratio/low_mean": 0.0004060613617184572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002245280790521065, "completion_length": 103.5125015258789, "epoch": 0.9278647584285852, "grad_norm": 6.774899482727051, "kl": 0.34710453301668165, "learning_rate": 1.1154297879850462e-07, "loss": 0.0003, "reward": 1.7023445606231689, "reward_std": 0.23593612909317016, "rewards/code_format_reward": 0.96875, "rewards/code_reward": 0.60898477435112, "step": 4830, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.00909699429757893, "clip_ratio/high_mean": 0.0014705892943311482, "clip_ratio/low_mean": 0.0004355724740889855, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019061617698753253, "completion_length": 91.47000274658203, "epoch": 0.9297858034770915, "grad_norm": 1.7924318313598633, "kl": 0.5235365644097328, "learning_rate": 1.1093962227593214e-07, "loss": 0.0017, "reward": 1.823938512802124, "reward_std": 0.18318418860435487, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.6650941967964172, "step": 4840, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.008868952537886799, "clip_ratio/high_mean": 0.0013198618631577118, "clip_ratio/low_mean": 6.896776030771435e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013888296205550432, "completion_length": 97.04750061035156, "epoch": 0.9317068485255979, "grad_norm": 5.492427825927734, "kl": 0.27957614585757257, "learning_rate": 1.1035226678917662e-07, "loss": 0.0001, "reward": 1.7743586778640748, "reward_std": 0.19067177027463914, "rewards/code_format_reward": 0.9699999928474426, "rewards/code_reward": 0.6446793019771576, "step": 4850, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.00021865542512387038, "clip_ratio/high_mean": 2.7331928140483797e-05, "clip_ratio/low_mean": 0.00022580694640055298, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002531388745410368, "completion_length": 91.65750274658203, "epoch": 0.9336278935741044, "grad_norm": 8.045164108276367, "kl": 0.20759812816977502, "learning_rate": 1.0978093374371373e-07, "loss": -0.0004, "reward": 1.7663999795913696, "reward_std": 0.281513449549675, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.6353874802589417, "step": 4860, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.02832627217285335, "clip_ratio/high_mean": 0.0035600741393864155, "clip_ratio/low_mean": 0.00011176664993399754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036718408693559466, "completion_length": 84.46500244140626, "epoch": 0.9355489386226107, "grad_norm": 4.819484233856201, "kl": 0.5664212189614772, "learning_rate": 1.0922564396109993e-07, "loss": -0.0008, "reward": 1.7755849838256836, "reward_std": 0.20761601328849794, "rewards/code_format_reward": 0.9899999856948852, "rewards/code_reward": 0.640292489528656, "step": 4870, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.006872700434178114, "clip_ratio/high_mean": 0.0009693403088022023, "clip_ratio/low_mean": 3.415665923967026e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0010034969680418726, "completion_length": 92.47500152587891, "epoch": 0.9374699836711171, "grad_norm": 2.605060338973999, "kl": 0.6489929877221584, "learning_rate": 1.0868641767821432e-07, "loss": -0.0041, "reward": 1.9151075601577758, "reward_std": 0.2566168040037155, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.7113037467002868, "step": 4880, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.018258474441245197, "clip_ratio/high_mean": 0.003355332469800487, "clip_ratio/low_mean": 0.000607103164657019, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003962435649009421, "completion_length": 90.51000366210937, "epoch": 0.9393910287196234, "grad_norm": 4.408846378326416, "kl": 0.35625301077961924, "learning_rate": 1.0816327454652044e-07, "loss": -0.0018, "reward": 1.7154739379882813, "reward_std": 0.2987362504005432, "rewards/code_format_reward": 0.9612499952316285, "rewards/code_reward": 0.6174244284629822, "step": 4890, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.010325380798894912, "clip_ratio/high_mean": 0.0015298718310077675, "clip_ratio/low_mean": 0.000294900168228196, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018247719475766645, "completion_length": 100.19250183105468, "epoch": 0.9413120737681299, "grad_norm": 9.08279037475586, "kl": 0.23486268445849418, "learning_rate": 1.0765623363135061e-07, "loss": -0.0011, "reward": 1.5800267338752747, "reward_std": 0.26311944872140886, "rewards/code_format_reward": 0.9862499952316284, "rewards/code_reward": 0.5434508502483368, "step": 4900, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.004708675656002015, "clip_ratio/high_mean": 0.0008911975004593842, "clip_ratio/low_mean": 0.0001348661200609058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001026063623430673, "completion_length": 85.6300033569336, "epoch": 0.9432331188166363, "grad_norm": 2.5798628330230713, "kl": 0.5353534445166588, "learning_rate": 1.071653134112109e-07, "loss": -0.0018, "reward": 1.7293733358383179, "reward_std": 0.23426424115896224, "rewards/code_format_reward": 0.9862499833106995, "rewards/code_reward": 0.6181241631507873, "step": 4910, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.031931064534001054, "clip_ratio/high_mean": 0.004416047394624911, "clip_ratio/low_mean": 0.00037934551510261373, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047953929373761636, "completion_length": 93.08500061035156, "epoch": 0.9451541638651426, "grad_norm": 3.0407347679138184, "kl": 0.3617399115115404, "learning_rate": 1.0669053177710766e-07, "loss": -0.0023, "reward": 1.602178120613098, "reward_std": 0.23843889832496643, "rewards/code_format_reward": 0.987499988079071, "rewards/code_reward": 0.5542140543460846, "step": 4920, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.008546069997828453, "clip_ratio/high_mean": 0.0011838132908451372, "clip_ratio/low_mean": 6.596306338906288e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012497763542341999, "completion_length": 102.79500122070313, "epoch": 0.947075208913649, "grad_norm": 5.987438678741455, "kl": 0.28761252388358116, "learning_rate": 1.0623190603189566e-07, "loss": 0.0011, "reward": 1.5471005201339723, "reward_std": 0.28855718672275543, "rewards/code_format_reward": 0.9674999952316284, "rewards/code_reward": 0.5316752552986145, "step": 4930, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.03786106104962528, "clip_ratio/high_mean": 0.005264365172479302, "clip_ratio/low_mean": 0.001157468621386215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00642183352902066, "completion_length": 95.2625015258789, "epoch": 0.9489962539621554, "grad_norm": 4.088647842407227, "kl": 9114.393886435031, "learning_rate": 1.0578945288964734e-07, "loss": 18.226, "reward": 1.5625978589057923, "reward_std": 0.22688832581043245, "rewards/code_format_reward": 0.9762499928474426, "rewards/code_reward": 0.5372364044189453, "step": 4940, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.009068883489817381, "clip_ratio/high_mean": 0.0015044378931634128, "clip_ratio/low_mean": 8.003948896657675e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015844773850403726, "completion_length": 88.91999969482421, "epoch": 0.9509172990106618, "grad_norm": 4.558302879333496, "kl": 0.322134206071496, "learning_rate": 1.0536318847504383e-07, "loss": 0.0008, "reward": 1.683999252319336, "reward_std": 0.15837213546037673, "rewards/code_format_reward": 0.9887500047683716, "rewards/code_reward": 0.5948120951652527, "step": 4950, "zero_std_ratio": 0.65 }, { "clip_ratio/high_max": 0.004135725944070146, "clip_ratio/high_mean": 0.0006349837080051657, "clip_ratio/low_mean": 0.0002028582151979208, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008378419290238526, "completion_length": 88.58000030517579, "epoch": 0.9528383440591682, "grad_norm": 1.3143569231033325, "kl": 0.32492467686533927, "learning_rate": 1.0495312832278721e-07, "loss": 0.001, "reward": 1.757376217842102, "reward_std": 0.18446292728185654, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.6318130671977997, "step": 4960, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.004577422246802599, "clip_ratio/high_mean": 0.00067823924619006, "clip_ratio/low_mean": 0.00020590101485140622, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0008841402595862746, "completion_length": 91.77249908447266, "epoch": 0.9547593891076745, "grad_norm": 2.7616970539093018, "kl": 0.6282597549259663, "learning_rate": 1.0455928737703441e-07, "loss": 0.0001, "reward": 1.665701198577881, "reward_std": 0.1566584974527359, "rewards/code_format_reward": 0.99375, "rewards/code_reward": 0.5844130754470825, "step": 4970, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.012442531622946262, "clip_ratio/high_mean": 0.0018589732819236815, "clip_ratio/low_mean": 6.720430101267994e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019261775829363613, "completion_length": 90.92250213623046, "epoch": 0.956680434156181, "grad_norm": 2.84462308883667, "kl": 0.3018207371234894, "learning_rate": 1.0418167999085259e-07, "loss": 0.0041, "reward": 1.7472755432128906, "reward_std": 0.24319706559181214, "rewards/code_format_reward": 0.9774999856948853, "rewards/code_reward": 0.6292627692222595, "step": 4980, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.01709002295974642, "clip_ratio/high_mean": 0.002679444645764306, "clip_ratio/low_mean": 0.0003698979213368148, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003049342567101121, "completion_length": 92.97750091552734, "epoch": 0.9586014792046873, "grad_norm": 11.978320121765137, "kl": 1.2294385731220245, "learning_rate": 1.0382031992569592e-07, "loss": 0.0036, "reward": 1.739167046546936, "reward_std": 0.29275294244289396, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.622708535194397, "step": 4990, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.007942511793226003, "clip_ratio/high_mean": 0.001185902243014425, "clip_ratio/low_mean": 5.571418441832066e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012416164390742779, "completion_length": 93.31250305175782, "epoch": 0.9605225242531937, "grad_norm": 3.364788055419922, "kl": 0.35085868686437605, "learning_rate": 1.0347522035090446e-07, "loss": -0.0003, "reward": 1.9564055442810058, "reward_std": 0.2229623466730118, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.7303902268409729, "step": 5000, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.015932422177866102, "clip_ratio/high_mean": 0.0028564550855662675, "clip_ratio/low_mean": 0.00020086783915758134, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003057322936365381, "completion_length": 96.12750091552735, "epoch": 0.9624435693017002, "grad_norm": 5.283419609069824, "kl": 0.3115640334784985, "learning_rate": 1.0314639384322356e-07, "loss": -0.0037, "reward": 1.6293291807174684, "reward_std": 0.2581008836627007, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.5693520545959473, "step": 5010, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.003986756759695708, "clip_ratio/high_mean": 0.0006319725507637486, "clip_ratio/low_mean": 0.000603693921584636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012356664577964694, "completion_length": 86.04750366210938, "epoch": 0.9643646143502065, "grad_norm": 8.70874309539795, "kl": 0.47548493221402166, "learning_rate": 1.0283385238634632e-07, "loss": 0.0041, "reward": 1.622909712791443, "reward_std": 0.2179076835513115, "rewards/code_format_reward": 0.9712499976158142, "rewards/code_reward": 0.5686423420906067, "step": 5020, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.005893218703567982, "clip_ratio/high_mean": 0.000819433806464076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000819433806464076, "completion_length": 88.9375015258789, "epoch": 0.9662856593987129, "grad_norm": 6.6337199211120605, "kl": 0.5933880299329758, "learning_rate": 1.0253760737047606e-07, "loss": -0.0043, "reward": 1.7307970523834229, "reward_std": 0.1557233951985836, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6172735095024109, "step": 5030, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.010898534208536148, "clip_ratio/high_mean": 0.0015226851450279356, "clip_ratio/low_mean": 0.009100449224933981, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010623134509660303, "completion_length": 87.65500183105469, "epoch": 0.9682067044472192, "grad_norm": 12.837902069091797, "kl": 0.1521947119385004, "learning_rate": 1.0225766959191187e-07, "loss": 0.0007, "reward": 1.766017746925354, "reward_std": 0.1697022169828415, "rewards/code_format_reward": 0.9924999952316285, "rewards/code_reward": 0.6348838567733764, "step": 5040, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.039930257271043955, "clip_ratio/high_mean": 0.005228024450480007, "clip_ratio/low_mean": 0.0012023555900668725, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006430380133679137, "completion_length": 99.63500213623047, "epoch": 0.9701277494957257, "grad_norm": 3.0135834217071533, "kl": 0.5389343507587909, "learning_rate": 1.0199404925265473e-07, "loss": -0.0011, "reward": 1.5655887126922607, "reward_std": 0.1425598829984665, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.5365443468093872, "step": 5050, "zero_std_ratio": 0.575 }, { "clip_ratio/high_max": 0.013292990019544959, "clip_ratio/high_mean": 0.0019570814620237797, "clip_ratio/low_mean": 0.0004139953598496504, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023710768204182387, "completion_length": 92.15750122070312, "epoch": 0.9720487945442321, "grad_norm": 8.622629165649414, "kl": 0.3708019584417343, "learning_rate": 1.0174675596003588e-07, "loss": -0.0037, "reward": 1.6285043001174926, "reward_std": 0.21171441301703453, "rewards/code_format_reward": 0.9675000071525574, "rewards/code_reward": 0.5723771452903748, "step": 5060, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.011086594103835523, "clip_ratio/high_mean": 0.001482552892412059, "clip_ratio/low_mean": 7.31003499822691e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001555653239483945, "completion_length": 92.72000122070312, "epoch": 0.9739698395927384, "grad_norm": 10.519503593444824, "kl": 0.42225370053201916, "learning_rate": 1.0151579872636673e-07, "loss": 0.0073, "reward": 1.9428821086883545, "reward_std": 0.2824172407388687, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.7261285543441772, "step": 5070, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.02070889645256102, "clip_ratio/high_mean": 0.0035216436022892593, "clip_ratio/low_mean": 0.0003085655207542004, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038302090688375756, "completion_length": 105.0050048828125, "epoch": 0.9758908846412448, "grad_norm": 4.139841079711914, "kl": 0.3159520372748375, "learning_rate": 1.0130118596861028e-07, "loss": -0.0044, "reward": 1.6708447217941285, "reward_std": 0.30501508712768555, "rewards/code_format_reward": 0.9837499976158142, "rewards/code_reward": 0.5894848227500915, "step": 5080, "zero_std_ratio": 0.4 }, { "clip_ratio/high_max": 0.008058706868905575, "clip_ratio/high_mean": 0.0012073565638274885, "clip_ratio/low_mean": 0.00031636476196581496, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015237213257933036, "completion_length": 84.28750152587891, "epoch": 0.9778119296897512, "grad_norm": 4.015879154205322, "kl": 0.2918614260852337, "learning_rate": 1.0110292550807451e-07, "loss": -0.0012, "reward": 1.7721335172653199, "reward_std": 0.286711610853672, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.6385667800903321, "step": 5090, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.019471552316099407, "clip_ratio/high_mean": 0.0026317643467336895, "clip_ratio/low_mean": 0.0003316317946882918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029633961850777267, "completion_length": 90.81000213623047, "epoch": 0.9797329747382576, "grad_norm": 1.132954716682434, "kl": 0.2704964060336351, "learning_rate": 1.0092102457012717e-07, "loss": -0.0022, "reward": 1.6570582151412965, "reward_std": 0.21210518777370452, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5810291051864624, "step": 5100, "zero_std_ratio": 0.5 }, { "clip_ratio/high_max": 0.011018617497757077, "clip_ratio/high_mean": 0.0013860319217201323, "clip_ratio/low_mean": 3.4722223062999547e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014207541418727488, "completion_length": 93.61250305175781, "epoch": 0.981654019786764, "grad_norm": 16.08737564086914, "kl": 0.26382347345352175, "learning_rate": 1.0075548978393277e-07, "loss": -0.0002, "reward": 1.8070130348205566, "reward_std": 0.1673865035176277, "rewards/code_format_reward": 0.9912500023841858, "rewards/code_reward": 0.6556940078735352, "step": 5110, "zero_std_ratio": 0.625 }, { "clip_ratio/high_max": 0.010004310857038946, "clip_ratio/high_mean": 0.0012777127660228871, "clip_ratio/low_mean": 0.0003342236072057858, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016119363936013542, "completion_length": 89.3, "epoch": 0.9835750648352704, "grad_norm": 0.4556010961532593, "kl": 0.4934497371315956, "learning_rate": 1.0060632718221066e-07, "loss": 0.0026, "reward": 1.3408710062503815, "reward_std": 0.16168890111148357, "rewards/code_format_reward": 0.9875, "rewards/code_reward": 0.42356050610542295, "step": 5120, "zero_std_ratio": 0.7 }, { "clip_ratio/high_max": 0.05311971204355359, "clip_ratio/high_mean": 0.0075716287479735914, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0075716287479735914, "completion_length": 102.92249908447266, "epoch": 0.9854961098837768, "grad_norm": 3.9305222034454346, "kl": 0.27781638093292715, "learning_rate": 1.0047354220101518e-07, "loss": -0.0011, "reward": 1.630450439453125, "reward_std": 0.18297318816185, "rewards/code_format_reward": 0.9887499928474426, "rewards/code_reward": 0.5680376827716828, "step": 5130, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.004692732833791524, "clip_ratio/high_mean": 0.0006299943852354772, "clip_ratio/low_mean": 0.00031122941145440565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009412237734068186, "completion_length": 88.23250122070313, "epoch": 0.9874171549322832, "grad_norm": 4.31157112121582, "kl": 0.2751577727496624, "learning_rate": 1.0035713967953797e-07, "loss": -0.0038, "reward": 1.635274839401245, "reward_std": 0.29494107216596605, "rewards/code_format_reward": 0.9849999904632568, "rewards/code_reward": 0.5713874340057373, "step": 5140, "zero_std_ratio": 0.45 }, { "clip_ratio/high_max": 0.012987824180163443, "clip_ratio/high_mean": 0.0019960356265073644, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019960356265073644, "completion_length": 86.49750061035157, "epoch": 0.9893381999807895, "grad_norm": 7.45393705368042, "kl": 0.3619408316910267, "learning_rate": 1.0025712385993115e-07, "loss": 0.0012, "reward": 1.687432312965393, "reward_std": 0.2386924833059311, "rewards/code_format_reward": 0.9912499904632568, "rewards/code_reward": 0.5959036707878113, "step": 5150, "zero_std_ratio": 0.475 }, { "clip_ratio/high_max": 0.014351918507600203, "clip_ratio/high_mean": 0.002000216278975131, "clip_ratio/low_mean": 7.898250914877281e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020791988128621595, "completion_length": 89.34250030517578, "epoch": 0.991259245029296, "grad_norm": 35.51432418823242, "kl": 0.2617587223649025, "learning_rate": 1.0017349838715278e-07, "loss": -0.004, "reward": 1.2408424496650696, "reward_std": 0.21315770447254181, "rewards/code_format_reward": 0.9774999976158142, "rewards/code_reward": 0.3760462045669556, "step": 5160, "zero_std_ratio": 0.525 }, { "clip_ratio/high_max": 0.003973034140653908, "clip_ratio/high_mean": 0.0004966292675817385, "clip_ratio/low_mean": 6.530825339723379e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005031600929214619, "completion_length": 99.66499938964844, "epoch": 0.9931802900778023, "grad_norm": 2.5418131351470947, "kl": 0.1747375037521124, "learning_rate": 1.0010626630883432e-07, "loss": 0.003, "reward": 1.421428418159485, "reward_std": 0.09218620862811804, "rewards/code_format_reward": 0.9612499952316285, "rewards/code_reward": 0.4704016923904419, "step": 5170, "zero_std_ratio": 0.675 }, { "clip_ratio/high_max": 0.02674068254418671, "clip_ratio/high_mean": 0.003380646219011396, "clip_ratio/low_mean": 9.682812378741801e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034774743369780483, "completion_length": 92.72500152587891, "epoch": 0.9951013351263087, "grad_norm": 6.10137414932251, "kl": 0.41192906014621256, "learning_rate": 1.0005543007516928e-07, "loss": -0.0051, "reward": 1.5263760328292846, "reward_std": 0.28926219046115875, "rewards/code_format_reward": 0.9899999976158143, "rewards/code_reward": 0.5156879663467407, "step": 5180, "zero_std_ratio": 0.425 }, { "clip_ratio/high_max": 0.1273454572306946, "clip_ratio/high_mean": 0.016399703072966076, "clip_ratio/low_mean": 0.0004187120386632159, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016818415274610744, "completion_length": 81.02250213623047, "epoch": 0.9970223801748151, "grad_norm": 7.77527379989624, "kl": 0.7098278045654297, "learning_rate": 1.0002099153882402e-07, "loss": -0.0041, "reward": 1.6053562879562377, "reward_std": 0.16601394787430762, "rewards/code_format_reward": 0.9824999928474426, "rewards/code_reward": 0.557053166627884, "step": 5190, "zero_std_ratio": 0.6 }, { "clip_ratio/high_max": 0.0028753917664289474, "clip_ratio/high_mean": 0.00045008738234173504, "clip_ratio/low_mean": 0.00016858125454746186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000618668642709963, "completion_length": 95.49250183105468, "epoch": 0.9989434252233215, "grad_norm": 6.507387638092041, "kl": 0.9340068377554417, "learning_rate": 1.0000295195487024e-07, "loss": -0.0018, "reward": 1.4542541027069091, "reward_std": 0.20283248797059059, "rewards/code_format_reward": 0.981249988079071, "rewards/code_reward": 0.4818145722150803, "step": 5200, "zero_std_ratio": 0.55 }, { "clip_ratio/high_max": 0.010171899455599487, "clip_ratio/high_mean": 0.0014317149762064219, "clip_ratio/low_mean": 0.00016592920292168856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015976441791281104, "completion_length": 90.05000305175781, "epoch": 0.999711843242724, "kl": 0.5218422394245863, "reward": 1.0329873859882355, "reward_std": 0.19616412371397018, "rewards/code_format_reward": 0.934374988079071, "rewards/code_reward": 0.28289994597435, "step": 5204, "total_flos": 0.0, "train_loss": 1756184.5472393532, "train_runtime": 149594.4727, "train_samples_per_second": 0.139, "train_steps_per_second": 0.035, "zero_std_ratio": 0.5625 } ], "logging_steps": 10, "max_steps": 5205, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 5, "trial_name": null, "trial_params": null }