mini_sudoku / trainer_state.json
atutej's picture
Model save
3b2dcaa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9955022488755623,
"eval_steps": 500,
"global_step": 166,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.002083333333333337,
"completions/max_length": 414.3,
"completions/max_terminated_length": 390.7,
"completions/mean_length": 96.92292022705078,
"completions/mean_terminated_length": 95.08196258544922,
"completions/min_length": 43.4,
"completions/min_terminated_length": 43.4,
"epoch": 0.05997001499250375,
"frac_reward_zero_std": 0.10000000149011612,
"grad_norm": 0.875,
"kl": 0.04828977584838867,
"learning_rate": 1.9855293386108995e-05,
"loss": -0.0894,
"num_tokens": 164459.0,
"reward": 0.25649446398019793,
"reward_std": 0.12709882631897926,
"rewards/_accuracy_reward/mean": 0.25024444460868833,
"rewards/_accuracy_reward/std": 0.16934245973825454,
"rewards/_format_reward/mean": 0.00625,
"rewards/_format_reward/std": 0.02446230351924896,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.9,
"completions/max_terminated_length": 405.9,
"completions/mean_length": 115.59583740234375,
"completions/mean_terminated_length": 115.59583740234375,
"completions/min_length": 65.6,
"completions/min_terminated_length": 65.6,
"epoch": 0.1199400299850075,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.8359375,
"kl": 0.0818756103515625,
"learning_rate": 1.936044737814273e-05,
"loss": 0.0581,
"num_tokens": 338097.0,
"reward": 0.36916170418262484,
"reward_std": 0.13012803941965104,
"rewards/_accuracy_reward/mean": 0.36916169822216033,
"rewards/_accuracy_reward/std": 0.1557157054543495,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 269.2,
"completions/max_terminated_length": 269.2,
"completions/mean_length": 85.4083351135254,
"completions/mean_terminated_length": 85.4083351135254,
"completions/min_length": 57.7,
"completions/min_terminated_length": 57.7,
"epoch": 0.17991004497751126,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6484375,
"kl": 0.102984619140625,
"learning_rate": 1.8531342035272768e-05,
"loss": 0.0225,
"num_tokens": 497557.0,
"reward": 0.4350260511040688,
"reward_std": 0.14344265162944794,
"rewards/_accuracy_reward/mean": 0.43502604514360427,
"rewards/_accuracy_reward/std": 0.1680240161716938,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.7,
"completions/max_terminated_length": 198.7,
"completions/mean_length": 76.06458435058593,
"completions/mean_terminated_length": 76.06458435058593,
"completions/min_length": 44.7,
"completions/min_terminated_length": 44.7,
"epoch": 0.239880059970015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.63671875,
"kl": 0.11517333984375,
"learning_rate": 1.7397584510798208e-05,
"loss": 0.0064,
"num_tokens": 652196.0,
"reward": 0.4085416719317436,
"reward_std": 0.13231892064213752,
"rewards/_accuracy_reward/mean": 0.4085416689515114,
"rewards/_accuracy_reward/std": 0.1608368895947933,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 265.6,
"completions/max_terminated_length": 265.6,
"completions/mean_length": 89.46666946411133,
"completions/mean_terminated_length": 89.46666946411133,
"completions/min_length": 62.4,
"completions/min_terminated_length": 62.4,
"epoch": 0.29985007496251875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.392578125,
"kl": 0.118402099609375,
"learning_rate": 1.5999661014486956e-05,
"loss": 0.0506,
"num_tokens": 813532.0,
"reward": 0.5038281351327896,
"reward_std": 0.12968316152691842,
"rewards/_accuracy_reward/mean": 0.5038281202316284,
"rewards/_accuracy_reward/std": 0.16127740293741227,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.002083333333333337,
"completions/max_length": 364.6,
"completions/max_terminated_length": 304.4,
"completions/mean_length": 93.76042022705079,
"completions/mean_terminated_length": 91.8341781616211,
"completions/min_length": 62.7,
"completions/min_terminated_length": 62.7,
"epoch": 0.3598200899550225,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.455078125,
"kl": 0.288519287109375,
"learning_rate": 1.4387491059717653e-05,
"loss": 0.0338,
"num_tokens": 976497.0,
"reward": 0.4925865650177002,
"reward_std": 0.12617484703660012,
"rewards/_accuracy_reward/mean": 0.492586562037468,
"rewards/_accuracy_reward/std": 0.15444535091519357,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 265.3,
"completions/max_terminated_length": 265.3,
"completions/mean_length": 85.07291946411132,
"completions/mean_terminated_length": 85.07291946411132,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.4197901049475262,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.451171875,
"kl": 0.089068603515625,
"learning_rate": 1.2618644849608068e-05,
"loss": 0.0247,
"num_tokens": 1135460.0,
"reward": 0.4750963538885117,
"reward_std": 0.12951767966151237,
"rewards/_accuracy_reward/mean": 0.4750963240861893,
"rewards/_accuracy_reward/std": 0.1608477719128132,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 284.3,
"completions/max_terminated_length": 284.3,
"completions/mean_length": 87.73958587646484,
"completions/mean_terminated_length": 87.73958587646484,
"completions/min_length": 69.4,
"completions/min_terminated_length": 69.4,
"epoch": 0.47976011994003,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4140625,
"kl": 0.07869873046875,
"learning_rate": 1.075628745884457e-05,
"loss": 0.0442,
"num_tokens": 1295727.0,
"reward": 0.5100168704986572,
"reward_std": 0.15252956375479698,
"rewards/_accuracy_reward/mean": 0.5100168436765671,
"rewards/_accuracy_reward/std": 0.17259212732315063,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 259.8,
"completions/max_terminated_length": 259.8,
"completions/mean_length": 88.41250228881836,
"completions/mean_terminated_length": 88.41250228881836,
"completions/min_length": 64.1,
"completions/min_terminated_length": 64.1,
"epoch": 0.5397301349325337,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.66796875,
"kl": 0.17242431640625,
"learning_rate": 8.866923223987303e-06,
"loss": 0.027,
"num_tokens": 1456149.0,
"reward": 0.4691749334335327,
"reward_std": 0.11657274290919303,
"rewards/_accuracy_reward/mean": 0.4691749155521393,
"rewards/_accuracy_reward/std": 0.13906535133719444,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 338.8,
"completions/max_terminated_length": 338.8,
"completions/mean_length": 94.3062530517578,
"completions/mean_terminated_length": 94.3062530517578,
"completions/min_length": 59.2,
"completions/min_terminated_length": 59.2,
"epoch": 0.5997001499250375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.59375,
"kl": 0.106280517578125,
"learning_rate": 7.018020889533348e-06,
"loss": -0.0246,
"num_tokens": 1619424.0,
"reward": 0.5008379817008972,
"reward_std": 0.1251222789287567,
"rewards/_accuracy_reward/mean": 0.5008379787206649,
"rewards/_accuracy_reward/std": 0.14559592306613922,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 404.7,
"completions/max_terminated_length": 404.7,
"completions/mean_length": 115.0875030517578,
"completions/mean_terminated_length": 115.0875030517578,
"completions/min_length": 51.4,
"completions/min_terminated_length": 51.4,
"epoch": 0.6596701649175413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.353515625,
"kl": 0.0727294921875,
"learning_rate": 5.2756043152032934e-06,
"loss": -0.0641,
"num_tokens": 1792722.0,
"reward": 0.4739863067865372,
"reward_std": 0.13204658553004264,
"rewards/_accuracy_reward/mean": 0.47398627996444703,
"rewards/_accuracy_reward/std": 0.15842494517564773,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.002083333333333337,
"completions/max_length": 517.4,
"completions/max_terminated_length": 435.3,
"completions/mean_length": 136.24375381469727,
"completions/mean_terminated_length": 134.30962219238282,
"completions/min_length": 51.6,
"completions/min_terminated_length": 51.6,
"epoch": 0.719640179910045,
"frac_reward_zero_std": 0.01666666716337204,
"grad_norm": 0.279296875,
"kl": 0.0670654296875,
"learning_rate": 3.7018947797172864e-06,
"loss": -0.0695,
"num_tokens": 1976103.0,
"reward": 0.5253316760063171,
"reward_std": 0.12880267389118671,
"rewards/_accuracy_reward/mean": 0.5253316521644592,
"rewards/_accuracy_reward/std": 0.17076537311077117,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 522.1,
"completions/max_terminated_length": 522.1,
"completions/mean_length": 152.3375045776367,
"completions/mean_terminated_length": 152.3375045776367,
"completions/min_length": 46.7,
"completions/min_terminated_length": 46.7,
"epoch": 0.7796101949025487,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.298828125,
"kl": 0.063446044921875,
"learning_rate": 2.353089073828255e-06,
"loss": -0.0654,
"num_tokens": 2167713.0,
"reward": 0.48647034764289854,
"reward_std": 0.1461639277637005,
"rewards/_accuracy_reward/mean": 0.48647033274173734,
"rewards/_accuracy_reward/std": 0.17206955328583717,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.002083333333333337,
"completions/max_length": 562.7,
"completions/max_terminated_length": 500.5,
"completions/mean_length": 158.33542022705078,
"completions/mean_terminated_length": 156.5459243774414,
"completions/min_length": 50.7,
"completions/min_terminated_length": 50.7,
"epoch": 0.8395802098950524,
"frac_reward_zero_std": 0.03333333432674408,
"grad_norm": 0.5,
"kl": 0.06429443359375,
"learning_rate": 1.2773527263780626e-06,
"loss": -0.0735,
"num_tokens": 2362058.0,
"reward": 0.4694186806678772,
"reward_std": 0.13372117429971694,
"rewards/_accuracy_reward/mean": 0.46941866278648375,
"rewards/_accuracy_reward/std": 0.1658677004277706,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004166666666666674,
"completions/max_length": 636.7,
"completions/max_terminated_length": 532.8,
"completions/mean_length": 168.87708892822266,
"completions/mean_terminated_length": 165.28546447753905,
"completions/min_length": 42.2,
"completions/min_terminated_length": 42.2,
"epoch": 0.8995502248875562,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.232421875,
"kl": 0.085772705078125,
"learning_rate": 5.131000247938367e-07,
"loss": -0.074,
"num_tokens": 2561559.0,
"reward": 0.5166842222213746,
"reward_std": 0.15563009977340697,
"rewards/_accuracy_reward/mean": 0.5166841924190522,
"rewards/_accuracy_reward/std": 0.1897404298186302,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.004166666666666674,
"completions/max_length": 578.3,
"completions/max_terminated_length": 469.4,
"completions/mean_length": 176.32083740234376,
"completions/mean_terminated_length": 172.7707046508789,
"completions/min_length": 57.6,
"completions/min_terminated_length": 57.6,
"epoch": 0.95952023988006,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.310546875,
"kl": 0.0620758056640625,
"learning_rate": 8.762225008062675e-08,
"loss": -0.0572,
"num_tokens": 2764345.0,
"reward": 0.4916923582553864,
"reward_std": 0.14141732677817345,
"rewards/_accuracy_reward/mean": 0.49169233739376067,
"rewards/_accuracy_reward/std": 0.17603871822357178,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.6666666666667,
"completions/max_terminated_length": 447.6666666666667,
"completions/mean_length": 160.1840337117513,
"completions/mean_terminated_length": 160.1840337117513,
"completions/min_length": 49.333333333333336,
"completions/min_terminated_length": 49.333333333333336,
"epoch": 0.9955022488755623,
"frac_reward_zero_std": 0.0,
"kl": 0.069488525390625,
"num_tokens": 2881422.0,
"reward": 0.49811801811059314,
"reward_std": 0.12225283433993657,
"rewards/_accuracy_reward/mean": 0.4981180081764857,
"rewards/_accuracy_reward/std": 0.14967897906899452,
"rewards/_format_reward/mean": 0.0,
"rewards/_format_reward/std": 0.0,
"step": 166,
"total_flos": 0.0,
"train_loss": -0.017725162387612355,
"train_runtime": 3869.3247,
"train_samples_per_second": 0.258,
"train_steps_per_second": 0.043
}
],
"logging_steps": 10,
"max_steps": 166,
"num_input_tokens_seen": 2881422,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}