{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.994350282485876,
"eval_steps": 500,
"global_step": 795,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03766478342749529,
"grad_norm": 2.263125496637036,
"learning_rate": 5e-06,
"loss": 0.8213,
"step": 10
},
{
"epoch": 0.07532956685499058,
"grad_norm": 2.189120038314998,
"learning_rate": 5e-06,
"loss": 0.725,
"step": 20
},
{
"epoch": 0.11299435028248588,
"grad_norm": 2.7377229620997587,
"learning_rate": 5e-06,
"loss": 0.6984,
"step": 30
},
{
"epoch": 0.15065913370998116,
"grad_norm": 1.6393030705097076,
"learning_rate": 5e-06,
"loss": 0.6949,
"step": 40
},
{
"epoch": 0.18832391713747645,
"grad_norm": 1.0398712245575565,
"learning_rate": 5e-06,
"loss": 0.6683,
"step": 50
},
{
"epoch": 0.22598870056497175,
"grad_norm": 0.8808957646364496,
"learning_rate": 5e-06,
"loss": 0.6657,
"step": 60
},
{
"epoch": 0.263653483992467,
"grad_norm": 0.7566447407586742,
"learning_rate": 5e-06,
"loss": 0.6496,
"step": 70
},
{
"epoch": 0.3013182674199623,
"grad_norm": 0.7117741359385918,
"learning_rate": 5e-06,
"loss": 0.6458,
"step": 80
},
{
"epoch": 0.3389830508474576,
"grad_norm": 0.7784154878802825,
"learning_rate": 5e-06,
"loss": 0.643,
"step": 90
},
{
"epoch": 0.3766478342749529,
"grad_norm": 1.2764054406718297,
"learning_rate": 5e-06,
"loss": 0.634,
"step": 100
},
{
"epoch": 0.4143126177024482,
"grad_norm": 0.7389123630080362,
"learning_rate": 5e-06,
"loss": 0.642,
"step": 110
},
{
"epoch": 0.4519774011299435,
"grad_norm": 0.5996098331338064,
"learning_rate": 5e-06,
"loss": 0.6213,
"step": 120
},
{
"epoch": 0.4896421845574388,
"grad_norm": 1.1501035575220573,
"learning_rate": 5e-06,
"loss": 0.6249,
"step": 130
},
{
"epoch": 0.527306967984934,
"grad_norm": 0.6904388049559987,
"learning_rate": 5e-06,
"loss": 0.6243,
"step": 140
},
{
"epoch": 0.5649717514124294,
"grad_norm": 0.783351095580665,
"learning_rate": 5e-06,
"loss": 0.621,
"step": 150
},
{
"epoch": 0.6026365348399246,
"grad_norm": 0.5318412680049267,
"learning_rate": 5e-06,
"loss": 0.6296,
"step": 160
},
{
"epoch": 0.64030131826742,
"grad_norm": 1.1247908870238332,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 170
},
{
"epoch": 0.6779661016949152,
"grad_norm": 0.5383437573904913,
"learning_rate": 5e-06,
"loss": 0.6174,
"step": 180
},
{
"epoch": 0.7156308851224106,
"grad_norm": 2.5733659531838198,
"learning_rate": 5e-06,
"loss": 0.6193,
"step": 190
},
{
"epoch": 0.7532956685499058,
"grad_norm": 0.7831306502565981,
"learning_rate": 5e-06,
"loss": 0.6127,
"step": 200
},
{
"epoch": 0.7909604519774012,
"grad_norm": 0.6934442696862589,
"learning_rate": 5e-06,
"loss": 0.6244,
"step": 210
},
{
"epoch": 0.8286252354048964,
"grad_norm": 0.6677867481758228,
"learning_rate": 5e-06,
"loss": 0.613,
"step": 220
},
{
"epoch": 0.8662900188323918,
"grad_norm": 0.4859701739274024,
"learning_rate": 5e-06,
"loss": 0.6101,
"step": 230
},
{
"epoch": 0.903954802259887,
"grad_norm": 0.8722337210188531,
"learning_rate": 5e-06,
"loss": 0.606,
"step": 240
},
{
"epoch": 0.9416195856873822,
"grad_norm": 1.9266628990003756,
"learning_rate": 5e-06,
"loss": 0.6019,
"step": 250
},
{
"epoch": 0.9792843691148776,
"grad_norm": 1.056076074715482,
"learning_rate": 5e-06,
"loss": 0.6112,
"step": 260
},
{
"epoch": 0.9981167608286252,
"eval_loss": 0.605501651763916,
"eval_runtime": 91.0085,
"eval_samples_per_second": 78.586,
"eval_steps_per_second": 0.615,
"step": 265
},
{
"epoch": 1.0169491525423728,
"grad_norm": 0.8391047454772584,
"learning_rate": 5e-06,
"loss": 0.5915,
"step": 270
},
{
"epoch": 1.054613935969868,
"grad_norm": 0.834651626730102,
"learning_rate": 5e-06,
"loss": 0.5646,
"step": 280
},
{
"epoch": 1.0922787193973635,
"grad_norm": 0.7450681825170591,
"learning_rate": 5e-06,
"loss": 0.5518,
"step": 290
},
{
"epoch": 1.1299435028248588,
"grad_norm": 0.590980176111281,
"learning_rate": 5e-06,
"loss": 0.5601,
"step": 300
},
{
"epoch": 1.167608286252354,
"grad_norm": 0.9385625642802127,
"learning_rate": 5e-06,
"loss": 0.552,
"step": 310
},
{
"epoch": 1.2052730696798493,
"grad_norm": 0.8126168794552087,
"learning_rate": 5e-06,
"loss": 0.558,
"step": 320
},
{
"epoch": 1.2429378531073447,
"grad_norm": 0.677905810554928,
"learning_rate": 5e-06,
"loss": 0.565,
"step": 330
},
{
"epoch": 1.28060263653484,
"grad_norm": 0.6544626057539239,
"learning_rate": 5e-06,
"loss": 0.5582,
"step": 340
},
{
"epoch": 1.3182674199623352,
"grad_norm": 0.8524924080405836,
"learning_rate": 5e-06,
"loss": 0.5602,
"step": 350
},
{
"epoch": 1.3559322033898304,
"grad_norm": 0.4907188308076832,
"learning_rate": 5e-06,
"loss": 0.5607,
"step": 360
},
{
"epoch": 1.3935969868173257,
"grad_norm": 0.53907446375581,
"learning_rate": 5e-06,
"loss": 0.5547,
"step": 370
},
{
"epoch": 1.4312617702448212,
"grad_norm": 0.5927028384991923,
"learning_rate": 5e-06,
"loss": 0.5541,
"step": 380
},
{
"epoch": 1.4689265536723164,
"grad_norm": 0.7128973727870778,
"learning_rate": 5e-06,
"loss": 0.5528,
"step": 390
},
{
"epoch": 1.5065913370998116,
"grad_norm": 0.49840825439685243,
"learning_rate": 5e-06,
"loss": 0.5668,
"step": 400
},
{
"epoch": 1.544256120527307,
"grad_norm": 0.5370743335720791,
"learning_rate": 5e-06,
"loss": 0.5575,
"step": 410
},
{
"epoch": 1.5819209039548023,
"grad_norm": 0.6150871895812915,
"learning_rate": 5e-06,
"loss": 0.5597,
"step": 420
},
{
"epoch": 1.6195856873822976,
"grad_norm": 0.563194743905304,
"learning_rate": 5e-06,
"loss": 0.5592,
"step": 430
},
{
"epoch": 1.6572504708097928,
"grad_norm": 0.5119581124907059,
"learning_rate": 5e-06,
"loss": 0.5621,
"step": 440
},
{
"epoch": 1.694915254237288,
"grad_norm": 0.5352254655513019,
"learning_rate": 5e-06,
"loss": 0.5541,
"step": 450
},
{
"epoch": 1.7325800376647833,
"grad_norm": 0.6077433771903062,
"learning_rate": 5e-06,
"loss": 0.5563,
"step": 460
},
{
"epoch": 1.7702448210922788,
"grad_norm": 0.562877694142977,
"learning_rate": 5e-06,
"loss": 0.555,
"step": 470
},
{
"epoch": 1.807909604519774,
"grad_norm": 0.5453089094350608,
"learning_rate": 5e-06,
"loss": 0.5465,
"step": 480
},
{
"epoch": 1.8455743879472695,
"grad_norm": 0.5709862620082578,
"learning_rate": 5e-06,
"loss": 0.5592,
"step": 490
},
{
"epoch": 1.8832391713747647,
"grad_norm": 0.49785144147435545,
"learning_rate": 5e-06,
"loss": 0.5563,
"step": 500
},
{
"epoch": 1.92090395480226,
"grad_norm": 0.48543855573710365,
"learning_rate": 5e-06,
"loss": 0.5552,
"step": 510
},
{
"epoch": 1.9585687382297552,
"grad_norm": 0.5180932799655572,
"learning_rate": 5e-06,
"loss": 0.5571,
"step": 520
},
{
"epoch": 1.9962335216572504,
"grad_norm": 0.5674984350650156,
"learning_rate": 5e-06,
"loss": 0.5554,
"step": 530
},
{
"epoch": 2.0,
"eval_loss": 0.5974339842796326,
"eval_runtime": 92.2503,
"eval_samples_per_second": 77.528,
"eval_steps_per_second": 0.607,
"step": 531
},
{
"epoch": 2.0338983050847457,
"grad_norm": 0.6380443072327275,
"learning_rate": 5e-06,
"loss": 0.5074,
"step": 540
},
{
"epoch": 2.071563088512241,
"grad_norm": 0.7526012751703193,
"learning_rate": 5e-06,
"loss": 0.5056,
"step": 550
},
{
"epoch": 2.109227871939736,
"grad_norm": 0.601125683400543,
"learning_rate": 5e-06,
"loss": 0.5081,
"step": 560
},
{
"epoch": 2.146892655367232,
"grad_norm": 0.5412801866050161,
"learning_rate": 5e-06,
"loss": 0.4964,
"step": 570
},
{
"epoch": 2.184557438794727,
"grad_norm": 0.6605525778778812,
"learning_rate": 5e-06,
"loss": 0.4924,
"step": 580
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.5634126387252626,
"learning_rate": 5e-06,
"loss": 0.5017,
"step": 590
},
{
"epoch": 2.2598870056497176,
"grad_norm": 0.5612826370434433,
"learning_rate": 5e-06,
"loss": 0.507,
"step": 600
},
{
"epoch": 2.297551789077213,
"grad_norm": 0.5863149934883163,
"learning_rate": 5e-06,
"loss": 0.4966,
"step": 610
},
{
"epoch": 2.335216572504708,
"grad_norm": 0.5234770461125302,
"learning_rate": 5e-06,
"loss": 0.504,
"step": 620
},
{
"epoch": 2.3728813559322033,
"grad_norm": 0.6459395940002383,
"learning_rate": 5e-06,
"loss": 0.5026,
"step": 630
},
{
"epoch": 2.4105461393596985,
"grad_norm": 0.6027956338487243,
"learning_rate": 5e-06,
"loss": 0.5025,
"step": 640
},
{
"epoch": 2.4482109227871938,
"grad_norm": 0.5328974338222766,
"learning_rate": 5e-06,
"loss": 0.5003,
"step": 650
},
{
"epoch": 2.4858757062146895,
"grad_norm": 0.6107575449426592,
"learning_rate": 5e-06,
"loss": 0.5009,
"step": 660
},
{
"epoch": 2.5235404896421847,
"grad_norm": 0.6193028412595688,
"learning_rate": 5e-06,
"loss": 0.5068,
"step": 670
},
{
"epoch": 2.56120527306968,
"grad_norm": 0.5313172697707192,
"learning_rate": 5e-06,
"loss": 0.5087,
"step": 680
},
{
"epoch": 2.598870056497175,
"grad_norm": 0.6705815338360445,
"learning_rate": 5e-06,
"loss": 0.5072,
"step": 690
},
{
"epoch": 2.6365348399246704,
"grad_norm": 0.5631108090258757,
"learning_rate": 5e-06,
"loss": 0.5053,
"step": 700
},
{
"epoch": 2.6741996233521657,
"grad_norm": 0.6409277069423337,
"learning_rate": 5e-06,
"loss": 0.503,
"step": 710
},
{
"epoch": 2.711864406779661,
"grad_norm": 0.5852444630897177,
"learning_rate": 5e-06,
"loss": 0.5099,
"step": 720
},
{
"epoch": 2.7495291902071566,
"grad_norm": 0.6554053610190018,
"learning_rate": 5e-06,
"loss": 0.5149,
"step": 730
},
{
"epoch": 2.7871939736346514,
"grad_norm": 0.6563071365261379,
"learning_rate": 5e-06,
"loss": 0.5018,
"step": 740
},
{
"epoch": 2.824858757062147,
"grad_norm": 0.5582449045429995,
"learning_rate": 5e-06,
"loss": 0.5103,
"step": 750
},
{
"epoch": 2.8625235404896423,
"grad_norm": 0.5062040173398443,
"learning_rate": 5e-06,
"loss": 0.5063,
"step": 760
},
{
"epoch": 2.9001883239171375,
"grad_norm": 0.6071759917390698,
"learning_rate": 5e-06,
"loss": 0.5003,
"step": 770
},
{
"epoch": 2.937853107344633,
"grad_norm": 0.5606403524855348,
"learning_rate": 5e-06,
"loss": 0.512,
"step": 780
},
{
"epoch": 2.975517890772128,
"grad_norm": 0.6859712101741441,
"learning_rate": 5e-06,
"loss": 0.5025,
"step": 790
},
{
"epoch": 2.994350282485876,
"eval_loss": 0.6022372245788574,
"eval_runtime": 89.8413,
"eval_samples_per_second": 79.607,
"eval_steps_per_second": 0.623,
"step": 795
},
{
"epoch": 2.994350282485876,
"step": 795,
"total_flos": 1331235850813440.0,
"train_loss": 0.5684360762062313,
"train_runtime": 14109.8059,
"train_samples_per_second": 28.892,
"train_steps_per_second": 0.056
}
],
"logging_steps": 10,
"max_steps": 795,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1331235850813440.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}