{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.187560425394779,
"eval_steps": 100,
"global_step": 2400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12890750886239125,
"grad_norm": 5.744519798404323,
"learning_rate": 1e-05,
"loss": 3.8898,
"step": 50
},
{
"epoch": 0.2578150177247825,
"grad_norm": 7.663336118255639,
"learning_rate": 9.999766822485166e-06,
"loss": 2.7367,
"step": 100
},
{
"epoch": 0.2578150177247825,
"eval_loss": 2.547008991241455,
"eval_runtime": 13.6928,
"eval_samples_per_second": 73.031,
"eval_steps_per_second": 2.337,
"step": 100
},
{
"epoch": 0.3867225265871737,
"grad_norm": 6.687747513820881,
"learning_rate": 9.999067314105889e-06,
"loss": 2.4915,
"step": 150
},
{
"epoch": 0.515630035449565,
"grad_norm": 3.3619995730848027,
"learning_rate": 9.997901547355329e-06,
"loss": 2.3844,
"step": 200
},
{
"epoch": 0.515630035449565,
"eval_loss": 2.2610087394714355,
"eval_runtime": 13.0766,
"eval_samples_per_second": 76.472,
"eval_steps_per_second": 2.447,
"step": 200
},
{
"epoch": 0.6445375443119562,
"grad_norm": 5.032691180052785,
"learning_rate": 9.996269643047091e-06,
"loss": 2.2534,
"step": 250
},
{
"epoch": 0.7734450531743474,
"grad_norm": 3.975968794985624,
"learning_rate": 9.99417177030268e-06,
"loss": 2.1781,
"step": 300
},
{
"epoch": 0.7734450531743474,
"eval_loss": 2.0959012508392334,
"eval_runtime": 13.0524,
"eval_samples_per_second": 76.614,
"eval_steps_per_second": 2.452,
"step": 300
},
{
"epoch": 0.9023525620367386,
"grad_norm": 3.795190451013864,
"learning_rate": 9.991608146533984e-06,
"loss": 2.1342,
"step": 350
},
{
"epoch": 1.03126007089913,
"grad_norm": 3.850079627270192,
"learning_rate": 9.988579037420745e-06,
"loss": 2.0796,
"step": 400
},
{
"epoch": 1.03126007089913,
"eval_loss": 2.029681921005249,
"eval_runtime": 13.0552,
"eval_samples_per_second": 76.598,
"eval_steps_per_second": 2.451,
"step": 400
},
{
"epoch": 1.1601675797615212,
"grad_norm": 2.0860401247438722,
"learning_rate": 9.985084756883026e-06,
"loss": 2.0583,
"step": 450
},
{
"epoch": 1.2890750886239124,
"grad_norm": 1.7587705935004414,
"learning_rate": 9.98112566704867e-06,
"loss": 2.0388,
"step": 500
},
{
"epoch": 1.2890750886239124,
"eval_loss": 1.9869602918624878,
"eval_runtime": 13.1308,
"eval_samples_per_second": 76.157,
"eval_steps_per_second": 2.437,
"step": 500
},
{
"epoch": 1.4179825974863036,
"grad_norm": 2.675426697648005,
"learning_rate": 9.97670217821578e-06,
"loss": 2.0418,
"step": 550
},
{
"epoch": 1.5468901063486948,
"grad_norm": 2.1347946648640694,
"learning_rate": 9.971814748810192e-06,
"loss": 2.0001,
"step": 600
},
{
"epoch": 1.5468901063486948,
"eval_loss": 1.963959813117981,
"eval_runtime": 13.0814,
"eval_samples_per_second": 76.444,
"eval_steps_per_second": 2.446,
"step": 600
},
{
"epoch": 1.675797615211086,
"grad_norm": 2.8870637538075505,
"learning_rate": 9.96646388533797e-06,
"loss": 1.9743,
"step": 650
},
{
"epoch": 1.8047051240734773,
"grad_norm": 4.178579445918737,
"learning_rate": 9.960650142332914e-06,
"loss": 1.9717,
"step": 700
},
{
"epoch": 1.8047051240734773,
"eval_loss": 1.9353902339935303,
"eval_runtime": 13.1146,
"eval_samples_per_second": 76.251,
"eval_steps_per_second": 2.44,
"step": 700
},
{
"epoch": 1.9336126329358685,
"grad_norm": 2.199357001490578,
"learning_rate": 9.954374122299082e-06,
"loss": 1.9634,
"step": 750
},
{
"epoch": 2.06252014179826,
"grad_norm": 2.166363542050421,
"learning_rate": 9.947636475648373e-06,
"loss": 1.9475,
"step": 800
},
{
"epoch": 2.06252014179826,
"eval_loss": 1.9235832691192627,
"eval_runtime": 13.0977,
"eval_samples_per_second": 76.349,
"eval_steps_per_second": 2.443,
"step": 800
},
{
"epoch": 2.191427650660651,
"grad_norm": 2.467046686495788,
"learning_rate": 9.940437900633096e-06,
"loss": 1.9309,
"step": 850
},
{
"epoch": 2.3203351595230424,
"grad_norm": 2.855139647937512,
"learning_rate": 9.932779143273619e-06,
"loss": 1.9347,
"step": 900
},
{
"epoch": 2.3203351595230424,
"eval_loss": 1.9101444482803345,
"eval_runtime": 13.1292,
"eval_samples_per_second": 76.166,
"eval_steps_per_second": 2.437,
"step": 900
},
{
"epoch": 2.4492426683854336,
"grad_norm": 1.9014562925621068,
"learning_rate": 9.92466099728106e-06,
"loss": 1.9278,
"step": 950
},
{
"epoch": 2.578150177247825,
"grad_norm": 1.3665703880188187,
"learning_rate": 9.91608430397502e-06,
"loss": 1.9157,
"step": 1000
},
{
"epoch": 2.578150177247825,
"eval_loss": 1.8934762477874756,
"eval_runtime": 13.0864,
"eval_samples_per_second": 76.415,
"eval_steps_per_second": 2.445,
"step": 1000
},
{
"epoch": 2.707057686110216,
"grad_norm": 1.2319877018351602,
"learning_rate": 9.907049952196403e-06,
"loss": 1.9105,
"step": 1050
},
{
"epoch": 2.8359651949726072,
"grad_norm": 1.5481172638750316,
"learning_rate": 9.897558878215295e-06,
"loss": 1.907,
"step": 1100
},
{
"epoch": 2.8359651949726072,
"eval_loss": 1.8839563131332397,
"eval_runtime": 13.153,
"eval_samples_per_second": 76.028,
"eval_steps_per_second": 2.433,
"step": 1100
},
{
"epoch": 2.9648727038349985,
"grad_norm": 2.045794673210496,
"learning_rate": 9.887612065633936e-06,
"loss": 1.8945,
"step": 1150
},
{
"epoch": 3.0937802126973897,
"grad_norm": 1.1757006962680892,
"learning_rate": 9.877210545284792e-06,
"loss": 1.888,
"step": 1200
},
{
"epoch": 3.0937802126973897,
"eval_loss": 1.874881386756897,
"eval_runtime": 13.1346,
"eval_samples_per_second": 76.135,
"eval_steps_per_second": 2.436,
"step": 1200
},
{
"epoch": 3.222687721559781,
"grad_norm": 4.950973442850626,
"learning_rate": 9.86635539512371e-06,
"loss": 1.8811,
"step": 1250
},
{
"epoch": 3.351595230422172,
"grad_norm": 1.8124376039297767,
"learning_rate": 9.855047740118221e-06,
"loss": 1.876,
"step": 1300
},
{
"epoch": 3.351595230422172,
"eval_loss": 1.86701238155365,
"eval_runtime": 13.0772,
"eval_samples_per_second": 76.469,
"eval_steps_per_second": 2.447,
"step": 1300
},
{
"epoch": 3.4805027392845633,
"grad_norm": 1.0286338189260262,
"learning_rate": 9.843288752130942e-06,
"loss": 1.8683,
"step": 1350
},
{
"epoch": 3.6094102481469545,
"grad_norm": 1.9894455905335977,
"learning_rate": 9.831079649798138e-06,
"loss": 1.8731,
"step": 1400
},
{
"epoch": 3.6094102481469545,
"eval_loss": 1.8612475395202637,
"eval_runtime": 13.1177,
"eval_samples_per_second": 76.233,
"eval_steps_per_second": 2.439,
"step": 1400
},
{
"epoch": 3.7383177570093458,
"grad_norm": 1.7685619948548048,
"learning_rate": 9.818421698403429e-06,
"loss": 1.8648,
"step": 1450
},
{
"epoch": 3.867225265871737,
"grad_norm": 1.5478355130519232,
"learning_rate": 9.805316209746655e-06,
"loss": 1.8665,
"step": 1500
},
{
"epoch": 3.867225265871737,
"eval_loss": 1.8537719249725342,
"eval_runtime": 13.1085,
"eval_samples_per_second": 76.286,
"eval_steps_per_second": 2.441,
"step": 1500
},
{
"epoch": 3.996132774734128,
"grad_norm": 2.1355502302510447,
"learning_rate": 9.791764542007945e-06,
"loss": 1.8655,
"step": 1550
},
{
"epoch": 4.12504028359652,
"grad_norm": 1.1480708382569662,
"learning_rate": 9.777768099606938e-06,
"loss": 1.8346,
"step": 1600
},
{
"epoch": 4.12504028359652,
"eval_loss": 1.8495159149169922,
"eval_runtime": 13.1242,
"eval_samples_per_second": 76.195,
"eval_steps_per_second": 2.438,
"step": 1600
},
{
"epoch": 4.253947792458911,
"grad_norm": 1.4360213593754323,
"learning_rate": 9.763328333057263e-06,
"loss": 1.8265,
"step": 1650
},
{
"epoch": 4.382855301321302,
"grad_norm": 1.8676589580299272,
"learning_rate": 9.748446738816201e-06,
"loss": 1.8391,
"step": 1700
},
{
"epoch": 4.382855301321302,
"eval_loss": 1.8443361520767212,
"eval_runtime": 13.1311,
"eval_samples_per_second": 76.155,
"eval_steps_per_second": 2.437,
"step": 1700
},
{
"epoch": 4.5117628101836935,
"grad_norm": 1.2690786695530565,
"learning_rate": 9.733124859129598e-06,
"loss": 1.8434,
"step": 1750
},
{
"epoch": 4.640670319046085,
"grad_norm": 1.0388627604335856,
"learning_rate": 9.717364281872047e-06,
"loss": 1.842,
"step": 1800
},
{
"epoch": 4.640670319046085,
"eval_loss": 1.8362445831298828,
"eval_runtime": 13.1151,
"eval_samples_per_second": 76.248,
"eval_steps_per_second": 2.44,
"step": 1800
},
{
"epoch": 4.769577827908476,
"grad_norm": 1.8674582891348575,
"learning_rate": 9.701166640382317e-06,
"loss": 1.8308,
"step": 1850
},
{
"epoch": 4.898485336770867,
"grad_norm": 1.4621440377933717,
"learning_rate": 9.684533613294096e-06,
"loss": 1.8382,
"step": 1900
},
{
"epoch": 4.898485336770867,
"eval_loss": 1.831936240196228,
"eval_runtime": 13.0768,
"eval_samples_per_second": 76.472,
"eval_steps_per_second": 2.447,
"step": 1900
},
{
"epoch": 5.027392845633258,
"grad_norm": 1.0861177702948985,
"learning_rate": 9.667466924362013e-06,
"loss": 1.8308,
"step": 1950
},
{
"epoch": 5.15630035449565,
"grad_norm": 1.8095526457252593,
"learning_rate": 9.649968342283005e-06,
"loss": 1.8161,
"step": 2000
},
{
"epoch": 5.15630035449565,
"eval_loss": 1.829033374786377,
"eval_runtime": 13.067,
"eval_samples_per_second": 76.529,
"eval_steps_per_second": 2.449,
"step": 2000
},
{
"epoch": 5.285207863358041,
"grad_norm": 1.518641242728159,
"learning_rate": 9.632039680513024e-06,
"loss": 1.8007,
"step": 2050
},
{
"epoch": 5.414115372220432,
"grad_norm": 1.0542508895398046,
"learning_rate": 9.613682797079086e-06,
"loss": 1.7999,
"step": 2100
},
{
"epoch": 5.414115372220432,
"eval_loss": 1.8255321979522705,
"eval_runtime": 13.0764,
"eval_samples_per_second": 76.474,
"eval_steps_per_second": 2.447,
"step": 2100
},
{
"epoch": 5.543022881082823,
"grad_norm": 1.9568398940474616,
"learning_rate": 9.594899594386732e-06,
"loss": 1.8189,
"step": 2150
},
{
"epoch": 5.6719303899452145,
"grad_norm": 0.8933581069243784,
"learning_rate": 9.57569201902286e-06,
"loss": 1.8066,
"step": 2200
},
{
"epoch": 5.6719303899452145,
"eval_loss": 1.8212575912475586,
"eval_runtime": 13.0944,
"eval_samples_per_second": 76.369,
"eval_steps_per_second": 2.444,
"step": 2200
},
{
"epoch": 5.800837898807606,
"grad_norm": 0.7789642769876175,
"learning_rate": 9.556062061553995e-06,
"loss": 1.8068,
"step": 2250
},
{
"epoch": 5.929745407669997,
"grad_norm": 1.0394842073428503,
"learning_rate": 9.536011756320011e-06,
"loss": 1.8165,
"step": 2300
},
{
"epoch": 5.929745407669997,
"eval_loss": 1.8185018301010132,
"eval_runtime": 13.0777,
"eval_samples_per_second": 76.466,
"eval_steps_per_second": 2.447,
"step": 2300
},
{
"epoch": 6.058652916532388,
"grad_norm": 1.9360605440141716,
"learning_rate": 9.515543181223277e-06,
"loss": 1.7866,
"step": 2350
},
{
"epoch": 6.187560425394779,
"grad_norm": 1.2520661239107576,
"learning_rate": 9.494658457513341e-06,
"loss": 1.7824,
"step": 2400
},
{
"epoch": 6.187560425394779,
"eval_loss": 1.8156663179397583,
"eval_runtime": 13.1655,
"eval_samples_per_second": 75.956,
"eval_steps_per_second": 2.431,
"step": 2400
}
],
"logging_steps": 50,
"max_steps": 15480,
"num_input_tokens_seen": 0,
"num_train_epochs": 40,
"save_steps": 800,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2134166405644288.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}