{
  "best_metric": 1.6244161128997803,
  "best_model_checkpoint": "./output/checkpoints/2024-06-11_10-58-33/checkpoint-30",
  "epoch": 1.0,
  "eval_steps": 1,
  "global_step": 37,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02702702702702703,
      "grad_norm": 3.2274646759033203,
      "learning_rate": 0.0001,
      "loss": 5.7374,
      "step": 1
    },
    {
      "epoch": 0.02702702702702703,
      "eval_loss": 5.6431050300598145,
      "eval_runtime": 10.9746,
      "eval_samples_per_second": 11.299,
      "eval_steps_per_second": 0.729,
      "step": 1
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 3.1417839527130127,
      "learning_rate": 0.0002,
      "loss": 5.6423,
      "step": 2
    },
    {
      "epoch": 0.05405405405405406,
      "eval_loss": 5.1099138259887695,
      "eval_runtime": 11.0219,
      "eval_samples_per_second": 11.25,
      "eval_steps_per_second": 0.726,
      "step": 2
    },
    {
      "epoch": 0.08108108108108109,
      "grad_norm": 3.1990153789520264,
      "learning_rate": 0.00030000000000000003,
      "loss": 5.0948,
      "step": 3
    },
    {
      "epoch": 0.08108108108108109,
      "eval_loss": 3.559605836868286,
      "eval_runtime": 11.0991,
      "eval_samples_per_second": 11.172,
      "eval_steps_per_second": 0.721,
      "step": 3
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 3.2903366088867188,
      "learning_rate": 0.0004,
      "loss": 3.4375,
      "step": 4
    },
    {
      "epoch": 0.10810810810810811,
      "eval_loss": 2.3610196113586426,
      "eval_runtime": 11.0573,
      "eval_samples_per_second": 11.214,
      "eval_steps_per_second": 0.724,
      "step": 4
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 1.878879189491272,
      "learning_rate": 0.0003878787878787879,
      "loss": 2.2693,
      "step": 5
    },
    {
      "epoch": 0.13513513513513514,
      "eval_loss": 1.8543975353240967,
      "eval_runtime": 11.1541,
      "eval_samples_per_second": 11.117,
      "eval_steps_per_second": 0.717,
      "step": 5
    },
    {
      "epoch": 0.16216216216216217,
      "grad_norm": 1.2040495872497559,
      "learning_rate": 0.0003757575757575758,
      "loss": 1.7546,
      "step": 6
    },
    {
      "epoch": 0.16216216216216217,
      "eval_loss": 1.7222554683685303,
      "eval_runtime": 11.0742,
      "eval_samples_per_second": 11.197,
      "eval_steps_per_second": 0.722,
      "step": 6
    },
    {
      "epoch": 0.1891891891891892,
      "grad_norm": 1.080614447593689,
      "learning_rate": 0.00036363636363636367,
      "loss": 1.6633,
      "step": 7
    },
    {
      "epoch": 0.1891891891891892,
      "eval_loss": 1.610931158065796,
      "eval_runtime": 11.0872,
      "eval_samples_per_second": 11.184,
      "eval_steps_per_second": 0.722,
      "step": 7
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 0.28874385356903076,
      "learning_rate": 0.00035151515151515155,
      "loss": 1.5122,
      "step": 8
    },
    {
      "epoch": 0.21621621621621623,
      "eval_loss": 1.5804481506347656,
      "eval_runtime": 11.053,
      "eval_samples_per_second": 11.219,
      "eval_steps_per_second": 0.724,
      "step": 8
    },
    {
      "epoch": 0.24324324324324326,
      "grad_norm": 0.32991790771484375,
      "learning_rate": 0.00033939393939393943,
      "loss": 1.4316,
      "step": 9
    },
    {
      "epoch": 0.24324324324324326,
      "eval_loss": 1.5746935606002808,
      "eval_runtime": 11.152,
      "eval_samples_per_second": 11.119,
      "eval_steps_per_second": 0.717,
      "step": 9
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 0.5137693285942078,
      "learning_rate": 0.0003272727272727273,
      "loss": 1.3161,
      "step": 10
    },
    {
      "epoch": 0.2702702702702703,
      "eval_loss": 1.651396632194519,
      "eval_runtime": 11.1562,
      "eval_samples_per_second": 11.115,
      "eval_steps_per_second": 0.717,
      "step": 10
    },
    {
      "epoch": 0.2972972972972973,
      "grad_norm": 0.25246673822402954,
      "learning_rate": 0.00031515151515151515,
      "loss": 1.207,
      "step": 11
    },
    {
      "epoch": 0.2972972972972973,
      "eval_loss": 1.7246230840682983,
      "eval_runtime": 11.1298,
      "eval_samples_per_second": 11.141,
      "eval_steps_per_second": 0.719,
      "step": 11
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 0.2032381296157837,
      "learning_rate": 0.00030303030303030303,
      "loss": 1.158,
      "step": 12
    },
    {
      "epoch": 0.32432432432432434,
      "eval_loss": 1.7255425453186035,
      "eval_runtime": 11.0733,
      "eval_samples_per_second": 11.198,
      "eval_steps_per_second": 0.722,
      "step": 12
    },
    {
      "epoch": 0.35135135135135137,
      "grad_norm": 0.2133413404226303,
      "learning_rate": 0.0002909090909090909,
      "loss": 1.1137,
      "step": 13
    },
    {
      "epoch": 0.35135135135135137,
      "eval_loss": 1.6880252361297607,
      "eval_runtime": 11.2007,
      "eval_samples_per_second": 11.071,
      "eval_steps_per_second": 0.714,
      "step": 13
    },
    {
      "epoch": 0.3783783783783784,
      "grad_norm": 0.20175401866436005,
      "learning_rate": 0.0002787878787878788,
      "loss": 1.1059,
      "step": 14
    },
    {
      "epoch": 0.3783783783783784,
      "eval_loss": 1.6500831842422485,
      "eval_runtime": 11.1367,
      "eval_samples_per_second": 11.134,
      "eval_steps_per_second": 0.718,
      "step": 14
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 0.22595511376857758,
      "learning_rate": 0.0002666666666666667,
      "loss": 1.0483,
      "step": 15
    },
    {
      "epoch": 0.40540540540540543,
      "eval_loss": 1.6288588047027588,
      "eval_runtime": 11.1914,
      "eval_samples_per_second": 11.08,
      "eval_steps_per_second": 0.715,
      "step": 15
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 0.17468485236167908,
      "learning_rate": 0.00025454545454545456,
      "loss": 1.0584,
      "step": 16
    },
    {
      "epoch": 0.43243243243243246,
      "eval_loss": 1.6247642040252686,
      "eval_runtime": 11.1035,
      "eval_samples_per_second": 11.168,
      "eval_steps_per_second": 0.72,
      "step": 16
    },
    {
      "epoch": 0.4594594594594595,
      "grad_norm": 0.1654416024684906,
      "learning_rate": 0.00024242424242424245,
      "loss": 1.0402,
      "step": 17
    },
    {
      "epoch": 0.4594594594594595,
      "eval_loss": 1.6316722631454468,
      "eval_runtime": 11.2065,
      "eval_samples_per_second": 11.065,
      "eval_steps_per_second": 0.714,
      "step": 17
    },
    {
      "epoch": 0.4864864864864865,
      "grad_norm": 0.10361829400062561,
      "learning_rate": 0.00023030303030303033,
      "loss": 1.0301,
      "step": 18
    },
    {
      "epoch": 0.4864864864864865,
      "eval_loss": 1.6415338516235352,
      "eval_runtime": 11.18,
      "eval_samples_per_second": 11.091,
      "eval_steps_per_second": 0.716,
      "step": 18
    },
    {
      "epoch": 0.5135135135135135,
      "grad_norm": 0.09156349301338196,
      "learning_rate": 0.00021818181818181818,
      "loss": 1.0183,
      "step": 19
    },
    {
      "epoch": 0.5135135135135135,
      "eval_loss": 1.6544169187545776,
      "eval_runtime": 11.1626,
      "eval_samples_per_second": 11.109,
      "eval_steps_per_second": 0.717,
      "step": 19
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 0.087005615234375,
      "learning_rate": 0.00020606060606060607,
      "loss": 1.028,
      "step": 20
    },
    {
      "epoch": 0.5405405405405406,
      "eval_loss": 1.6620415449142456,
      "eval_runtime": 11.2393,
      "eval_samples_per_second": 11.033,
      "eval_steps_per_second": 0.712,
      "step": 20
    },
    {
      "epoch": 0.5675675675675675,
      "grad_norm": 0.09235216677188873,
      "learning_rate": 0.00019393939393939395,
      "loss": 0.9825,
      "step": 21
    },
    {
      "epoch": 0.5675675675675675,
      "eval_loss": 1.6642476320266724,
      "eval_runtime": 11.2278,
      "eval_samples_per_second": 11.044,
      "eval_steps_per_second": 0.713,
      "step": 21
    },
    {
      "epoch": 0.5945945945945946,
      "grad_norm": 0.0915454775094986,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.9991,
      "step": 22
    },
    {
      "epoch": 0.5945945945945946,
      "eval_loss": 1.6625572443008423,
      "eval_runtime": 11.1424,
      "eval_samples_per_second": 11.129,
      "eval_steps_per_second": 0.718,
      "step": 22
    },
    {
      "epoch": 0.6216216216216216,
      "grad_norm": 0.09213992953300476,
      "learning_rate": 0.00016969696969696972,
      "loss": 1.0211,
      "step": 23
    },
    {
      "epoch": 0.6216216216216216,
      "eval_loss": 1.6593235731124878,
      "eval_runtime": 11.1978,
      "eval_samples_per_second": 11.074,
      "eval_steps_per_second": 0.714,
      "step": 23
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 0.0854020044207573,
      "learning_rate": 0.00015757575757575757,
      "loss": 1.0291,
      "step": 24
    },
    {
      "epoch": 0.6486486486486487,
      "eval_loss": 1.6526458263397217,
      "eval_runtime": 11.2323,
      "eval_samples_per_second": 11.04,
      "eval_steps_per_second": 0.712,
      "step": 24
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 0.08045388758182526,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.9887,
      "step": 25
    },
    {
      "epoch": 0.6756756756756757,
      "eval_loss": 1.6451815366744995,
      "eval_runtime": 11.1905,
      "eval_samples_per_second": 11.081,
      "eval_steps_per_second": 0.715,
      "step": 25
    },
    {
      "epoch": 0.7027027027027027,
      "grad_norm": 0.07576093822717667,
      "learning_rate": 0.00013333333333333334,
      "loss": 1.0044,
      "step": 26
    },
    {
      "epoch": 0.7027027027027027,
      "eval_loss": 1.6377238035202026,
      "eval_runtime": 11.1714,
      "eval_samples_per_second": 11.1,
      "eval_steps_per_second": 0.716,
      "step": 26
    },
    {
      "epoch": 0.7297297297297297,
      "grad_norm": 0.07311829924583435,
      "learning_rate": 0.00012121212121212122,
      "loss": 0.9772,
      "step": 27
    },
    {
      "epoch": 0.7297297297297297,
      "eval_loss": 1.6314424276351929,
      "eval_runtime": 11.1489,
      "eval_samples_per_second": 11.122,
      "eval_steps_per_second": 0.718,
      "step": 27
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 0.07776332646608353,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.9902,
      "step": 28
    },
    {
      "epoch": 0.7567567567567568,
      "eval_loss": 1.625641942024231,
      "eval_runtime": 11.1261,
      "eval_samples_per_second": 11.145,
      "eval_steps_per_second": 0.719,
      "step": 28
    },
    {
      "epoch": 0.7837837837837838,
      "grad_norm": 0.07536856085062027,
      "learning_rate": 9.696969696969698e-05,
      "loss": 0.9902,
      "step": 29
    },
    {
      "epoch": 0.7837837837837838,
      "eval_loss": 1.6233930587768555,
      "eval_runtime": 11.1754,
      "eval_samples_per_second": 11.096,
      "eval_steps_per_second": 0.716,
      "step": 29
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 0.07941398024559021,
      "learning_rate": 8.484848484848486e-05,
      "loss": 0.9784,
      "step": 30
    },
    {
      "epoch": 0.8108108108108109,
      "eval_loss": 1.6244161128997803,
      "eval_runtime": 11.2511,
      "eval_samples_per_second": 11.021,
      "eval_steps_per_second": 0.711,
      "step": 30
    },
    {
      "epoch": 0.8378378378378378,
      "grad_norm": 0.07617861032485962,
      "learning_rate": 7.272727272727273e-05,
      "loss": 1.0064,
      "step": 31
    },
    {
      "epoch": 0.8378378378378378,
      "eval_loss": 1.62636399269104,
      "eval_runtime": 11.1098,
      "eval_samples_per_second": 11.161,
      "eval_steps_per_second": 0.72,
      "step": 31
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 0.06959453225135803,
      "learning_rate": 6.060606060606061e-05,
      "loss": 0.9764,
      "step": 32
    },
    {
      "epoch": 0.8648648648648649,
      "eval_loss": 1.6286530494689941,
      "eval_runtime": 11.2497,
      "eval_samples_per_second": 11.023,
      "eval_steps_per_second": 0.711,
      "step": 32
    },
    {
      "epoch": 0.8918918918918919,
      "grad_norm": 0.07171300053596497,
      "learning_rate": 4.848484848484849e-05,
      "loss": 0.9921,
      "step": 33
    },
    {
      "epoch": 0.8918918918918919,
      "eval_loss": 1.630918264389038,
      "eval_runtime": 11.1794,
      "eval_samples_per_second": 11.092,
      "eval_steps_per_second": 0.716,
      "step": 33
    },
    {
      "epoch": 0.918918918918919,
      "grad_norm": 0.07644116133451462,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.9716,
      "step": 34
    },
    {
      "epoch": 0.918918918918919,
      "eval_loss": 1.63330078125,
      "eval_runtime": 11.1352,
      "eval_samples_per_second": 11.136,
      "eval_steps_per_second": 0.718,
      "step": 34
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 0.07242273539304733,
      "learning_rate": 2.4242424242424244e-05,
      "loss": 0.9781,
      "step": 35
    },
    {
      "epoch": 0.9459459459459459,
      "eval_loss": 1.634429931640625,
      "eval_runtime": 11.203,
      "eval_samples_per_second": 11.069,
      "eval_steps_per_second": 0.714,
      "step": 35
    },
    {
      "epoch": 0.972972972972973,
      "grad_norm": 0.069486603140831,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 0.9592,
      "step": 36
    },
    {
      "epoch": 0.972972972972973,
      "eval_loss": 1.6349563598632812,
      "eval_runtime": 11.1077,
      "eval_samples_per_second": 11.163,
      "eval_steps_per_second": 0.72,
      "step": 36
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.07558272778987885,
      "learning_rate": 0.0,
      "loss": 0.9368,
      "step": 37
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.6352812051773071,
      "eval_runtime": 11.1164,
      "eval_samples_per_second": 11.155,
      "eval_steps_per_second": 0.72,
      "step": 37
    },
    {
      "epoch": 1.0,
      "step": 37,
      "total_flos": 1.3641878835560448e+16,
      "train_loss": 1.5526673584371,
      "train_runtime": 758.4004,
      "train_samples_per_second": 1.552,
      "train_steps_per_second": 0.049
    }
  ],
  "logging_steps": 1,
  "max_steps": 37,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3641878835560448e+16,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}