{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 720, "global_step": 17995, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13892747985551543, "grad_norm": 0.9573676586151123, "learning_rate": 0.0007907566916736131, "loss": 0.377, "step": 500 }, { "epoch": 0.2000555709919422, "eval_loss": 0.28939542174339294, "eval_runtime": 4.6518, "eval_samples_per_second": 107.484, "eval_steps_per_second": 6.879, "step": 720 }, { "epoch": 0.27785495971103086, "grad_norm": 0.5571001768112183, "learning_rate": 0.0007815133833472261, "loss": 0.2824, "step": 1000 }, { "epoch": 0.4001111419838844, "eval_loss": 0.2604603171348572, "eval_runtime": 4.3083, "eval_samples_per_second": 116.054, "eval_steps_per_second": 7.427, "step": 1440 }, { "epoch": 0.41678243956654626, "grad_norm": 0.6729081273078918, "learning_rate": 0.0007722515513568584, "loss": 0.2627, "step": 1500 }, { "epoch": 0.5557099194220617, "grad_norm": 0.5200651288032532, "learning_rate": 0.0007629897193664907, "loss": 0.2488, "step": 2000 }, { "epoch": 0.6001667129758266, "eval_loss": 0.24864767491817474, "eval_runtime": 3.8713, "eval_samples_per_second": 129.157, "eval_steps_per_second": 8.266, "step": 2160 }, { "epoch": 0.6946373992775771, "grad_norm": 0.6645380258560181, "learning_rate": 0.0007537464110401038, "loss": 0.2424, "step": 2500 }, { "epoch": 0.8002222839677688, "eval_loss": 0.2434273064136505, "eval_runtime": 4.2012, "eval_samples_per_second": 119.013, "eval_steps_per_second": 7.617, "step": 2880 }, { "epoch": 0.8335648791330925, "grad_norm": 0.8886778354644775, "learning_rate": 0.000744484579049736, "loss": 0.2332, "step": 3000 }, { "epoch": 0.972492358988608, "grad_norm": 0.5990277528762817, "learning_rate": 0.0007352227470593684, "loss": 0.2307, "step": 3500 }, { "epoch": 1.000277854959711, "eval_loss": 0.23320984840393066, "eval_runtime": 4.3777, "eval_samples_per_second": 114.216, "eval_steps_per_second": 7.31, "step": 3600 }, { "epoch": 1.1114198388441234, "grad_norm": 0.9014330506324768, "learning_rate": 0.0007259609150690007, "loss": 0.2171, "step": 4000 }, { "epoch": 1.2003334259516532, "eval_loss": 0.23165521025657654, "eval_runtime": 4.5533, "eval_samples_per_second": 109.81, "eval_steps_per_second": 7.028, "step": 4320 }, { "epoch": 1.2503473186996388, "grad_norm": 0.9070354104042053, "learning_rate": 0.000716699083078633, "loss": 0.2174, "step": 4500 }, { "epoch": 1.3892747985551543, "grad_norm": 0.9023842215538025, "learning_rate": 0.000707455774752246, "loss": 0.215, "step": 5000 }, { "epoch": 1.4003889969435954, "eval_loss": 0.21996016800403595, "eval_runtime": 4.2649, "eval_samples_per_second": 117.235, "eval_steps_per_second": 7.503, "step": 5040 }, { "epoch": 1.5282022784106695, "grad_norm": 0.8724926710128784, "learning_rate": 0.0006981939427618783, "loss": 0.2101, "step": 5500 }, { "epoch": 1.6004445679355377, "eval_loss": 0.21771642565727234, "eval_runtime": 4.6518, "eval_samples_per_second": 107.486, "eval_steps_per_second": 6.879, "step": 5760 }, { "epoch": 1.667129758266185, "grad_norm": 1.1229606866836548, "learning_rate": 0.0006889321107715106, "loss": 0.209, "step": 6000 }, { "epoch": 1.8005001389274797, "eval_loss": 0.2162657231092453, "eval_runtime": 3.9728, "eval_samples_per_second": 125.857, "eval_steps_per_second": 8.055, "step": 6480 }, { "epoch": 1.8060572381217006, "grad_norm": 0.562541127204895, "learning_rate": 0.000679670278781143, "loss": 0.2053, "step": 6500 }, { "epoch": 1.9449847179772157, "grad_norm": 0.9058884382247925, "learning_rate": 0.0006704084467907753, "loss": 0.2047, "step": 7000 }, { "epoch": 2.000555709919422, "eval_loss": 0.21751119196414948, "eval_runtime": 3.8866, "eval_samples_per_second": 128.646, "eval_steps_per_second": 8.233, "step": 7200 }, { "epoch": 2.0839121978327313, "grad_norm": 0.5113864541053772, "learning_rate": 0.0006611466148004076, "loss": 0.1972, "step": 7500 }, { "epoch": 2.2006112809113643, "eval_loss": 0.21081140637397766, "eval_runtime": 3.9021, "eval_samples_per_second": 128.136, "eval_steps_per_second": 8.201, "step": 7920 }, { "epoch": 2.222839677688247, "grad_norm": 0.6523454785346985, "learning_rate": 0.0006518847828100398, "loss": 0.1968, "step": 8000 }, { "epoch": 2.361767157543762, "grad_norm": 0.48720309138298035, "learning_rate": 0.0006426414744836529, "loss": 0.1934, "step": 8500 }, { "epoch": 2.4006668519033063, "eval_loss": 0.20452378690242767, "eval_runtime": 4.661, "eval_samples_per_second": 107.272, "eval_steps_per_second": 6.865, "step": 8640 }, { "epoch": 2.5006946373992776, "grad_norm": 0.6927788257598877, "learning_rate": 0.0006333796424932852, "loss": 0.1906, "step": 9000 }, { "epoch": 2.600722422895249, "eval_loss": 0.1991214007139206, "eval_runtime": 4.7026, "eval_samples_per_second": 106.324, "eval_steps_per_second": 6.805, "step": 9360 }, { "epoch": 2.639622117254793, "grad_norm": 0.6956183910369873, "learning_rate": 0.0006241178105029175, "loss": 0.1893, "step": 9500 }, { "epoch": 2.7785495971103087, "grad_norm": 0.40230458974838257, "learning_rate": 0.0006148559785125498, "loss": 0.1889, "step": 10000 }, { "epoch": 2.800777993887191, "eval_loss": 0.20080548524856567, "eval_runtime": 4.5769, "eval_samples_per_second": 109.245, "eval_steps_per_second": 6.992, "step": 10080 }, { "epoch": 2.917477076965824, "grad_norm": 1.1422420740127563, "learning_rate": 0.0006055941465221821, "loss": 0.1911, "step": 10500 }, { "epoch": 3.000833564879133, "eval_loss": 0.2006545066833496, "eval_runtime": 4.7461, "eval_samples_per_second": 105.351, "eval_steps_per_second": 6.742, "step": 10800 }, { "epoch": 3.0564045568213394, "grad_norm": 0.8359866142272949, "learning_rate": 0.0005963323145318144, "loss": 0.1858, "step": 11000 }, { "epoch": 3.1953320366768545, "grad_norm": 0.5189564824104309, "learning_rate": 0.0005870704825414468, "loss": 0.1789, "step": 11500 }, { "epoch": 3.2008891358710754, "eval_loss": 0.19731611013412476, "eval_runtime": 4.0407, "eval_samples_per_second": 123.742, "eval_steps_per_second": 7.919, "step": 11520 }, { "epoch": 3.33425951653237, "grad_norm": 0.578628659248352, "learning_rate": 0.0005778271742150597, "loss": 0.1799, "step": 12000 }, { "epoch": 3.4009447068630174, "eval_loss": 0.19467230141162872, "eval_runtime": 4.0481, "eval_samples_per_second": 123.514, "eval_steps_per_second": 7.905, "step": 12240 }, { "epoch": 3.4731869963878856, "grad_norm": 0.8292349576950073, "learning_rate": 0.0005685653422246921, "loss": 0.1769, "step": 12500 }, { "epoch": 3.6010002778549595, "eval_loss": 0.19331130385398865, "eval_runtime": 4.0814, "eval_samples_per_second": 122.508, "eval_steps_per_second": 7.841, "step": 12960 }, { "epoch": 3.612114476243401, "grad_norm": 0.6857467293739319, "learning_rate": 0.0005593035102343244, "loss": 0.1796, "step": 13000 }, { "epoch": 3.7510419560989163, "grad_norm": 0.5829126238822937, "learning_rate": 0.0005500416782439567, "loss": 0.176, "step": 13500 }, { "epoch": 3.801055848846902, "eval_loss": 0.18971021473407745, "eval_runtime": 4.5985, "eval_samples_per_second": 108.73, "eval_steps_per_second": 6.959, "step": 13680 }, { "epoch": 3.889969435954432, "grad_norm": 0.7102944254875183, "learning_rate": 0.0005407798462535889, "loss": 0.1753, "step": 14000 }, { "epoch": 4.001111419838844, "eval_loss": 0.18332147598266602, "eval_runtime": 4.569, "eval_samples_per_second": 109.434, "eval_steps_per_second": 7.004, "step": 14400 }, { "epoch": 4.0288969158099475, "grad_norm": 0.5539807081222534, "learning_rate": 0.000531536537927202, "loss": 0.17, "step": 14500 }, { "epoch": 4.167824395665463, "grad_norm": 0.6355165243148804, "learning_rate": 0.0005222747059368344, "loss": 0.1645, "step": 15000 }, { "epoch": 4.201166990830786, "eval_loss": 0.18329477310180664, "eval_runtime": 4.6958, "eval_samples_per_second": 106.477, "eval_steps_per_second": 6.815, "step": 15120 }, { "epoch": 4.306751875520978, "grad_norm": 0.5928105115890503, "learning_rate": 0.0005130128739464667, "loss": 0.1684, "step": 15500 }, { "epoch": 4.4012225618227285, "eval_loss": 0.1863529086112976, "eval_runtime": 3.9814, "eval_samples_per_second": 125.585, "eval_steps_per_second": 8.037, "step": 15840 }, { "epoch": 4.445679355376494, "grad_norm": 0.5573757886886597, "learning_rate": 0.000503751041956099, "loss": 0.1671, "step": 16000 }, { "epoch": 4.584606835232009, "grad_norm": 0.5758992433547974, "learning_rate": 0.0004944892099657312, "loss": 0.1626, "step": 16500 }, { "epoch": 4.601278132814671, "eval_loss": 0.18393146991729736, "eval_runtime": 3.9565, "eval_samples_per_second": 126.373, "eval_steps_per_second": 8.088, "step": 16560 }, { "epoch": 4.723534315087524, "grad_norm": 0.65135258436203, "learning_rate": 0.0004852459016393443, "loss": 0.1649, "step": 17000 }, { "epoch": 4.801333703806613, "eval_loss": 0.176134392619133, "eval_runtime": 4.7856, "eval_samples_per_second": 104.479, "eval_steps_per_second": 6.687, "step": 17280 }, { "epoch": 4.86246179494304, "grad_norm": 0.4339986741542816, "learning_rate": 0.0004759840696489766, "loss": 0.1643, "step": 17500 } ], "logging_steps": 500, "max_steps": 43188, "num_input_tokens_seen": 0, "num_train_epochs": 12, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.05936785054892e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }