{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 7.0,
  "eval_steps": 720,
  "global_step": 25193,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13892747985551543,
      "grad_norm": 0.9573676586151123,
      "learning_rate": 0.0007907566916736131,
      "loss": 0.377,
      "step": 500
    },
    {
      "epoch": 0.2000555709919422,
      "eval_loss": 0.28939542174339294,
      "eval_runtime": 4.6518,
      "eval_samples_per_second": 107.484,
      "eval_steps_per_second": 6.879,
      "step": 720
    },
    {
      "epoch": 0.27785495971103086,
      "grad_norm": 0.5571001768112183,
      "learning_rate": 0.0007815133833472261,
      "loss": 0.2824,
      "step": 1000
    },
    {
      "epoch": 0.4001111419838844,
      "eval_loss": 0.2604603171348572,
      "eval_runtime": 4.3083,
      "eval_samples_per_second": 116.054,
      "eval_steps_per_second": 7.427,
      "step": 1440
    },
    {
      "epoch": 0.41678243956654626,
      "grad_norm": 0.6729081273078918,
      "learning_rate": 0.0007722515513568584,
      "loss": 0.2627,
      "step": 1500
    },
    {
      "epoch": 0.5557099194220617,
      "grad_norm": 0.5200651288032532,
      "learning_rate": 0.0007629897193664907,
      "loss": 0.2488,
      "step": 2000
    },
    {
      "epoch": 0.6001667129758266,
      "eval_loss": 0.24864767491817474,
      "eval_runtime": 3.8713,
      "eval_samples_per_second": 129.157,
      "eval_steps_per_second": 8.266,
      "step": 2160
    },
    {
      "epoch": 0.6946373992775771,
      "grad_norm": 0.6645380258560181,
      "learning_rate": 0.0007537464110401038,
      "loss": 0.2424,
      "step": 2500
    },
    {
      "epoch": 0.8002222839677688,
      "eval_loss": 0.2434273064136505,
      "eval_runtime": 4.2012,
      "eval_samples_per_second": 119.013,
      "eval_steps_per_second": 7.617,
      "step": 2880
    },
    {
      "epoch": 0.8335648791330925,
      "grad_norm": 0.8886778354644775,
      "learning_rate": 0.000744484579049736,
      "loss": 0.2332,
      "step": 3000
    },
    {
      "epoch": 0.972492358988608,
      "grad_norm": 0.5990277528762817,
      "learning_rate": 0.0007352227470593684,
      "loss": 0.2307,
      "step": 3500
    },
    {
      "epoch": 1.000277854959711,
      "eval_loss": 0.23320984840393066,
      "eval_runtime": 4.3777,
      "eval_samples_per_second": 114.216,
      "eval_steps_per_second": 7.31,
      "step": 3600
    },
    {
      "epoch": 1.1114198388441234,
      "grad_norm": 0.9014330506324768,
      "learning_rate": 0.0007259609150690007,
      "loss": 0.2171,
      "step": 4000
    },
    {
      "epoch": 1.2003334259516532,
      "eval_loss": 0.23165521025657654,
      "eval_runtime": 4.5533,
      "eval_samples_per_second": 109.81,
      "eval_steps_per_second": 7.028,
      "step": 4320
    },
    {
      "epoch": 1.2503473186996388,
      "grad_norm": 0.9070354104042053,
      "learning_rate": 0.000716699083078633,
      "loss": 0.2174,
      "step": 4500
    },
    {
      "epoch": 1.3892747985551543,
      "grad_norm": 0.9023842215538025,
      "learning_rate": 0.000707455774752246,
      "loss": 0.215,
      "step": 5000
    },
    {
      "epoch": 1.4003889969435954,
      "eval_loss": 0.21996016800403595,
      "eval_runtime": 4.2649,
      "eval_samples_per_second": 117.235,
      "eval_steps_per_second": 7.503,
      "step": 5040
    },
    {
      "epoch": 1.5282022784106695,
      "grad_norm": 0.8724926710128784,
      "learning_rate": 0.0006981939427618783,
      "loss": 0.2101,
      "step": 5500
    },
    {
      "epoch": 1.6004445679355377,
      "eval_loss": 0.21771642565727234,
      "eval_runtime": 4.6518,
      "eval_samples_per_second": 107.486,
      "eval_steps_per_second": 6.879,
      "step": 5760
    },
    {
      "epoch": 1.667129758266185,
      "grad_norm": 1.1229606866836548,
      "learning_rate": 0.0006889321107715106,
      "loss": 0.209,
      "step": 6000
    },
    {
      "epoch": 1.8005001389274797,
      "eval_loss": 0.2162657231092453,
      "eval_runtime": 3.9728,
      "eval_samples_per_second": 125.857,
      "eval_steps_per_second": 8.055,
      "step": 6480
    },
    {
      "epoch": 1.8060572381217006,
      "grad_norm": 0.562541127204895,
      "learning_rate": 0.000679670278781143,
      "loss": 0.2053,
      "step": 6500
    },
    {
      "epoch": 1.9449847179772157,
      "grad_norm": 0.9058884382247925,
      "learning_rate": 0.0006704084467907753,
      "loss": 0.2047,
      "step": 7000
    },
    {
      "epoch": 2.000555709919422,
      "eval_loss": 0.21751119196414948,
      "eval_runtime": 3.8866,
      "eval_samples_per_second": 128.646,
      "eval_steps_per_second": 8.233,
      "step": 7200
    },
    {
      "epoch": 2.0839121978327313,
      "grad_norm": 0.5113864541053772,
      "learning_rate": 0.0006611466148004076,
      "loss": 0.1972,
      "step": 7500
    },
    {
      "epoch": 2.2006112809113643,
      "eval_loss": 0.21081140637397766,
      "eval_runtime": 3.9021,
      "eval_samples_per_second": 128.136,
      "eval_steps_per_second": 8.201,
      "step": 7920
    },
    {
      "epoch": 2.222839677688247,
      "grad_norm": 0.6523454785346985,
      "learning_rate": 0.0006518847828100398,
      "loss": 0.1968,
      "step": 8000
    },
    {
      "epoch": 2.361767157543762,
      "grad_norm": 0.48720309138298035,
      "learning_rate": 0.0006426414744836529,
      "loss": 0.1934,
      "step": 8500
    },
    {
      "epoch": 2.4006668519033063,
      "eval_loss": 0.20452378690242767,
      "eval_runtime": 4.661,
      "eval_samples_per_second": 107.272,
      "eval_steps_per_second": 6.865,
      "step": 8640
    },
    {
      "epoch": 2.5006946373992776,
      "grad_norm": 0.6927788257598877,
      "learning_rate": 0.0006333796424932852,
      "loss": 0.1906,
      "step": 9000
    },
    {
      "epoch": 2.600722422895249,
      "eval_loss": 0.1991214007139206,
      "eval_runtime": 4.7026,
      "eval_samples_per_second": 106.324,
      "eval_steps_per_second": 6.805,
      "step": 9360
    },
    {
      "epoch": 2.639622117254793,
      "grad_norm": 0.6956183910369873,
      "learning_rate": 0.0006241178105029175,
      "loss": 0.1893,
      "step": 9500
    },
    {
      "epoch": 2.7785495971103087,
      "grad_norm": 0.40230458974838257,
      "learning_rate": 0.0006148559785125498,
      "loss": 0.1889,
      "step": 10000
    },
    {
      "epoch": 2.800777993887191,
      "eval_loss": 0.20080548524856567,
      "eval_runtime": 4.5769,
      "eval_samples_per_second": 109.245,
      "eval_steps_per_second": 6.992,
      "step": 10080
    },
    {
      "epoch": 2.917477076965824,
      "grad_norm": 1.1422420740127563,
      "learning_rate": 0.0006055941465221821,
      "loss": 0.1911,
      "step": 10500
    },
    {
      "epoch": 3.000833564879133,
      "eval_loss": 0.2006545066833496,
      "eval_runtime": 4.7461,
      "eval_samples_per_second": 105.351,
      "eval_steps_per_second": 6.742,
      "step": 10800
    },
    {
      "epoch": 3.0564045568213394,
      "grad_norm": 0.8359866142272949,
      "learning_rate": 0.0005963323145318144,
      "loss": 0.1858,
      "step": 11000
    },
    {
      "epoch": 3.1953320366768545,
      "grad_norm": 0.5189564824104309,
      "learning_rate": 0.0005870704825414468,
      "loss": 0.1789,
      "step": 11500
    },
    {
      "epoch": 3.2008891358710754,
      "eval_loss": 0.19731611013412476,
      "eval_runtime": 4.0407,
      "eval_samples_per_second": 123.742,
      "eval_steps_per_second": 7.919,
      "step": 11520
    },
    {
      "epoch": 3.33425951653237,
      "grad_norm": 0.578628659248352,
      "learning_rate": 0.0005778271742150597,
      "loss": 0.1799,
      "step": 12000
    },
    {
      "epoch": 3.4009447068630174,
      "eval_loss": 0.19467230141162872,
      "eval_runtime": 4.0481,
      "eval_samples_per_second": 123.514,
      "eval_steps_per_second": 7.905,
      "step": 12240
    },
    {
      "epoch": 3.4731869963878856,
      "grad_norm": 0.8292349576950073,
      "learning_rate": 0.0005685653422246921,
      "loss": 0.1769,
      "step": 12500
    },
    {
      "epoch": 3.6010002778549595,
      "eval_loss": 0.19331130385398865,
      "eval_runtime": 4.0814,
      "eval_samples_per_second": 122.508,
      "eval_steps_per_second": 7.841,
      "step": 12960
    },
    {
      "epoch": 3.612114476243401,
      "grad_norm": 0.6857467293739319,
      "learning_rate": 0.0005593035102343244,
      "loss": 0.1796,
      "step": 13000
    },
    {
      "epoch": 3.7510419560989163,
      "grad_norm": 0.5829126238822937,
      "learning_rate": 0.0005500416782439567,
      "loss": 0.176,
      "step": 13500
    },
    {
      "epoch": 3.801055848846902,
      "eval_loss": 0.18971021473407745,
      "eval_runtime": 4.5985,
      "eval_samples_per_second": 108.73,
      "eval_steps_per_second": 6.959,
      "step": 13680
    },
    {
      "epoch": 3.889969435954432,
      "grad_norm": 0.7102944254875183,
      "learning_rate": 0.0005407798462535889,
      "loss": 0.1753,
      "step": 14000
    },
    {
      "epoch": 4.001111419838844,
      "eval_loss": 0.18332147598266602,
      "eval_runtime": 4.569,
      "eval_samples_per_second": 109.434,
      "eval_steps_per_second": 7.004,
      "step": 14400
    },
    {
      "epoch": 4.0288969158099475,
      "grad_norm": 0.5539807081222534,
      "learning_rate": 0.000531536537927202,
      "loss": 0.17,
      "step": 14500
    },
    {
      "epoch": 4.167824395665463,
      "grad_norm": 0.6355165243148804,
      "learning_rate": 0.0005222747059368344,
      "loss": 0.1645,
      "step": 15000
    },
    {
      "epoch": 4.201166990830786,
      "eval_loss": 0.18329477310180664,
      "eval_runtime": 4.6958,
      "eval_samples_per_second": 106.477,
      "eval_steps_per_second": 6.815,
      "step": 15120
    },
    {
      "epoch": 4.306751875520978,
      "grad_norm": 0.5928105115890503,
      "learning_rate": 0.0005130128739464667,
      "loss": 0.1684,
      "step": 15500
    },
    {
      "epoch": 4.4012225618227285,
      "eval_loss": 0.1863529086112976,
      "eval_runtime": 3.9814,
      "eval_samples_per_second": 125.585,
      "eval_steps_per_second": 8.037,
      "step": 15840
    },
    {
      "epoch": 4.445679355376494,
      "grad_norm": 0.5573757886886597,
      "learning_rate": 0.000503751041956099,
      "loss": 0.1671,
      "step": 16000
    },
    {
      "epoch": 4.584606835232009,
      "grad_norm": 0.5758992433547974,
      "learning_rate": 0.0004944892099657312,
      "loss": 0.1626,
      "step": 16500
    },
    {
      "epoch": 4.601278132814671,
      "eval_loss": 0.18393146991729736,
      "eval_runtime": 3.9565,
      "eval_samples_per_second": 126.373,
      "eval_steps_per_second": 8.088,
      "step": 16560
    },
    {
      "epoch": 4.723534315087524,
      "grad_norm": 0.65135258436203,
      "learning_rate": 0.0004852459016393443,
      "loss": 0.1649,
      "step": 17000
    },
    {
      "epoch": 4.801333703806613,
      "eval_loss": 0.176134392619133,
      "eval_runtime": 4.7856,
      "eval_samples_per_second": 104.479,
      "eval_steps_per_second": 6.687,
      "step": 17280
    },
    {
      "epoch": 4.86246179494304,
      "grad_norm": 0.4339986741542816,
      "learning_rate": 0.0004759840696489766,
      "loss": 0.1643,
      "step": 17500
    },
    {
      "epoch": 5.001389274798555,
      "grad_norm": 0.6604339480400085,
      "learning_rate": 0.00046672223765860887,
      "loss": 0.1613,
      "step": 18000
    },
    {
      "epoch": 5.001389274798555,
      "eval_loss": 0.18032513558864594,
      "eval_runtime": 4.4011,
      "eval_samples_per_second": 113.608,
      "eval_steps_per_second": 7.271,
      "step": 18000
    },
    {
      "epoch": 5.14031675465407,
      "grad_norm": 0.5034669637680054,
      "learning_rate": 0.00045746040566824116,
      "loss": 0.1523,
      "step": 18500
    },
    {
      "epoch": 5.201444845790498,
      "eval_loss": 0.17549069225788116,
      "eval_runtime": 4.1969,
      "eval_samples_per_second": 119.137,
      "eval_steps_per_second": 7.625,
      "step": 18720
    },
    {
      "epoch": 5.279244234509586,
      "grad_norm": 0.6701804995536804,
      "learning_rate": 0.00044821709734185424,
      "loss": 0.1531,
      "step": 19000
    },
    {
      "epoch": 5.401500416782439,
      "eval_loss": 0.1766393482685089,
      "eval_runtime": 4.6747,
      "eval_samples_per_second": 106.959,
      "eval_steps_per_second": 6.845,
      "step": 19440
    },
    {
      "epoch": 5.418171714365101,
      "grad_norm": 0.8808122277259827,
      "learning_rate": 0.0004389552653514866,
      "loss": 0.1538,
      "step": 19500
    },
    {
      "epoch": 5.5570991942206165,
      "grad_norm": 0.7931129336357117,
      "learning_rate": 0.0004296934333611189,
      "loss": 0.1539,
      "step": 20000
    },
    {
      "epoch": 5.601555987774382,
      "eval_loss": 0.1726432889699936,
      "eval_runtime": 3.9865,
      "eval_samples_per_second": 125.423,
      "eval_steps_per_second": 8.027,
      "step": 20160
    },
    {
      "epoch": 5.6960266740761325,
      "grad_norm": 0.6453366875648499,
      "learning_rate": 0.00042043160137075113,
      "loss": 0.1527,
      "step": 20500
    },
    {
      "epoch": 5.801611558766324,
      "eval_loss": 0.17778918147087097,
      "eval_runtime": 4.6178,
      "eval_samples_per_second": 108.277,
      "eval_steps_per_second": 6.93,
      "step": 20880
    },
    {
      "epoch": 5.834954153931648,
      "grad_norm": 0.539543867111206,
      "learning_rate": 0.0004111697693803834,
      "loss": 0.1522,
      "step": 21000
    },
    {
      "epoch": 5.973881633787163,
      "grad_norm": 0.7059842944145203,
      "learning_rate": 0.0004019079373900158,
      "loss": 0.1483,
      "step": 21500
    },
    {
      "epoch": 6.001667129758266,
      "eval_loss": 0.16399218142032623,
      "eval_runtime": 4.5977,
      "eval_samples_per_second": 108.749,
      "eval_steps_per_second": 6.96,
      "step": 21600
    },
    {
      "epoch": 6.112809113642679,
      "grad_norm": 0.6448105573654175,
      "learning_rate": 0.00039264610539964807,
      "loss": 0.1404,
      "step": 22000
    },
    {
      "epoch": 6.201722700750208,
      "eval_loss": 0.16838781535625458,
      "eval_runtime": 4.653,
      "eval_samples_per_second": 107.459,
      "eval_steps_per_second": 6.877,
      "step": 22320
    },
    {
      "epoch": 6.251736593498194,
      "grad_norm": 0.6864171028137207,
      "learning_rate": 0.0003834027970732611,
      "loss": 0.1425,
      "step": 22500
    },
    {
      "epoch": 6.390664073353709,
      "grad_norm": 0.6795888543128967,
      "learning_rate": 0.0003741409650828934,
      "loss": 0.1426,
      "step": 23000
    },
    {
      "epoch": 6.401778271742151,
      "eval_loss": 0.1665893793106079,
      "eval_runtime": 4.626,
      "eval_samples_per_second": 108.085,
      "eval_steps_per_second": 6.917,
      "step": 23040
    },
    {
      "epoch": 6.529591553209225,
      "grad_norm": 0.6111213564872742,
      "learning_rate": 0.00036487913309252574,
      "loss": 0.1407,
      "step": 23500
    },
    {
      "epoch": 6.601833842734093,
      "eval_loss": 0.16504846513271332,
      "eval_runtime": 3.8884,
      "eval_samples_per_second": 128.587,
      "eval_steps_per_second": 8.23,
      "step": 23760
    },
    {
      "epoch": 6.66851903306474,
      "grad_norm": 0.5933897495269775,
      "learning_rate": 0.000355617301102158,
      "loss": 0.1413,
      "step": 24000
    },
    {
      "epoch": 6.801889413726035,
      "eval_loss": 0.16348223388195038,
      "eval_runtime": 4.6832,
      "eval_samples_per_second": 106.764,
      "eval_steps_per_second": 6.833,
      "step": 24480
    },
    {
      "epoch": 6.807446512920255,
      "grad_norm": 0.7584686875343323,
      "learning_rate": 0.00034635546911179033,
      "loss": 0.1405,
      "step": 24500
    },
    {
      "epoch": 6.946373992775771,
      "grad_norm": 0.614815890789032,
      "learning_rate": 0.00033709363712142263,
      "loss": 0.1386,
      "step": 25000
    }
  ],
  "logging_steps": 500,
  "max_steps": 43188,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.083556356321444e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}