{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 383360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.026085141903171953,
      "grad_norm": 3.252858877182007,
      "learning_rate": 4.978305856983862e-05,
      "loss": 1.9218,
      "num_input_tokens_seen": 283536,
      "step": 500,
      "train_runtime": 18.7004,
      "train_tokens_per_second": 15162.068
    },
    {
      "epoch": 0.052170283806343906,
      "grad_norm": 3.500765085220337,
      "learning_rate": 4.956568238731219e-05,
      "loss": 1.7605,
      "num_input_tokens_seen": 574552,
      "step": 1000,
      "train_runtime": 38.0018,
      "train_tokens_per_second": 15119.085
    },
    {
      "epoch": 0.07825542570951587,
      "grad_norm": 3.4590671062469482,
      "learning_rate": 4.934830620478575e-05,
      "loss": 1.6898,
      "num_input_tokens_seen": 859976,
      "step": 1500,
      "train_runtime": 57.2593,
      "train_tokens_per_second": 15018.978
    },
    {
      "epoch": 0.10434056761268781,
      "grad_norm": 3.67798113822937,
      "learning_rate": 4.9130930022259324e-05,
      "loss": 1.6968,
      "num_input_tokens_seen": 1151232,
      "step": 2000,
      "train_runtime": 76.5735,
      "train_tokens_per_second": 15034.338
    },
    {
      "epoch": 0.13042570951585977,
      "grad_norm": 3.009059190750122,
      "learning_rate": 4.891355383973289e-05,
      "loss": 1.6838,
      "num_input_tokens_seen": 1439432,
      "step": 2500,
      "train_runtime": 95.8962,
      "train_tokens_per_second": 15010.309
    },
    {
      "epoch": 0.15651085141903173,
      "grad_norm": 3.1467044353485107,
      "learning_rate": 4.869617765720646e-05,
      "loss": 1.6861,
      "num_input_tokens_seen": 1727728,
      "step": 3000,
      "train_runtime": 114.9793,
      "train_tokens_per_second": 15026.424
    },
    {
      "epoch": 0.18259599332220366,
      "grad_norm": 2.8238844871520996,
      "learning_rate": 4.8478801474680025e-05,
      "loss": 1.6343,
      "num_input_tokens_seen": 2016488,
      "step": 3500,
      "train_runtime": 134.048,
      "train_tokens_per_second": 15043.024
    },
    {
      "epoch": 0.20868113522537562,
      "grad_norm": 2.7848801612854004,
      "learning_rate": 4.826142529215359e-05,
      "loss": 1.6482,
      "num_input_tokens_seen": 2310136,
      "step": 4000,
      "train_runtime": 153.775,
      "train_tokens_per_second": 15022.828
    },
    {
      "epoch": 0.23476627712854758,
      "grad_norm": 3.402919054031372,
      "learning_rate": 4.804404910962716e-05,
      "loss": 1.6326,
      "num_input_tokens_seen": 2601800,
      "step": 4500,
      "train_runtime": 173.1573,
      "train_tokens_per_second": 15025.64
    },
    {
      "epoch": 0.26085141903171954,
      "grad_norm": 4.777134418487549,
      "learning_rate": 4.7826672927100726e-05,
      "loss": 1.6236,
      "num_input_tokens_seen": 2889448,
      "step": 5000,
      "train_runtime": 192.4563,
      "train_tokens_per_second": 15013.531
    },
    {
      "epoch": 0.2869365609348915,
      "grad_norm": 2.45479416847229,
      "learning_rate": 4.760929674457429e-05,
      "loss": 1.5949,
      "num_input_tokens_seen": 3180128,
      "step": 5500,
      "train_runtime": 211.2052,
      "train_tokens_per_second": 15057.053
    },
    {
      "epoch": 0.31302170283806346,
      "grad_norm": 2.6998794078826904,
      "learning_rate": 4.7391920562047856e-05,
      "loss": 1.6117,
      "num_input_tokens_seen": 3470912,
      "step": 6000,
      "train_runtime": 230.9915,
      "train_tokens_per_second": 15026.144
    },
    {
      "epoch": 0.33910684474123537,
      "grad_norm": 2.838428258895874,
      "learning_rate": 4.717454437952143e-05,
      "loss": 1.6056,
      "num_input_tokens_seen": 3764848,
      "step": 6500,
      "train_runtime": 251.0138,
      "train_tokens_per_second": 14998.572
    },
    {
      "epoch": 0.36519198664440733,
      "grad_norm": 2.8896422386169434,
      "learning_rate": 4.695716819699499e-05,
      "loss": 1.6002,
      "num_input_tokens_seen": 4049200,
      "step": 7000,
      "train_runtime": 270.653,
      "train_tokens_per_second": 14960.855
    },
    {
      "epoch": 0.3912771285475793,
      "grad_norm": 2.878220558166504,
      "learning_rate": 4.673979201446856e-05,
      "loss": 1.5839,
      "num_input_tokens_seen": 4340488,
      "step": 7500,
      "train_runtime": 290.1843,
      "train_tokens_per_second": 14957.693
    },
    {
      "epoch": 0.41736227045075125,
      "grad_norm": 2.7241406440734863,
      "learning_rate": 4.652241583194213e-05,
      "loss": 1.5844,
      "num_input_tokens_seen": 4631904,
      "step": 8000,
      "train_runtime": 309.2754,
      "train_tokens_per_second": 14976.633
    },
    {
      "epoch": 0.4434474123539232,
      "grad_norm": 2.727529287338257,
      "learning_rate": 4.630503964941569e-05,
      "loss": 1.5936,
      "num_input_tokens_seen": 4919576,
      "step": 8500,
      "train_runtime": 328.4961,
      "train_tokens_per_second": 14976.057
    },
    {
      "epoch": 0.46953255425709517,
      "grad_norm": 3.117870330810547,
      "learning_rate": 4.6087663466889265e-05,
      "loss": 1.5695,
      "num_input_tokens_seen": 5211016,
      "step": 9000,
      "train_runtime": 348.3435,
      "train_tokens_per_second": 14959.417
    },
    {
      "epoch": 0.49561769616026713,
      "grad_norm": 2.490983724594116,
      "learning_rate": 4.587028728436283e-05,
      "loss": 1.5802,
      "num_input_tokens_seen": 5507568,
      "step": 9500,
      "train_runtime": 368.0383,
      "train_tokens_per_second": 14964.661
    },
    {
      "epoch": 0.5217028380634391,
      "grad_norm": 2.392632246017456,
      "learning_rate": 4.56529111018364e-05,
      "loss": 1.5806,
      "num_input_tokens_seen": 5798840,
      "step": 10000,
      "train_runtime": 387.6945,
      "train_tokens_per_second": 14957.241
    },
    {
      "epoch": 0.547787979966611,
      "grad_norm": 2.6862573623657227,
      "learning_rate": 4.5435534919309966e-05,
      "loss": 1.5801,
      "num_input_tokens_seen": 6085768,
      "step": 10500,
      "train_runtime": 407.4294,
      "train_tokens_per_second": 14936.988
    },
    {
      "epoch": 0.573873121869783,
      "grad_norm": 3.164522647857666,
      "learning_rate": 4.521815873678353e-05,
      "loss": 1.5636,
      "num_input_tokens_seen": 6371672,
      "step": 11000,
      "train_runtime": 426.5237,
      "train_tokens_per_second": 14938.61
    },
    {
      "epoch": 0.5999582637729549,
      "grad_norm": 2.5483455657958984,
      "learning_rate": 4.5000782554257095e-05,
      "loss": 1.5541,
      "num_input_tokens_seen": 6659744,
      "step": 11500,
      "train_runtime": 445.61,
      "train_tokens_per_second": 14945.23
    },
    {
      "epoch": 0.6260434056761269,
      "grad_norm": 2.6326119899749756,
      "learning_rate": 4.478340637173066e-05,
      "loss": 1.5801,
      "num_input_tokens_seen": 6947616,
      "step": 12000,
      "train_runtime": 465.2155,
      "train_tokens_per_second": 14934.188
    },
    {
      "epoch": 0.6521285475792988,
      "grad_norm": 2.5993449687957764,
      "learning_rate": 4.456603018920423e-05,
      "loss": 1.5497,
      "num_input_tokens_seen": 7236800,
      "step": 12500,
      "train_runtime": 484.6648,
      "train_tokens_per_second": 14931.556
    },
    {
      "epoch": 0.6782136894824707,
      "grad_norm": 2.419832468032837,
      "learning_rate": 4.4348654006677796e-05,
      "loss": 1.5692,
      "num_input_tokens_seen": 7525160,
      "step": 13000,
      "train_runtime": 504.4097,
      "train_tokens_per_second": 14918.745
    },
    {
      "epoch": 0.7042988313856428,
      "grad_norm": 2.346853017807007,
      "learning_rate": 4.413127782415137e-05,
      "loss": 1.568,
      "num_input_tokens_seen": 7815704,
      "step": 13500,
      "train_runtime": 523.0681,
      "train_tokens_per_second": 14942.039
    },
    {
      "epoch": 0.7303839732888147,
      "grad_norm": 2.47847580909729,
      "learning_rate": 4.391390164162493e-05,
      "loss": 1.5597,
      "num_input_tokens_seen": 8107760,
      "step": 14000,
      "train_runtime": 542.052,
      "train_tokens_per_second": 14957.533
    },
    {
      "epoch": 0.7564691151919867,
      "grad_norm": 2.5489418506622314,
      "learning_rate": 4.36965254590985e-05,
      "loss": 1.5588,
      "num_input_tokens_seen": 8400096,
      "step": 14500,
      "train_runtime": 562.4429,
      "train_tokens_per_second": 14935.019
    },
    {
      "epoch": 0.7825542570951586,
      "grad_norm": 3.1929831504821777,
      "learning_rate": 4.347914927657207e-05,
      "loss": 1.5409,
      "num_input_tokens_seen": 8679112,
      "step": 15000,
      "train_runtime": 581.6704,
      "train_tokens_per_second": 14921.014
    },
    {
      "epoch": 0.8086393989983306,
      "grad_norm": 2.6714396476745605,
      "learning_rate": 4.3261773094045634e-05,
      "loss": 1.5494,
      "num_input_tokens_seen": 8969456,
      "step": 15500,
      "train_runtime": 600.81,
      "train_tokens_per_second": 14928.94
    },
    {
      "epoch": 0.8347245409015025,
      "grad_norm": 2.379903554916382,
      "learning_rate": 4.3044396911519205e-05,
      "loss": 1.5589,
      "num_input_tokens_seen": 9261064,
      "step": 16000,
      "train_runtime": 619.9911,
      "train_tokens_per_second": 14937.414
    },
    {
      "epoch": 0.8608096828046744,
      "grad_norm": 2.5801916122436523,
      "learning_rate": 4.282702072899277e-05,
      "loss": 1.5594,
      "num_input_tokens_seen": 9550752,
      "step": 16500,
      "train_runtime": 639.7359,
      "train_tokens_per_second": 14929.21
    },
    {
      "epoch": 0.8868948247078464,
      "grad_norm": 2.8763697147369385,
      "learning_rate": 4.2609644546466335e-05,
      "loss": 1.5768,
      "num_input_tokens_seen": 9839304,
      "step": 17000,
      "train_runtime": 659.5206,
      "train_tokens_per_second": 14918.873
    },
    {
      "epoch": 0.9129799666110183,
      "grad_norm": 3.0146758556365967,
      "learning_rate": 4.23922683639399e-05,
      "loss": 1.5499,
      "num_input_tokens_seen": 10132792,
      "step": 17500,
      "train_runtime": 679.4357,
      "train_tokens_per_second": 14913.541
    },
    {
      "epoch": 0.9390651085141903,
      "grad_norm": 2.629668951034546,
      "learning_rate": 4.2174892181413464e-05,
      "loss": 1.5469,
      "num_input_tokens_seen": 10417368,
      "step": 18000,
      "train_runtime": 699.3592,
      "train_tokens_per_second": 14895.589
    },
    {
      "epoch": 0.9651502504173622,
      "grad_norm": 2.527364492416382,
      "learning_rate": 4.1957515998887036e-05,
      "loss": 1.5396,
      "num_input_tokens_seen": 10711800,
      "step": 18500,
      "train_runtime": 719.3221,
      "train_tokens_per_second": 14891.522
    },
    {
      "epoch": 0.9912353923205343,
      "grad_norm": 2.3071608543395996,
      "learning_rate": 4.17401398163606e-05,
      "loss": 1.5194,
      "num_input_tokens_seen": 10994712,
      "step": 19000,
      "train_runtime": 738.6768,
      "train_tokens_per_second": 14884.334
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.3842333555221558,
      "eval_runtime": 47.3762,
      "eval_samples_per_second": 809.161,
      "eval_steps_per_second": 101.148,
      "num_input_tokens_seen": 11091734,
      "step": 19168
    },
    {
      "epoch": 1.0173205342237062,
      "grad_norm": 2.8192083835601807,
      "learning_rate": 4.152276363383417e-05,
      "loss": 1.4963,
      "num_input_tokens_seen": 11281086,
      "step": 19500,
      "train_runtime": 806.5637,
      "train_tokens_per_second": 13986.603
    },
    {
      "epoch": 1.0434056761268782,
      "grad_norm": 3.121436595916748,
      "learning_rate": 4.130538745130774e-05,
      "loss": 1.5117,
      "num_input_tokens_seen": 11574638,
      "step": 20000,
      "train_runtime": 825.9512,
      "train_tokens_per_second": 14013.707
    },
    {
      "epoch": 1.06949081803005,
      "grad_norm": 2.0136849880218506,
      "learning_rate": 4.108801126878131e-05,
      "loss": 1.5143,
      "num_input_tokens_seen": 11864494,
      "step": 20500,
      "train_runtime": 845.2133,
      "train_tokens_per_second": 14037.278
    },
    {
      "epoch": 1.095575959933222,
      "grad_norm": 2.6219029426574707,
      "learning_rate": 4.087063508625487e-05,
      "loss": 1.5055,
      "num_input_tokens_seen": 12158550,
      "step": 21000,
      "train_runtime": 864.7079,
      "train_tokens_per_second": 14060.876
    },
    {
      "epoch": 1.121661101836394,
      "grad_norm": 3.265441656112671,
      "learning_rate": 4.065325890372844e-05,
      "loss": 1.4973,
      "num_input_tokens_seen": 12445726,
      "step": 21500,
      "train_runtime": 885.5348,
      "train_tokens_per_second": 14054.474
    },
    {
      "epoch": 1.147746243739566,
      "grad_norm": 2.6268465518951416,
      "learning_rate": 4.043588272120201e-05,
      "loss": 1.5264,
      "num_input_tokens_seen": 12733878,
      "step": 22000,
      "train_runtime": 905.2864,
      "train_tokens_per_second": 14066.131
    },
    {
      "epoch": 1.1738313856427378,
      "grad_norm": 4.112071990966797,
      "learning_rate": 4.0218506538675574e-05,
      "loss": 1.4786,
      "num_input_tokens_seen": 13017478,
      "step": 22500,
      "train_runtime": 924.8642,
      "train_tokens_per_second": 14075.016
    },
    {
      "epoch": 1.1999165275459098,
      "grad_norm": 3.13775897026062,
      "learning_rate": 4.000113035614914e-05,
      "loss": 1.4809,
      "num_input_tokens_seen": 13308726,
      "step": 23000,
      "train_runtime": 944.4145,
      "train_tokens_per_second": 14092.038
    },
    {
      "epoch": 1.2260016694490818,
      "grad_norm": 2.7305409908294678,
      "learning_rate": 3.9783754173622704e-05,
      "loss": 1.5037,
      "num_input_tokens_seen": 13600462,
      "step": 23500,
      "train_runtime": 964.343,
      "train_tokens_per_second": 14103.346
    },
    {
      "epoch": 1.2520868113522536,
      "grad_norm": 3.8625481128692627,
      "learning_rate": 3.9566377991096275e-05,
      "loss": 1.4744,
      "num_input_tokens_seen": 13886382,
      "step": 24000,
      "train_runtime": 983.6134,
      "train_tokens_per_second": 14117.723
    },
    {
      "epoch": 1.2781719532554257,
      "grad_norm": 3.4027693271636963,
      "learning_rate": 3.934900180856984e-05,
      "loss": 1.4796,
      "num_input_tokens_seen": 14171390,
      "step": 24500,
      "train_runtime": 1003.0211,
      "train_tokens_per_second": 14128.706
    },
    {
      "epoch": 1.3042570951585977,
      "grad_norm": 2.1200718879699707,
      "learning_rate": 3.9131625626043405e-05,
      "loss": 1.5107,
      "num_input_tokens_seen": 14461470,
      "step": 25000,
      "train_runtime": 1022.6959,
      "train_tokens_per_second": 14140.538
    },
    {
      "epoch": 1.3303422370617697,
      "grad_norm": 2.7789530754089355,
      "learning_rate": 3.8914249443516976e-05,
      "loss": 1.4596,
      "num_input_tokens_seen": 14747598,
      "step": 25500,
      "train_runtime": 1042.1868,
      "train_tokens_per_second": 14150.628
    },
    {
      "epoch": 1.3564273789649417,
      "grad_norm": 2.1225244998931885,
      "learning_rate": 3.869687326099054e-05,
      "loss": 1.4669,
      "num_input_tokens_seen": 15036278,
      "step": 26000,
      "train_runtime": 1061.7955,
      "train_tokens_per_second": 14161.181
    },
    {
      "epoch": 1.3825125208681135,
      "grad_norm": 2.9342072010040283,
      "learning_rate": 3.847949707846411e-05,
      "loss": 1.4947,
      "num_input_tokens_seen": 15322110,
      "step": 26500,
      "train_runtime": 1081.7408,
      "train_tokens_per_second": 14164.308
    },
    {
      "epoch": 1.4085976627712855,
      "grad_norm": 2.25174880027771,
      "learning_rate": 3.826212089593768e-05,
      "loss": 1.472,
      "num_input_tokens_seen": 15619830,
      "step": 27000,
      "train_runtime": 1101.4139,
      "train_tokens_per_second": 14181.616
    },
    {
      "epoch": 1.4346828046744573,
      "grad_norm": 2.1327219009399414,
      "learning_rate": 3.804474471341124e-05,
      "loss": 1.4745,
      "num_input_tokens_seen": 15910494,
      "step": 27500,
      "train_runtime": 1120.8296,
      "train_tokens_per_second": 14195.283
    },
    {
      "epoch": 1.4607679465776293,
      "grad_norm": 2.2169244289398193,
      "learning_rate": 3.782736853088481e-05,
      "loss": 1.4961,
      "num_input_tokens_seen": 16202854,
      "step": 28000,
      "train_runtime": 1140.1942,
      "train_tokens_per_second": 14210.609
    },
    {
      "epoch": 1.4868530884808013,
      "grad_norm": 2.7171308994293213,
      "learning_rate": 3.760999234835837e-05,
      "loss": 1.4707,
      "num_input_tokens_seen": 16491582,
      "step": 28500,
      "train_runtime": 1160.3313,
      "train_tokens_per_second": 14212.822
    },
    {
      "epoch": 1.5129382303839733,
      "grad_norm": 2.9756038188934326,
      "learning_rate": 3.739261616583194e-05,
      "loss": 1.4584,
      "num_input_tokens_seen": 16778886,
      "step": 29000,
      "train_runtime": 1180.225,
      "train_tokens_per_second": 14216.684
    },
    {
      "epoch": 1.5390233722871454,
      "grad_norm": 2.1410768032073975,
      "learning_rate": 3.717523998330551e-05,
      "loss": 1.4856,
      "num_input_tokens_seen": 17072582,
      "step": 29500,
      "train_runtime": 1199.0906,
      "train_tokens_per_second": 14237.942
    },
    {
      "epoch": 1.5651085141903172,
      "grad_norm": 2.650392532348633,
      "learning_rate": 3.695786380077908e-05,
      "loss": 1.4821,
      "num_input_tokens_seen": 17362110,
      "step": 30000,
      "train_runtime": 1218.8129,
      "train_tokens_per_second": 14245.098
    },
    {
      "epoch": 1.5911936560934892,
      "grad_norm": 2.675250291824341,
      "learning_rate": 3.6740487618252644e-05,
      "loss": 1.4694,
      "num_input_tokens_seen": 17647902,
      "step": 30500,
      "train_runtime": 1238.9908,
      "train_tokens_per_second": 14243.772
    },
    {
      "epoch": 1.617278797996661,
      "grad_norm": 2.670755386352539,
      "learning_rate": 3.652311143572621e-05,
      "loss": 1.5342,
      "num_input_tokens_seen": 17943398,
      "step": 31000,
      "train_runtime": 1259.7818,
      "train_tokens_per_second": 14243.259
    },
    {
      "epoch": 1.643363939899833,
      "grad_norm": 2.637608051300049,
      "learning_rate": 3.630573525319978e-05,
      "loss": 1.4575,
      "num_input_tokens_seen": 18231966,
      "step": 31500,
      "train_runtime": 1279.0356,
      "train_tokens_per_second": 14254.464
    },
    {
      "epoch": 1.669449081803005,
      "grad_norm": 2.5078988075256348,
      "learning_rate": 3.6088359070673345e-05,
      "loss": 1.4518,
      "num_input_tokens_seen": 18525670,
      "step": 32000,
      "train_runtime": 1297.7662,
      "train_tokens_per_second": 14275.044
    },
    {
      "epoch": 1.695534223706177,
      "grad_norm": 2.266803503036499,
      "learning_rate": 3.587098288814692e-05,
      "loss": 1.5014,
      "num_input_tokens_seen": 18815526,
      "step": 32500,
      "train_runtime": 1316.4234,
      "train_tokens_per_second": 14292.914
    },
    {
      "epoch": 1.721619365609349,
      "grad_norm": 3.0197086334228516,
      "learning_rate": 3.565360670562048e-05,
      "loss": 1.4843,
      "num_input_tokens_seen": 19112486,
      "step": 33000,
      "train_runtime": 1335.2332,
      "train_tokens_per_second": 14313.968
    },
    {
      "epoch": 1.7477045075125208,
      "grad_norm": 2.791066884994507,
      "learning_rate": 3.5436230523094046e-05,
      "loss": 1.4878,
      "num_input_tokens_seen": 19396846,
      "step": 33500,
      "train_runtime": 1353.9271,
      "train_tokens_per_second": 14326.359
    },
    {
      "epoch": 1.7737896494156928,
      "grad_norm": 2.995617628097534,
      "learning_rate": 3.521885434056761e-05,
      "loss": 1.4606,
      "num_input_tokens_seen": 19683174,
      "step": 34000,
      "train_runtime": 1372.6447,
      "train_tokens_per_second": 14339.599
    },
    {
      "epoch": 1.7998747913188646,
      "grad_norm": 2.561185836791992,
      "learning_rate": 3.5001478158041176e-05,
      "loss": 1.4802,
      "num_input_tokens_seen": 19973646,
      "step": 34500,
      "train_runtime": 1391.2808,
      "train_tokens_per_second": 14356.301
    },
    {
      "epoch": 1.8259599332220366,
      "grad_norm": 3.1782171726226807,
      "learning_rate": 3.478410197551475e-05,
      "loss": 1.4588,
      "num_input_tokens_seen": 20264526,
      "step": 35000,
      "train_runtime": 1409.9676,
      "train_tokens_per_second": 14372.334
    },
    {
      "epoch": 1.8520450751252087,
      "grad_norm": 5.561634063720703,
      "learning_rate": 3.456672579298831e-05,
      "loss": 1.4609,
      "num_input_tokens_seen": 20553006,
      "step": 35500,
      "train_runtime": 1428.6129,
      "train_tokens_per_second": 14386.686
    },
    {
      "epoch": 1.8781302170283807,
      "grad_norm": 2.784186363220215,
      "learning_rate": 3.4349349610461884e-05,
      "loss": 1.4682,
      "num_input_tokens_seen": 20844014,
      "step": 36000,
      "train_runtime": 1447.2777,
      "train_tokens_per_second": 14402.221
    },
    {
      "epoch": 1.9042153589315527,
      "grad_norm": 2.59779691696167,
      "learning_rate": 3.413197342793545e-05,
      "loss": 1.5035,
      "num_input_tokens_seen": 21130910,
      "step": 36500,
      "train_runtime": 1465.9615,
      "train_tokens_per_second": 14414.369
    },
    {
      "epoch": 1.9303005008347245,
      "grad_norm": 2.6355996131896973,
      "learning_rate": 3.391459724540902e-05,
      "loss": 1.4815,
      "num_input_tokens_seen": 21419886,
      "step": 37000,
      "train_runtime": 1484.6953,
      "train_tokens_per_second": 14427.126
    },
    {
      "epoch": 1.9563856427378965,
      "grad_norm": 2.1540422439575195,
      "learning_rate": 3.3697221062882585e-05,
      "loss": 1.4686,
      "num_input_tokens_seen": 21706222,
      "step": 37500,
      "train_runtime": 1503.3619,
      "train_tokens_per_second": 14438.454
    },
    {
      "epoch": 1.9824707846410683,
      "grad_norm": 2.1270930767059326,
      "learning_rate": 3.347984488035615e-05,
      "loss": 1.4853,
      "num_input_tokens_seen": 21997414,
      "step": 38000,
      "train_runtime": 1522.056,
      "train_tokens_per_second": 14452.434
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.347296118736267,
      "eval_runtime": 45.0902,
      "eval_samples_per_second": 850.185,
      "eval_steps_per_second": 106.276,
      "num_input_tokens_seen": 22196446,
      "step": 38336
    },
    {
      "epoch": 2.0085559265442403,
      "grad_norm": 2.812293767929077,
      "learning_rate": 3.326246869782972e-05,
      "loss": 1.4672,
      "num_input_tokens_seen": 22289118,
      "step": 38500,
      "train_runtime": 1586.651,
      "train_tokens_per_second": 14047.902
    },
    {
      "epoch": 2.0346410684474123,
      "grad_norm": 3.67232346534729,
      "learning_rate": 3.3045092515303286e-05,
      "loss": 1.4381,
      "num_input_tokens_seen": 22577710,
      "step": 39000,
      "train_runtime": 1605.3175,
      "train_tokens_per_second": 14064.327
    },
    {
      "epoch": 2.0607262103505843,
      "grad_norm": 2.2775866985321045,
      "learning_rate": 3.282771633277685e-05,
      "loss": 1.4397,
      "num_input_tokens_seen": 22866142,
      "step": 39500,
      "train_runtime": 1623.9658,
      "train_tokens_per_second": 14080.434
    },
    {
      "epoch": 2.0868113522537564,
      "grad_norm": 3.0156877040863037,
      "learning_rate": 3.2610340150250415e-05,
      "loss": 1.4657,
      "num_input_tokens_seen": 23163734,
      "step": 40000,
      "train_runtime": 1642.6646,
      "train_tokens_per_second": 14101.317
    },
    {
      "epoch": 2.1128964941569284,
      "grad_norm": 3.8104028701782227,
      "learning_rate": 3.239296396772399e-05,
      "loss": 1.4687,
      "num_input_tokens_seen": 23451982,
      "step": 40500,
      "train_runtime": 1661.3261,
      "train_tokens_per_second": 14116.423
    },
    {
      "epoch": 2.1389816360601,
      "grad_norm": 1.780987024307251,
      "learning_rate": 3.217558778519755e-05,
      "loss": 1.4432,
      "num_input_tokens_seen": 23743406,
      "step": 41000,
      "train_runtime": 1679.966,
      "train_tokens_per_second": 14133.266
    },
    {
      "epoch": 2.165066777963272,
      "grad_norm": 2.234935998916626,
      "learning_rate": 3.1958211602671117e-05,
      "loss": 1.447,
      "num_input_tokens_seen": 24037990,
      "step": 41500,
      "train_runtime": 1698.6679,
      "train_tokens_per_second": 14151.082
    },
    {
      "epoch": 2.191151919866444,
      "grad_norm": 2.599027395248413,
      "learning_rate": 3.174083542014469e-05,
      "loss": 1.4307,
      "num_input_tokens_seen": 24333206,
      "step": 42000,
      "train_runtime": 1717.3337,
      "train_tokens_per_second": 14169.177
    },
    {
      "epoch": 2.217237061769616,
      "grad_norm": 3.104538917541504,
      "learning_rate": 3.152345923761825e-05,
      "loss": 1.4165,
      "num_input_tokens_seen": 24623262,
      "step": 42500,
      "train_runtime": 1735.9704,
      "train_tokens_per_second": 14184.149
    },
    {
      "epoch": 2.243322203672788,
      "grad_norm": 2.5183098316192627,
      "learning_rate": 3.1306083055091824e-05,
      "loss": 1.4251,
      "num_input_tokens_seen": 24910790,
      "step": 43000,
      "train_runtime": 1754.6301,
      "train_tokens_per_second": 14197.175
    },
    {
      "epoch": 2.26940734557596,
      "grad_norm": 3.010117530822754,
      "learning_rate": 3.108870687256539e-05,
      "loss": 1.4719,
      "num_input_tokens_seen": 25200606,
      "step": 43500,
      "train_runtime": 1773.3028,
      "train_tokens_per_second": 14211.112
    },
    {
      "epoch": 2.295492487479132,
      "grad_norm": 3.781156063079834,
      "learning_rate": 3.087133069003896e-05,
      "loss": 1.44,
      "num_input_tokens_seen": 25494558,
      "step": 44000,
      "train_runtime": 1791.9661,
      "train_tokens_per_second": 14227.143
    },
    {
      "epoch": 2.321577629382304,
      "grad_norm": 2.3171684741973877,
      "learning_rate": 3.0653954507512525e-05,
      "loss": 1.4048,
      "num_input_tokens_seen": 25783878,
      "step": 44500,
      "train_runtime": 1810.6406,
      "train_tokens_per_second": 14240.196
    },
    {
      "epoch": 2.3476627712854756,
      "grad_norm": 2.785936117172241,
      "learning_rate": 3.0436578324986087e-05,
      "loss": 1.4333,
      "num_input_tokens_seen": 26074006,
      "step": 45000,
      "train_runtime": 1829.2827,
      "train_tokens_per_second": 14253.677
    },
    {
      "epoch": 2.3737479131886476,
      "grad_norm": 3.067204475402832,
      "learning_rate": 3.021920214245966e-05,
      "loss": 1.412,
      "num_input_tokens_seen": 26362862,
      "step": 45500,
      "train_runtime": 1847.9255,
      "train_tokens_per_second": 14266.193
    },
    {
      "epoch": 2.3998330550918197,
      "grad_norm": 3.440131902694702,
      "learning_rate": 3.0001825959933223e-05,
      "loss": 1.4343,
      "num_input_tokens_seen": 26659222,
      "step": 46000,
      "train_runtime": 1866.6572,
      "train_tokens_per_second": 14281.799
    },
    {
      "epoch": 2.4259181969949917,
      "grad_norm": 4.180527210235596,
      "learning_rate": 2.978444977740679e-05,
      "loss": 1.4231,
      "num_input_tokens_seen": 26945814,
      "step": 46500,
      "train_runtime": 1885.3282,
      "train_tokens_per_second": 14292.373
    },
    {
      "epoch": 2.4520033388981637,
      "grad_norm": 4.318091869354248,
      "learning_rate": 2.9567073594880356e-05,
      "loss": 1.4234,
      "num_input_tokens_seen": 27240518,
      "step": 47000,
      "train_runtime": 1904.0251,
      "train_tokens_per_second": 14306.806
    },
    {
      "epoch": 2.4780884808013357,
      "grad_norm": 2.4914376735687256,
      "learning_rate": 2.9349697412353928e-05,
      "loss": 1.4466,
      "num_input_tokens_seen": 27523134,
      "step": 47500,
      "train_runtime": 1922.7393,
      "train_tokens_per_second": 14314.543
    },
    {
      "epoch": 2.5041736227045073,
      "grad_norm": 2.4933414459228516,
      "learning_rate": 2.9132321229827492e-05,
      "loss": 1.4219,
      "num_input_tokens_seen": 27811630,
      "step": 48000,
      "train_runtime": 1941.4401,
      "train_tokens_per_second": 14325.258
    },
    {
      "epoch": 2.5302587646076793,
      "grad_norm": 3.3003621101379395,
      "learning_rate": 2.8914945047301057e-05,
      "loss": 1.4167,
      "num_input_tokens_seen": 28103582,
      "step": 48500,
      "train_runtime": 1960.1495,
      "train_tokens_per_second": 14337.469
    },
    {
      "epoch": 2.5563439065108513,
      "grad_norm": 2.9343557357788086,
      "learning_rate": 2.8697568864774625e-05,
      "loss": 1.4343,
      "num_input_tokens_seen": 28395062,
      "step": 49000,
      "train_runtime": 1978.7726,
      "train_tokens_per_second": 14349.836
    },
    {
      "epoch": 2.5824290484140233,
      "grad_norm": 2.247775077819824,
      "learning_rate": 2.848019268224819e-05,
      "loss": 1.44,
      "num_input_tokens_seen": 28682022,
      "step": 49500,
      "train_runtime": 1997.425,
      "train_tokens_per_second": 14359.499
    },
    {
      "epoch": 2.6085141903171953,
      "grad_norm": 3.329780101776123,
      "learning_rate": 2.826281649972176e-05,
      "loss": 1.4366,
      "num_input_tokens_seen": 28966702,
      "step": 50000,
      "train_runtime": 2016.0551,
      "train_tokens_per_second": 14368.011
    },
    {
      "epoch": 2.6345993322203674,
      "grad_norm": 2.639854907989502,
      "learning_rate": 2.8045440317195326e-05,
      "loss": 1.4175,
      "num_input_tokens_seen": 29256878,
      "step": 50500,
      "train_runtime": 2034.718,
      "train_tokens_per_second": 14378.837
    },
    {
      "epoch": 2.6606844741235394,
      "grad_norm": 4.10645055770874,
      "learning_rate": 2.7828064134668898e-05,
      "loss": 1.4229,
      "num_input_tokens_seen": 29545014,
      "step": 51000,
      "train_runtime": 2053.4349,
      "train_tokens_per_second": 14388.094
    },
    {
      "epoch": 2.6867696160267114,
      "grad_norm": 3.233084201812744,
      "learning_rate": 2.7610687952142463e-05,
      "loss": 1.4396,
      "num_input_tokens_seen": 29832302,
      "step": 51500,
      "train_runtime": 2072.1004,
      "train_tokens_per_second": 14397.132
    },
    {
      "epoch": 2.7128547579298834,
      "grad_norm": 3.0811736583709717,
      "learning_rate": 2.7393311769616027e-05,
      "loss": 1.4417,
      "num_input_tokens_seen": 30124678,
      "step": 52000,
      "train_runtime": 2090.765,
      "train_tokens_per_second": 14408.448
    },
    {
      "epoch": 2.738939899833055,
      "grad_norm": 3.9066579341888428,
      "learning_rate": 2.7175935587089595e-05,
      "loss": 1.42,
      "num_input_tokens_seen": 30411006,
      "step": 52500,
      "train_runtime": 2109.4596,
      "train_tokens_per_second": 14416.492
    },
    {
      "epoch": 2.765025041736227,
      "grad_norm": 3.752941131591797,
      "learning_rate": 2.695855940456316e-05,
      "loss": 1.4416,
      "num_input_tokens_seen": 30697118,
      "step": 53000,
      "train_runtime": 2128.1961,
      "train_tokens_per_second": 14424.008
    },
    {
      "epoch": 2.791110183639399,
      "grad_norm": 2.2906174659729004,
      "learning_rate": 2.6741183222036732e-05,
      "loss": 1.434,
      "num_input_tokens_seen": 30985038,
      "step": 53500,
      "train_runtime": 2146.9172,
      "train_tokens_per_second": 14432.339
    },
    {
      "epoch": 2.817195325542571,
      "grad_norm": 4.612029075622559,
      "learning_rate": 2.6523807039510297e-05,
      "loss": 1.4167,
      "num_input_tokens_seen": 31273350,
      "step": 54000,
      "train_runtime": 2165.6016,
      "train_tokens_per_second": 14440.952
    },
    {
      "epoch": 2.843280467445743,
      "grad_norm": 2.9580113887786865,
      "learning_rate": 2.6306430856983865e-05,
      "loss": 1.4059,
      "num_input_tokens_seen": 31560206,
      "step": 54500,
      "train_runtime": 2184.355,
      "train_tokens_per_second": 14448.295
    },
    {
      "epoch": 2.8693656093489146,
      "grad_norm": 3.1787197589874268,
      "learning_rate": 2.608905467445743e-05,
      "loss": 1.4472,
      "num_input_tokens_seen": 31852006,
      "step": 55000,
      "train_runtime": 2203.0469,
      "train_tokens_per_second": 14458.161
    },
    {
      "epoch": 2.8954507512520866,
      "grad_norm": 2.0112416744232178,
      "learning_rate": 2.5871678491930994e-05,
      "loss": 1.4311,
      "num_input_tokens_seen": 32138366,
      "step": 55500,
      "train_runtime": 2221.6719,
      "train_tokens_per_second": 14465.847
    },
    {
      "epoch": 2.9215358931552586,
      "grad_norm": 1.9806029796600342,
      "learning_rate": 2.5654302309404566e-05,
      "loss": 1.4348,
      "num_input_tokens_seen": 32427294,
      "step": 56000,
      "train_runtime": 2240.3821,
      "train_tokens_per_second": 14474.002
    },
    {
      "epoch": 2.9476210350584306,
      "grad_norm": 1.9818835258483887,
      "learning_rate": 2.543692612687813e-05,
      "loss": 1.4442,
      "num_input_tokens_seen": 32714750,
      "step": 56500,
      "train_runtime": 2259.0685,
      "train_tokens_per_second": 14481.522
    },
    {
      "epoch": 2.9737061769616027,
      "grad_norm": 2.794255256652832,
      "learning_rate": 2.52195499443517e-05,
      "loss": 1.4452,
      "num_input_tokens_seen": 33004950,
      "step": 57000,
      "train_runtime": 2277.7337,
      "train_tokens_per_second": 14490.258
    },
    {
      "epoch": 2.9997913188647747,
      "grad_norm": 3.825054407119751,
      "learning_rate": 2.5002173761825263e-05,
      "loss": 1.4031,
      "num_input_tokens_seen": 33292886,
      "step": 57500,
      "train_runtime": 2296.3777,
      "train_tokens_per_second": 14498.001
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.3332206010818481,
      "eval_runtime": 45.0681,
      "eval_samples_per_second": 850.602,
      "eval_steps_per_second": 106.328,
      "num_input_tokens_seen": 33294704,
      "step": 57504
    },
    {
      "epoch": 3.0258764607679467,
      "grad_norm": 3.42480731010437,
      "learning_rate": 2.478479757929883e-05,
      "loss": 1.3848,
      "num_input_tokens_seen": 33584784,
      "step": 58000,
      "train_runtime": 2361.2516,
      "train_tokens_per_second": 14223.298
    },
    {
      "epoch": 3.0519616026711187,
      "grad_norm": 2.5299935340881348,
      "learning_rate": 2.45674213967724e-05,
      "loss": 1.3964,
      "num_input_tokens_seen": 33871192,
      "step": 58500,
      "train_runtime": 2379.8401,
      "train_tokens_per_second": 14232.55
    },
    {
      "epoch": 3.0780467445742903,
      "grad_norm": 2.3154349327087402,
      "learning_rate": 2.4350045214245968e-05,
      "loss": 1.4092,
      "num_input_tokens_seen": 34162736,
      "step": 59000,
      "train_runtime": 2398.5047,
      "train_tokens_per_second": 14243.348
    },
    {
      "epoch": 3.1041318864774623,
      "grad_norm": 3.183199167251587,
      "learning_rate": 2.4132669031719536e-05,
      "loss": 1.4007,
      "num_input_tokens_seen": 34452880,
      "step": 59500,
      "train_runtime": 2417.223,
      "train_tokens_per_second": 14253.083
    },
    {
      "epoch": 3.1302170283806343,
      "grad_norm": 2.856942892074585,
      "learning_rate": 2.39152928491931e-05,
      "loss": 1.407,
      "num_input_tokens_seen": 34740064,
      "step": 60000,
      "train_runtime": 2435.9312,
      "train_tokens_per_second": 14261.513
    },
    {
      "epoch": 3.1563021702838063,
      "grad_norm": 3.0104143619537354,
      "learning_rate": 2.3697916666666666e-05,
      "loss": 1.3869,
      "num_input_tokens_seen": 35033296,
      "step": 60500,
      "train_runtime": 2454.6106,
      "train_tokens_per_second": 14272.446
    },
    {
      "epoch": 3.1823873121869783,
      "grad_norm": 2.1120755672454834,
      "learning_rate": 2.3480540484140234e-05,
      "loss": 1.4128,
      "num_input_tokens_seen": 35326400,
      "step": 61000,
      "train_runtime": 2473.3018,
      "train_tokens_per_second": 14283.093
    },
    {
      "epoch": 3.2084724540901504,
      "grad_norm": 2.3867533206939697,
      "learning_rate": 2.3263164301613802e-05,
      "loss": 1.421,
      "num_input_tokens_seen": 35610096,
      "step": 61500,
      "train_runtime": 2491.98,
      "train_tokens_per_second": 14289.88
    },
    {
      "epoch": 3.2345575959933224,
      "grad_norm": 2.934441566467285,
      "learning_rate": 2.304578811908737e-05,
      "loss": 1.4507,
      "num_input_tokens_seen": 35899736,
      "step": 62000,
      "train_runtime": 2510.6844,
      "train_tokens_per_second": 14298.785
    },
    {
      "epoch": 3.260642737896494,
      "grad_norm": 1.9727118015289307,
      "learning_rate": 2.2828411936560938e-05,
      "loss": 1.4167,
      "num_input_tokens_seen": 36185200,
      "step": 62500,
      "train_runtime": 2529.3663,
      "train_tokens_per_second": 14306.034
    },
    {
      "epoch": 3.286727879799666,
      "grad_norm": 2.6939632892608643,
      "learning_rate": 2.2611035754034503e-05,
      "loss": 1.4152,
      "num_input_tokens_seen": 36476040,
      "step": 63000,
      "train_runtime": 2548.104,
      "train_tokens_per_second": 14314.973
    },
    {
      "epoch": 3.312813021702838,
      "grad_norm": 2.878223180770874,
      "learning_rate": 2.2393659571508068e-05,
      "loss": 1.4027,
      "num_input_tokens_seen": 36776288,
      "step": 63500,
      "train_runtime": 2566.9571,
      "train_tokens_per_second": 14326.803
    },
    {
      "epoch": 3.33889816360601,
      "grad_norm": 2.485452175140381,
      "learning_rate": 2.2176283388981636e-05,
      "loss": 1.3992,
      "num_input_tokens_seen": 37063960,
      "step": 64000,
      "train_runtime": 2585.6586,
      "train_tokens_per_second": 14334.437
    },
    {
      "epoch": 3.364983305509182,
      "grad_norm": 3.862046241760254,
      "learning_rate": 2.1958907206455204e-05,
      "loss": 1.3968,
      "num_input_tokens_seen": 37353184,
      "step": 64500,
      "train_runtime": 2604.3949,
      "train_tokens_per_second": 14342.366
    },
    {
      "epoch": 3.391068447412354,
      "grad_norm": 2.4618258476257324,
      "learning_rate": 2.1741531023928772e-05,
      "loss": 1.4059,
      "num_input_tokens_seen": 37648648,
      "step": 65000,
      "train_runtime": 2623.1097,
      "train_tokens_per_second": 14352.678
    },
    {
      "epoch": 3.417153589315526,
      "grad_norm": 2.7443792819976807,
      "learning_rate": 2.152415484140234e-05,
      "loss": 1.3809,
      "num_input_tokens_seen": 37936072,
      "step": 65500,
      "train_runtime": 2641.8438,
      "train_tokens_per_second": 14359.695
    },
    {
      "epoch": 3.443238731218698,
      "grad_norm": 2.808088541030884,
      "learning_rate": 2.1306778658875905e-05,
      "loss": 1.4118,
      "num_input_tokens_seen": 38225568,
      "step": 66000,
      "train_runtime": 2660.549,
      "train_tokens_per_second": 14367.549
    },
    {
      "epoch": 3.4693238731218696,
      "grad_norm": 2.7997331619262695,
      "learning_rate": 2.1089402476349473e-05,
      "loss": 1.404,
      "num_input_tokens_seen": 38512144,
      "step": 66500,
      "train_runtime": 2679.274,
      "train_tokens_per_second": 14374.097
    },
    {
      "epoch": 3.4954090150250416,
      "grad_norm": 2.4735493659973145,
      "learning_rate": 2.0872026293823038e-05,
      "loss": 1.4271,
      "num_input_tokens_seen": 38797344,
      "step": 67000,
      "train_runtime": 2697.9506,
      "train_tokens_per_second": 14380.302
    },
    {
      "epoch": 3.5214941569282137,
      "grad_norm": 4.414172172546387,
      "learning_rate": 2.0654650111296606e-05,
      "loss": 1.3969,
      "num_input_tokens_seen": 39085088,
      "step": 67500,
      "train_runtime": 2716.6451,
      "train_tokens_per_second": 14387.263
    },
    {
      "epoch": 3.5475792988313857,
      "grad_norm": 2.165419340133667,
      "learning_rate": 2.0437273928770174e-05,
      "loss": 1.4137,
      "num_input_tokens_seen": 39369904,
      "step": 68000,
      "train_runtime": 2735.364,
      "train_tokens_per_second": 14392.93
    },
    {
      "epoch": 3.5736644407345577,
      "grad_norm": 2.251249074935913,
      "learning_rate": 2.021989774624374e-05,
      "loss": 1.4066,
      "num_input_tokens_seen": 39661008,
      "step": 68500,
      "train_runtime": 2754.1198,
      "train_tokens_per_second": 14400.611
    },
    {
      "epoch": 3.5997495826377297,
      "grad_norm": 2.874959945678711,
      "learning_rate": 2.0002521563717307e-05,
      "loss": 1.3949,
      "num_input_tokens_seen": 39953968,
      "step": 69000,
      "train_runtime": 2772.8706,
      "train_tokens_per_second": 14408.883
    },
    {
      "epoch": 3.6258347245409013,
      "grad_norm": 2.662647008895874,
      "learning_rate": 1.9785145381190875e-05,
      "loss": 1.4054,
      "num_input_tokens_seen": 40240768,
      "step": 69500,
      "train_runtime": 2791.6372,
      "train_tokens_per_second": 14414.756
    },
    {
      "epoch": 3.6519198664440733,
      "grad_norm": 2.5272815227508545,
      "learning_rate": 1.9567769198664444e-05,
      "loss": 1.4323,
      "num_input_tokens_seen": 40533416,
      "step": 70000,
      "train_runtime": 2810.3654,
      "train_tokens_per_second": 14422.827
    },
    {
      "epoch": 3.6780050083472453,
      "grad_norm": 2.721334457397461,
      "learning_rate": 1.9350393016138008e-05,
      "loss": 1.3872,
      "num_input_tokens_seen": 40825024,
      "step": 70500,
      "train_runtime": 2829.08,
      "train_tokens_per_second": 14430.495
    },
    {
      "epoch": 3.7040901502504173,
      "grad_norm": 2.5722897052764893,
      "learning_rate": 1.9133016833611576e-05,
      "loss": 1.372,
      "num_input_tokens_seen": 41113376,
      "step": 71000,
      "train_runtime": 2847.8223,
      "train_tokens_per_second": 14436.777
    },
    {
      "epoch": 3.7301752921535893,
      "grad_norm": 2.262794256210327,
      "learning_rate": 1.891564065108514e-05,
      "loss": 1.3728,
      "num_input_tokens_seen": 41401936,
      "step": 71500,
      "train_runtime": 2866.4955,
      "train_tokens_per_second": 14443.398
    },
    {
      "epoch": 3.7562604340567614,
      "grad_norm": 2.6011643409729004,
      "learning_rate": 1.869826446855871e-05,
      "loss": 1.3901,
      "num_input_tokens_seen": 41689120,
      "step": 72000,
      "train_runtime": 2885.163,
      "train_tokens_per_second": 14449.485
    },
    {
      "epoch": 3.7823455759599334,
      "grad_norm": 2.6435554027557373,
      "learning_rate": 1.8480888286032277e-05,
      "loss": 1.4071,
      "num_input_tokens_seen": 41974720,
      "step": 72500,
      "train_runtime": 2903.8827,
      "train_tokens_per_second": 14454.689
    },
    {
      "epoch": 3.8084307178631054,
      "grad_norm": 2.489372730255127,
      "learning_rate": 1.8263512103505846e-05,
      "loss": 1.4023,
      "num_input_tokens_seen": 42264016,
      "step": 73000,
      "train_runtime": 2922.5501,
      "train_tokens_per_second": 14461.349
    },
    {
      "epoch": 3.8345158597662774,
      "grad_norm": 2.4132964611053467,
      "learning_rate": 1.8046135920979414e-05,
      "loss": 1.4153,
      "num_input_tokens_seen": 42558416,
      "step": 73500,
      "train_runtime": 2941.2299,
      "train_tokens_per_second": 14469.599
    },
    {
      "epoch": 3.860601001669449,
      "grad_norm": 3.1832597255706787,
      "learning_rate": 1.782875973845298e-05,
      "loss": 1.4076,
      "num_input_tokens_seen": 42847504,
      "step": 74000,
      "train_runtime": 2959.9571,
      "train_tokens_per_second": 14475.718
    },
    {
      "epoch": 3.886686143572621,
      "grad_norm": 2.246975898742676,
      "learning_rate": 1.7611383555926543e-05,
      "loss": 1.3755,
      "num_input_tokens_seen": 43137392,
      "step": 74500,
      "train_runtime": 2978.6745,
      "train_tokens_per_second": 14482.077
    },
    {
      "epoch": 3.912771285475793,
      "grad_norm": 3.47536039352417,
      "learning_rate": 1.739400737340011e-05,
      "loss": 1.3837,
      "num_input_tokens_seen": 43421200,
      "step": 75000,
      "train_runtime": 2997.3314,
      "train_tokens_per_second": 14486.62
    },
    {
      "epoch": 3.938856427378965,
      "grad_norm": 2.817647695541382,
      "learning_rate": 1.717663119087368e-05,
      "loss": 1.3869,
      "num_input_tokens_seen": 43714432,
      "step": 75500,
      "train_runtime": 3015.9535,
      "train_tokens_per_second": 14494.399
    },
    {
      "epoch": 3.964941569282137,
      "grad_norm": 2.670565366744995,
      "learning_rate": 1.6959255008347248e-05,
      "loss": 1.3875,
      "num_input_tokens_seen": 44005040,
      "step": 76000,
      "train_runtime": 3034.653,
      "train_tokens_per_second": 14500.847
    },
    {
      "epoch": 3.9910267111853086,
      "grad_norm": 3.01701021194458,
      "learning_rate": 1.6741878825820816e-05,
      "loss": 1.3875,
      "num_input_tokens_seen": 44295304,
      "step": 76500,
      "train_runtime": 3053.3496,
      "train_tokens_per_second": 14507.118
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.3256505727767944,
      "eval_runtime": 45.046,
      "eval_samples_per_second": 851.018,
      "eval_steps_per_second": 106.38,
      "num_input_tokens_seen": 44395724,
      "step": 76672
    },
    {
      "epoch": 4.017111853088481,
      "grad_norm": 2.520019292831421,
      "learning_rate": 1.652450264329438e-05,
      "loss": 1.3838,
      "num_input_tokens_seen": 44585564,
      "step": 77000,
      "train_runtime": 3118.0069,
      "train_tokens_per_second": 14299.379
    },
    {
      "epoch": 4.043196994991653,
      "grad_norm": 4.146509170532227,
      "learning_rate": 1.6307126460767945e-05,
      "loss": 1.3596,
      "num_input_tokens_seen": 44870940,
      "step": 77500,
      "train_runtime": 3136.5879,
      "train_tokens_per_second": 14305.654
    },
    {
      "epoch": 4.069282136894825,
      "grad_norm": 2.3407187461853027,
      "learning_rate": 1.6089750278241514e-05,
      "loss": 1.3979,
      "num_input_tokens_seen": 45165140,
      "step": 78000,
      "train_runtime": 3155.153,
      "train_tokens_per_second": 14314.723
    },
    {
      "epoch": 4.095367278797997,
      "grad_norm": 2.992572069168091,
      "learning_rate": 1.5872374095715082e-05,
      "loss": 1.4121,
      "num_input_tokens_seen": 45458076,
      "step": 78500,
      "train_runtime": 3173.7885,
      "train_tokens_per_second": 14322.97
    },
    {
      "epoch": 4.121452420701169,
      "grad_norm": 3.490511655807495,
      "learning_rate": 1.565499791318865e-05,
      "loss": 1.37,
      "num_input_tokens_seen": 45746588,
      "step": 79000,
      "train_runtime": 3192.4179,
      "train_tokens_per_second": 14329.762
    },
    {
      "epoch": 4.147537562604341,
      "grad_norm": 3.6620404720306396,
      "learning_rate": 1.5437621730662215e-05,
      "loss": 1.398,
      "num_input_tokens_seen": 46037020,
      "step": 79500,
      "train_runtime": 3212.1684,
      "train_tokens_per_second": 14332.069
    },
    {
      "epoch": 4.173622704507513,
      "grad_norm": 2.709702253341675,
      "learning_rate": 1.5220245548135783e-05,
      "loss": 1.3714,
      "num_input_tokens_seen": 46327764,
      "step": 80000,
      "train_runtime": 3232.3645,
      "train_tokens_per_second": 14332.469
    },
    {
      "epoch": 4.199707846410685,
      "grad_norm": 3.0171260833740234,
      "learning_rate": 1.5002869365609348e-05,
      "loss": 1.3777,
      "num_input_tokens_seen": 46608924,
      "step": 80500,
      "train_runtime": 3252.0642,
      "train_tokens_per_second": 14332.104
    },
    {
      "epoch": 4.225792988313857,
      "grad_norm": 2.588928461074829,
      "learning_rate": 1.4785493183082916e-05,
      "loss": 1.3768,
      "num_input_tokens_seen": 46898436,
      "step": 81000,
      "train_runtime": 3271.9745,
      "train_tokens_per_second": 14333.375
    },
    {
      "epoch": 4.251878130217029,
      "grad_norm": 2.5653598308563232,
      "learning_rate": 1.4568117000556484e-05,
      "loss": 1.3753,
      "num_input_tokens_seen": 47187548,
      "step": 81500,
      "train_runtime": 3291.5411,
      "train_tokens_per_second": 14336.005
    },
    {
      "epoch": 4.2779632721202,
      "grad_norm": 3.236936330795288,
      "learning_rate": 1.435074081803005e-05,
      "loss": 1.3987,
      "num_input_tokens_seen": 47475276,
      "step": 82000,
      "train_runtime": 3311.1953,
      "train_tokens_per_second": 14337.806
    },
    {
      "epoch": 4.304048414023372,
      "grad_norm": 2.4497241973876953,
      "learning_rate": 1.4133364635503618e-05,
      "loss": 1.36,
      "num_input_tokens_seen": 47768556,
      "step": 82500,
      "train_runtime": 3330.633,
      "train_tokens_per_second": 14342.185
    },
    {
      "epoch": 4.330133555926544,
      "grad_norm": 3.381693124771118,
      "learning_rate": 1.3915988452977185e-05,
      "loss": 1.4122,
      "num_input_tokens_seen": 48056012,
      "step": 83000,
      "train_runtime": 3350.0565,
      "train_tokens_per_second": 14344.836
    },
    {
      "epoch": 4.356218697829716,
      "grad_norm": 2.8100342750549316,
      "learning_rate": 1.3698612270450753e-05,
      "loss": 1.3836,
      "num_input_tokens_seen": 48341348,
      "step": 83500,
      "train_runtime": 3369.3072,
      "train_tokens_per_second": 14347.563
    },
    {
      "epoch": 4.382303839732888,
      "grad_norm": 3.380335569381714,
      "learning_rate": 1.3481236087924318e-05,
      "loss": 1.3726,
      "num_input_tokens_seen": 48631420,
      "step": 84000,
      "train_runtime": 3389.3206,
      "train_tokens_per_second": 14348.427
    },
    {
      "epoch": 4.40838898163606,
      "grad_norm": 2.434285879135132,
      "learning_rate": 1.3263859905397884e-05,
      "loss": 1.3937,
      "num_input_tokens_seen": 48915972,
      "step": 84500,
      "train_runtime": 3409.284,
      "train_tokens_per_second": 14347.873
    },
    {
      "epoch": 4.434474123539232,
      "grad_norm": 2.8802988529205322,
      "learning_rate": 1.3046483722871452e-05,
      "loss": 1.3761,
      "num_input_tokens_seen": 49203916,
      "step": 85000,
      "train_runtime": 3428.2362,
      "train_tokens_per_second": 14352.545
    },
    {
      "epoch": 4.460559265442404,
      "grad_norm": 3.350780963897705,
      "learning_rate": 1.282910754034502e-05,
      "loss": 1.3766,
      "num_input_tokens_seen": 49493860,
      "step": 85500,
      "train_runtime": 3447.3803,
      "train_tokens_per_second": 14356.948
    },
    {
      "epoch": 4.486644407345576,
      "grad_norm": 2.4271440505981445,
      "learning_rate": 1.2611731357818587e-05,
      "loss": 1.3672,
      "num_input_tokens_seen": 49778012,
      "step": 86000,
      "train_runtime": 3466.8453,
      "train_tokens_per_second": 14358.302
    },
    {
      "epoch": 4.512729549248748,
      "grad_norm": 2.5384743213653564,
      "learning_rate": 1.2394355175292154e-05,
      "loss": 1.3701,
      "num_input_tokens_seen": 50065764,
      "step": 86500,
      "train_runtime": 3486.5719,
      "train_tokens_per_second": 14359.596
    },
    {
      "epoch": 4.53881469115192,
      "grad_norm": 3.011307716369629,
      "learning_rate": 1.2176978992765722e-05,
      "loss": 1.3884,
      "num_input_tokens_seen": 50349860,
      "step": 87000,
      "train_runtime": 3505.9535,
      "train_tokens_per_second": 14361.246
    },
    {
      "epoch": 4.564899833055092,
      "grad_norm": 2.5870578289031982,
      "learning_rate": 1.1959602810239288e-05,
      "loss": 1.3991,
      "num_input_tokens_seen": 50643260,
      "step": 87500,
      "train_runtime": 3525.1982,
      "train_tokens_per_second": 14366.074
    },
    {
      "epoch": 4.590984974958264,
      "grad_norm": 3.0917413234710693,
      "learning_rate": 1.1742226627712856e-05,
      "loss": 1.3876,
      "num_input_tokens_seen": 50934732,
      "step": 88000,
      "train_runtime": 3544.4536,
      "train_tokens_per_second": 14370.264
    },
    {
      "epoch": 4.617070116861436,
      "grad_norm": 2.181250810623169,
      "learning_rate": 1.1524850445186423e-05,
      "loss": 1.3801,
      "num_input_tokens_seen": 51225644,
      "step": 88500,
      "train_runtime": 3563.7836,
      "train_tokens_per_second": 14373.949
    },
    {
      "epoch": 4.643155258764608,
      "grad_norm": 3.146324872970581,
      "learning_rate": 1.130747426265999e-05,
      "loss": 1.3451,
      "num_input_tokens_seen": 51515932,
      "step": 89000,
      "train_runtime": 3583.4863,
      "train_tokens_per_second": 14375.925
    },
    {
      "epoch": 4.66924040066778,
      "grad_norm": 2.4125654697418213,
      "learning_rate": 1.1090098080133557e-05,
      "loss": 1.3759,
      "num_input_tokens_seen": 51803372,
      "step": 89500,
      "train_runtime": 3602.6645,
      "train_tokens_per_second": 14379.183
    },
    {
      "epoch": 4.695325542570951,
      "grad_norm": 3.1065971851348877,
      "learning_rate": 1.0872721897607122e-05,
      "loss": 1.3846,
      "num_input_tokens_seen": 52096660,
      "step": 90000,
      "train_runtime": 3621.3864,
      "train_tokens_per_second": 14385.833
    },
    {
      "epoch": 4.721410684474123,
      "grad_norm": 2.9472384452819824,
      "learning_rate": 1.065534571508069e-05,
      "loss": 1.3826,
      "num_input_tokens_seen": 52385124,
      "step": 90500,
      "train_runtime": 3640.3069,
      "train_tokens_per_second": 14390.304
    },
    {
      "epoch": 4.747495826377295,
      "grad_norm": 3.2821028232574463,
      "learning_rate": 1.0437969532554258e-05,
      "loss": 1.3913,
      "num_input_tokens_seen": 52675284,
      "step": 91000,
      "train_runtime": 3659.1435,
      "train_tokens_per_second": 14395.523
    },
    {
      "epoch": 4.773580968280467,
      "grad_norm": 2.897390604019165,
      "learning_rate": 1.0220593350027825e-05,
      "loss": 1.3745,
      "num_input_tokens_seen": 52966012,
      "step": 91500,
      "train_runtime": 3677.8728,
      "train_tokens_per_second": 14401.263
    },
    {
      "epoch": 4.799666110183639,
      "grad_norm": 2.4328722953796387,
      "learning_rate": 1.0003217167501391e-05,
      "loss": 1.3675,
      "num_input_tokens_seen": 53260060,
      "step": 92000,
      "train_runtime": 3696.7483,
      "train_tokens_per_second": 14407.272
    },
    {
      "epoch": 4.825751252086811,
      "grad_norm": 2.3648526668548584,
      "learning_rate": 9.78584098497496e-06,
      "loss": 1.348,
      "num_input_tokens_seen": 53549900,
      "step": 92500,
      "train_runtime": 3715.4001,
      "train_tokens_per_second": 14412.956
    },
    {
      "epoch": 4.851836393989983,
      "grad_norm": 2.3531742095947266,
      "learning_rate": 9.568464802448526e-06,
      "loss": 1.3779,
      "num_input_tokens_seen": 53844180,
      "step": 93000,
      "train_runtime": 3734.2446,
      "train_tokens_per_second": 14419.029
    },
    {
      "epoch": 4.877921535893155,
      "grad_norm": 2.4701406955718994,
      "learning_rate": 9.351088619922092e-06,
      "loss": 1.3688,
      "num_input_tokens_seen": 54132452,
      "step": 93500,
      "train_runtime": 3752.9114,
      "train_tokens_per_second": 14424.122
    },
    {
      "epoch": 4.904006677796327,
      "grad_norm": 3.4860074520111084,
      "learning_rate": 9.13371243739566e-06,
      "loss": 1.3786,
      "num_input_tokens_seen": 54424212,
      "step": 94000,
      "train_runtime": 3771.7803,
      "train_tokens_per_second": 14429.316
    },
    {
      "epoch": 4.930091819699499,
      "grad_norm": 2.331005811691284,
      "learning_rate": 8.916336254869227e-06,
      "loss": 1.3582,
      "num_input_tokens_seen": 54719684,
      "step": 94500,
      "train_runtime": 3790.6832,
      "train_tokens_per_second": 14435.309
    },
    {
      "epoch": 4.956176961602671,
      "grad_norm": 2.379862070083618,
      "learning_rate": 8.698960072342793e-06,
      "loss": 1.3838,
      "num_input_tokens_seen": 55006740,
      "step": 95000,
      "train_runtime": 3809.755,
      "train_tokens_per_second": 14438.393
    },
    {
      "epoch": 4.982262103505843,
      "grad_norm": 3.527317523956299,
      "learning_rate": 8.481583889816362e-06,
      "loss": 1.3944,
      "num_input_tokens_seen": 55294876,
      "step": 95500,
      "train_runtime": 3829.1057,
      "train_tokens_per_second": 14440.676
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.3229724168777466,
      "eval_runtime": 46.7304,
      "eval_samples_per_second": 820.343,
      "eval_steps_per_second": 102.546,
      "num_input_tokens_seen": 55492754,
      "step": 95840
    },
    {
      "epoch": 5.008347245409015,
      "grad_norm": 2.8223490715026855,
      "learning_rate": 8.264207707289928e-06,
      "loss": 1.3501,
      "num_input_tokens_seen": 55585722,
      "step": 96000,
      "train_runtime": 3896.8789,
      "train_tokens_per_second": 14264.165
    },
    {
      "epoch": 5.034432387312187,
      "grad_norm": 3.312976360321045,
      "learning_rate": 8.046831524763496e-06,
      "loss": 1.364,
      "num_input_tokens_seen": 55873162,
      "step": 96500,
      "train_runtime": 3916.5275,
      "train_tokens_per_second": 14265.995
    },
    {
      "epoch": 5.060517529215359,
      "grad_norm": 4.365355491638184,
      "learning_rate": 7.829455342237061e-06,
      "loss": 1.3657,
      "num_input_tokens_seen": 56159210,
      "step": 97000,
      "train_runtime": 3935.5771,
      "train_tokens_per_second": 14269.625
    },
    {
      "epoch": 5.086602671118531,
      "grad_norm": 2.77451753616333,
      "learning_rate": 7.612079159710629e-06,
      "loss": 1.3722,
      "num_input_tokens_seen": 56450234,
      "step": 97500,
      "train_runtime": 3954.8081,
      "train_tokens_per_second": 14273.824
    },
    {
      "epoch": 5.112687813021703,
      "grad_norm": 2.028353214263916,
      "learning_rate": 7.3947029771841964e-06,
      "loss": 1.3778,
      "num_input_tokens_seen": 56740002,
      "step": 98000,
      "train_runtime": 3973.4854,
      "train_tokens_per_second": 14279.655
    },
    {
      "epoch": 5.138772954924875,
      "grad_norm": 2.0676374435424805,
      "learning_rate": 7.177326794657763e-06,
      "loss": 1.3462,
      "num_input_tokens_seen": 57027226,
      "step": 98500,
      "train_runtime": 3992.3304,
      "train_tokens_per_second": 14284.195
    },
    {
      "epoch": 5.164858096828047,
      "grad_norm": 2.0867531299591064,
      "learning_rate": 6.95995061213133e-06,
      "loss": 1.3739,
      "num_input_tokens_seen": 57316978,
      "step": 99000,
      "train_runtime": 4012.1011,
      "train_tokens_per_second": 14286.025
    },
    {
      "epoch": 5.190943238731219,
      "grad_norm": 2.3995723724365234,
      "learning_rate": 6.7425744296048975e-06,
      "loss": 1.3821,
      "num_input_tokens_seen": 57607834,
| "step": 99500, | |
| "train_runtime": 4031.6912, | |
| "train_tokens_per_second": 14288.752 | |
| }, | |
| { | |
| "epoch": 5.217028380634391, | |
| "grad_norm": 3.466399669647217, | |
| "learning_rate": 6.525198247078465e-06, | |
| "loss": 1.3499, | |
| "num_input_tokens_seen": 57896786, | |
| "step": 100000, | |
| "train_runtime": 4051.2038, | |
| "train_tokens_per_second": 14291.255 | |
| }, | |
| { | |
| "epoch": 5.243113522537563, | |
| "grad_norm": 2.673947811126709, | |
| "learning_rate": 6.307822064552031e-06, | |
| "loss": 1.3703, | |
| "num_input_tokens_seen": 58184506, | |
| "step": 100500, | |
| "train_runtime": 4070.3919, | |
| "train_tokens_per_second": 14294.571 | |
| }, | |
| { | |
| "epoch": 5.269198664440735, | |
| "grad_norm": 2.0675642490386963, | |
| "learning_rate": 6.0904458820255986e-06, | |
| "loss": 1.3759, | |
| "num_input_tokens_seen": 58473186, | |
| "step": 101000, | |
| "train_runtime": 4090.425, | |
| "train_tokens_per_second": 14295.137 | |
| }, | |
| { | |
| "epoch": 5.295283806343907, | |
| "grad_norm": 2.8680272102355957, | |
| "learning_rate": 5.873069699499165e-06, | |
| "loss": 1.3811, | |
| "num_input_tokens_seen": 58764498, | |
| "step": 101500, | |
| "train_runtime": 4109.7435, | |
| "train_tokens_per_second": 14298.824 | |
| }, | |
| { | |
| "epoch": 5.321368948247079, | |
| "grad_norm": 3.1335153579711914, | |
| "learning_rate": 5.655693516972733e-06, | |
| "loss": 1.3914, | |
| "num_input_tokens_seen": 59053762, | |
| "step": 102000, | |
| "train_runtime": 4129.2443, | |
| "train_tokens_per_second": 14301.348 | |
| }, | |
| { | |
| "epoch": 5.347454090150251, | |
| "grad_norm": 4.179940223693848, | |
| "learning_rate": 5.4383173344463e-06, | |
| "loss": 1.3353, | |
| "num_input_tokens_seen": 59346138, | |
| "step": 102500, | |
| "train_runtime": 4148.9629, | |
| "train_tokens_per_second": 14303.849 | |
| }, | |
| { | |
| "epoch": 5.373539232053423, | |
| "grad_norm": 2.837871551513672, | |
| "learning_rate": 5.220941151919867e-06, | |
| "loss": 1.3592, | |
| "num_input_tokens_seen": 59634050, | |
| "step": 103000, | |
| "train_runtime": 4172.193, | |
| "train_tokens_per_second": 14293.215 | |
| }, | |
| { | |
| "epoch": 5.399624373956595, | |
| "grad_norm": 2.620933771133423, | |
| "learning_rate": 5.003564969393433e-06, | |
| "loss": 1.3438, | |
| "num_input_tokens_seen": 59920002, | |
| "step": 103500, | |
| "train_runtime": 4191.8017, | |
| "train_tokens_per_second": 14294.57 | |
| }, | |
| { | |
| "epoch": 5.425709515859766, | |
| "grad_norm": 2.974597454071045, | |
| "learning_rate": 4.786188786867001e-06, | |
| "loss": 1.3848, | |
| "num_input_tokens_seen": 60208490, | |
| "step": 104000, | |
| "train_runtime": 4210.5451, | |
| "train_tokens_per_second": 14299.453 | |
| }, | |
| { | |
| "epoch": 5.451794657762938, | |
| "grad_norm": 2.7892649173736572, | |
| "learning_rate": 4.568812604340568e-06, | |
| "loss": 1.3947, | |
| "num_input_tokens_seen": 60497570, | |
| "step": 104500, | |
| "train_runtime": 4229.6543, | |
| "train_tokens_per_second": 14303.195 | |
| }, | |
| { | |
| "epoch": 5.47787979966611, | |
| "grad_norm": 2.9217751026153564, | |
| "learning_rate": 4.3514364218141344e-06, | |
| "loss": 1.3637, | |
| "num_input_tokens_seen": 60791682, | |
| "step": 105000, | |
| "train_runtime": 4249.1377, | |
| "train_tokens_per_second": 14306.828 | |
| }, | |
| { | |
| "epoch": 5.503964941569282, | |
| "grad_norm": 2.3021788597106934, | |
| "learning_rate": 4.134060239287702e-06, | |
| "loss": 1.3772, | |
| "num_input_tokens_seen": 61081546, | |
| "step": 105500, | |
| "train_runtime": 4268.0879, | |
| "train_tokens_per_second": 14311.22 | |
| }, | |
| { | |
| "epoch": 5.530050083472454, | |
| "grad_norm": 2.520854949951172, | |
| "learning_rate": 3.916684056761269e-06, | |
| "loss": 1.3595, | |
| "num_input_tokens_seen": 61376714, | |
| "step": 106000, | |
| "train_runtime": 4287.1193, | |
| "train_tokens_per_second": 14316.54 | |
| }, | |
| { | |
| "epoch": 5.556135225375626, | |
| "grad_norm": 2.5124387741088867, | |
| "learning_rate": 3.6993078742348355e-06, | |
| "loss": 1.3755, | |
| "num_input_tokens_seen": 61670282, | |
| "step": 106500, | |
| "train_runtime": 4306.7613, | |
| "train_tokens_per_second": 14319.41 | |
| }, | |
| { | |
| "epoch": 5.582220367278798, | |
| "grad_norm": 3.6542813777923584, | |
| "learning_rate": 3.4819316917084032e-06, | |
| "loss": 1.3299, | |
| "num_input_tokens_seen": 61959530, | |
| "step": 107000, | |
| "train_runtime": 4325.9495, | |
| "train_tokens_per_second": 14322.758 | |
| }, | |
| { | |
| "epoch": 5.60830550918197, | |
| "grad_norm": 2.480987787246704, | |
| "learning_rate": 3.2645555091819697e-06, | |
| "loss": 1.3488, | |
| "num_input_tokens_seen": 62248610, | |
| "step": 107500, | |
| "train_runtime": 4344.6789, | |
| "train_tokens_per_second": 14327.551 | |
| }, | |
| { | |
| "epoch": 5.634390651085142, | |
| "grad_norm": 3.620051383972168, | |
| "learning_rate": 3.047179326655537e-06, | |
| "loss": 1.3663, | |
| "num_input_tokens_seen": 62535434, | |
| "step": 108000, | |
| "train_runtime": 4363.4204, | |
| "train_tokens_per_second": 14331.746 | |
| }, | |
| { | |
| "epoch": 5.660475792988314, | |
| "grad_norm": 2.9154930114746094, | |
| "learning_rate": 2.8298031441291043e-06, | |
| "loss": 1.3719, | |
| "num_input_tokens_seen": 62824930, | |
| "step": 108500, | |
| "train_runtime": 4382.1169, | |
| "train_tokens_per_second": 14336.662 | |
| }, | |
| { | |
| "epoch": 5.686560934891486, | |
| "grad_norm": 2.5228476524353027, | |
| "learning_rate": 2.612426961602671e-06, | |
| "loss": 1.3476, | |
| "num_input_tokens_seen": 63114954, | |
| "step": 109000, | |
| "train_runtime": 4400.737, | |
| "train_tokens_per_second": 14341.905 | |
| }, | |
| { | |
| "epoch": 5.712646076794658, | |
| "grad_norm": 2.6546239852905273, | |
| "learning_rate": 2.3950507790762385e-06, | |
| "loss": 1.3474, | |
| "num_input_tokens_seen": 63403826, | |
| "step": 109500, | |
| "train_runtime": 4419.5486, | |
| "train_tokens_per_second": 14346.222 | |
| }, | |
| { | |
| "epoch": 5.73873121869783, | |
| "grad_norm": 3.8582890033721924, | |
| "learning_rate": 2.1776745965498054e-06, | |
| "loss": 1.3451, | |
| "num_input_tokens_seen": 63689762, | |
| "step": 110000, | |
| "train_runtime": 4438.8474, | |
| "train_tokens_per_second": 14348.266 | |
| }, | |
| { | |
| "epoch": 5.764816360601001, | |
| "grad_norm": 3.4054343700408936, | |
| "learning_rate": 1.9602984140233727e-06, | |
| "loss": 1.3811, | |
| "num_input_tokens_seen": 63978794, | |
| "step": 110500, | |
| "train_runtime": 4458.4476, | |
| "train_tokens_per_second": 14350.016 | |
| }, | |
| { | |
| "epoch": 5.790901502504173, | |
| "grad_norm": 2.907578468322754, | |
| "learning_rate": 1.7429222314969393e-06, | |
| "loss": 1.3843, | |
| "num_input_tokens_seen": 64270234, | |
| "step": 111000, | |
| "train_runtime": 4478.4361, | |
| "train_tokens_per_second": 14351.044 | |
| }, | |
| { | |
| "epoch": 5.816986644407345, | |
| "grad_norm": 2.72294020652771, | |
| "learning_rate": 1.5255460489705064e-06, | |
| "loss": 1.3511, | |
| "num_input_tokens_seen": 64557130, | |
| "step": 111500, | |
| "train_runtime": 4498.1115, | |
| "train_tokens_per_second": 14352.052 | |
| }, | |
| { | |
| "epoch": 5.843071786310517, | |
| "grad_norm": 2.910423755645752, | |
| "learning_rate": 1.3081698664440735e-06, | |
| "loss": 1.355, | |
| "num_input_tokens_seen": 64847634, | |
| "step": 112000, | |
| "train_runtime": 4517.4916, | |
| "train_tokens_per_second": 14354.788 | |
| }, | |
| { | |
| "epoch": 5.869156928213689, | |
| "grad_norm": 2.3920516967773438, | |
| "learning_rate": 1.0907936839176406e-06, | |
| "loss": 1.3696, | |
| "num_input_tokens_seen": 65135722, | |
| "step": 112500, | |
| "train_runtime": 4536.2006, | |
| "train_tokens_per_second": 14359.092 | |
| }, | |
| { | |
| "epoch": 5.895242070116861, | |
| "grad_norm": 2.619903087615967, | |
| "learning_rate": 8.734175013912075e-07, | |
| "loss": 1.3515, | |
| "num_input_tokens_seen": 65423234, | |
| "step": 113000, | |
| "train_runtime": 4554.8848, | |
| "train_tokens_per_second": 14363.312 | |
| }, | |
| { | |
| "epoch": 5.921327212020033, | |
| "grad_norm": 2.61676025390625, | |
| "learning_rate": 6.560413188647746e-07, | |
| "loss": 1.3784, | |
| "num_input_tokens_seen": 65718338, | |
| "step": 113500, | |
| "train_runtime": 4573.7838, | |
| "train_tokens_per_second": 14368.484 | |
| }, | |
| { | |
| "epoch": 5.947412353923205, | |
| "grad_norm": 2.6655712127685547, | |
| "learning_rate": 4.3866513633834173e-07, | |
| "loss": 1.3672, | |
| "num_input_tokens_seen": 66007642, | |
| "step": 114000, | |
| "train_runtime": 4592.9685, | |
| "train_tokens_per_second": 14371.456 | |
| }, | |
| { | |
| "epoch": 5.973497495826377, | |
| "grad_norm": 2.606362819671631, | |
| "learning_rate": 2.2128895381190875e-07, | |
| "loss": 1.3579, | |
| "num_input_tokens_seen": 66290722, | |
| "step": 114500, | |
| "train_runtime": 4612.1986, | |
| "train_tokens_per_second": 14372.911 | |
| }, | |
| { | |
| "epoch": 5.999582637729549, | |
| "grad_norm": 2.8683297634124756, | |
| "learning_rate": 3.912771285475793e-09, | |
| "loss": 1.3687, | |
| "num_input_tokens_seen": 66581138, | |
| "step": 115000, | |
| "train_runtime": 4632.4758, | |
| "train_tokens_per_second": 14372.69 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 1.319564938545227, | |
| "eval_runtime": 45.0275, | |
| "eval_samples_per_second": 851.369, | |
| "eval_steps_per_second": 106.424, | |
| "num_input_tokens_seen": 66585670, | |
| "step": 115008 | |
| }, | |
| { | |
| "epoch": 6.025667779632721, | |
| "grad_norm": 3.327254295349121, | |
| "learning_rate": 1.9871921953255425e-05, | |
| "loss": 1.3775, | |
| "num_input_tokens_seen": 66874998, | |
| "step": 115500, | |
| "train_runtime": 18.7889, | |
| "train_tokens_per_second": 3559284.107 | |
| }, | |
| { | |
| "epoch": 6.051752921535893, | |
| "grad_norm": 2.0363502502441406, | |
| "learning_rate": 1.9741496243739565e-05, | |
| "loss": 1.3598, | |
| "num_input_tokens_seen": 67165902, | |
| "step": 116000, | |
| "train_runtime": 38.0722, | |
| "train_tokens_per_second": 1764173.697 | |
| }, | |
| { | |
| "epoch": 6.077838063439065, | |
| "grad_norm": 3.2186789512634277, | |
| "learning_rate": 1.9611070534223708e-05, | |
| "loss": 1.3582, | |
| "num_input_tokens_seen": 67454310, | |
| "step": 116500, | |
| "train_runtime": 56.9892, | |
| "train_tokens_per_second": 1183632.851 | |
| }, | |
| { | |
| "epoch": 6.103923205342237, | |
| "grad_norm": 3.1102960109710693, | |
| "learning_rate": 1.9480644824707847e-05, | |
| "loss": 1.342, | |
| "num_input_tokens_seen": 67741886, | |
| "step": 117000, | |
| "train_runtime": 76.1489, | |
| "train_tokens_per_second": 889597.261 | |
| }, | |
| { | |
| "epoch": 6.130008347245409, | |
| "grad_norm": 2.1836190223693848, | |
| "learning_rate": 1.9350219115191987e-05, | |
| "loss": 1.3578, | |
| "num_input_tokens_seen": 68030070, | |
| "step": 117500, | |
| "train_runtime": 95.3289, | |
| "train_tokens_per_second": 713635.053 | |
| }, | |
| { | |
| "epoch": 6.156093489148581, | |
| "grad_norm": 2.637117624282837, | |
| "learning_rate": 1.921979340567613e-05, | |
| "loss": 1.3561, | |
| "num_input_tokens_seen": 68313278, | |
| "step": 118000, | |
| "train_runtime": 114.4954, | |
| "train_tokens_per_second": 596646.246 | |
| }, | |
| { | |
| "epoch": 6.182178631051753, | |
| "grad_norm": 2.454594612121582, | |
| "learning_rate": 1.908936769616027e-05, | |
| "loss": 1.3897, | |
| "num_input_tokens_seen": 68603790, | |
| "step": 118500, | |
| "train_runtime": 133.1929, | |
| "train_tokens_per_second": 515071.035 | |
| }, | |
| { | |
| "epoch": 6.208263772954925, | |
| "grad_norm": 2.6059861183166504, | |
| "learning_rate": 1.895894198664441e-05, | |
| "loss": 1.3662, | |
| "num_input_tokens_seen": 68897534, | |
| "step": 119000, | |
| "train_runtime": 152.3637, | |
| "train_tokens_per_second": 452191.312 | |
| }, | |
| { | |
| "epoch": 6.234348914858097, | |
| "grad_norm": 2.963710308074951, | |
| "learning_rate": 1.8828516277128548e-05, | |
| "loss": 1.3688, | |
| "num_input_tokens_seen": 69185822, | |
| "step": 119500, | |
| "train_runtime": 171.3295, | |
| "train_tokens_per_second": 403817.306 | |
| }, | |
| { | |
| "epoch": 6.260434056761269, | |
| "grad_norm": 2.3006739616394043, | |
| "learning_rate": 1.8698090567612688e-05, | |
| "loss": 1.3867, | |
| "num_input_tokens_seen": 69477766, | |
| "step": 120000, | |
| "train_runtime": 189.6964, | |
| "train_tokens_per_second": 366257.718 | |
| }, | |
| { | |
| "epoch": 6.286519198664441, | |
| "grad_norm": 2.4806406497955322, | |
| "learning_rate": 1.8567664858096827e-05, | |
| "loss": 1.349, | |
| "num_input_tokens_seen": 69770974, | |
| "step": 120500, | |
| "train_runtime": 208.9904, | |
| "train_tokens_per_second": 333847.728 | |
| }, | |
| { | |
| "epoch": 6.312604340567613, | |
| "grad_norm": 2.4395639896392822, | |
| "learning_rate": 1.843723914858097e-05, | |
| "loss": 1.3733, | |
| "num_input_tokens_seen": 70062350, | |
| "step": 121000, | |
| "train_runtime": 228.9771, | |
| "train_tokens_per_second": 305979.777 | |
| }, | |
| { | |
| "epoch": 6.338689482470785, | |
| "grad_norm": 2.7110908031463623, | |
| "learning_rate": 1.830681343906511e-05, | |
| "loss": 1.3708, | |
| "num_input_tokens_seen": 70351870, | |
| "step": 121500, | |
| "train_runtime": 248.7026, | |
| "train_tokens_per_second": 282875.484 | |
| }, | |
| { | |
| "epoch": 6.364774624373957, | |
| "grad_norm": 2.789796829223633, | |
| "learning_rate": 1.817638772954925e-05, | |
| "loss": 1.3688, | |
| "num_input_tokens_seen": 70642750, | |
| "step": 122000, | |
| "train_runtime": 268.6462, | |
| "train_tokens_per_second": 262958.28 | |
| }, | |
| { | |
| "epoch": 6.390859766277129, | |
| "grad_norm": 2.9111709594726562, | |
| "learning_rate": 1.8045962020033392e-05, | |
| "loss": 1.3518, | |
| "num_input_tokens_seen": 70931190, | |
| "step": 122500, | |
| "train_runtime": 288.3677, | |
| "train_tokens_per_second": 245974.799 | |
| }, | |
| { | |
| "epoch": 6.416944908180301, | |
| "grad_norm": 2.4599456787109375, | |
| "learning_rate": 1.791553631051753e-05, | |
| "loss": 1.3431, | |
| "num_input_tokens_seen": 71224646, | |
| "step": 123000, | |
| "train_runtime": 307.3647, | |
| "train_tokens_per_second": 231726.811 | |
| }, | |
| { | |
| "epoch": 6.443030050083473, | |
| "grad_norm": 2.365891456604004, | |
| "learning_rate": 1.778511060100167e-05, | |
| "loss": 1.3865, | |
| "num_input_tokens_seen": 71511326, | |
| "step": 123500, | |
| "train_runtime": 326.1759, | |
| "train_tokens_per_second": 219241.597 | |
| }, | |
| { | |
| "epoch": 6.469115191986645, | |
| "grad_norm": 2.6345105171203613, | |
| "learning_rate": 1.765468489148581e-05, | |
| "loss": 1.3734, | |
| "num_input_tokens_seen": 71797622, | |
| "step": 124000, | |
| "train_runtime": 344.951, | |
| "train_tokens_per_second": 208138.626 | |
| }, | |
| { | |
| "epoch": 6.495200333889817, | |
| "grad_norm": 3.2426106929779053, | |
| "learning_rate": 1.752425918196995e-05, | |
| "loss": 1.3628, | |
| "num_input_tokens_seen": 72088862, | |
| "step": 124500, | |
| "train_runtime": 363.8685, | |
| "train_tokens_per_second": 198117.913 | |
| }, | |
| { | |
| "epoch": 6.521285475792988, | |
| "grad_norm": 2.608137845993042, | |
| "learning_rate": 1.739383347245409e-05, | |
| "loss": 1.3723, | |
| "num_input_tokens_seen": 72378534, | |
| "step": 125000, | |
| "train_runtime": 383.5577, | |
| "train_tokens_per_second": 188703.107 | |
| }, | |
| { | |
| "epoch": 6.54737061769616, | |
| "grad_norm": 4.101028919219971, | |
| "learning_rate": 1.726340776293823e-05, | |
| "loss": 1.3776, | |
| "num_input_tokens_seen": 72669942, | |
| "step": 125500, | |
| "train_runtime": 402.8471, | |
| "train_tokens_per_second": 180390.889 | |
| }, | |
| { | |
| "epoch": 6.573455759599332, | |
| "grad_norm": 2.356037139892578, | |
| "learning_rate": 1.7132982053422372e-05, | |
| "loss": 1.376, | |
| "num_input_tokens_seen": 72956998, | |
| "step": 126000, | |
| "train_runtime": 422.9625, | |
| "train_tokens_per_second": 172490.455 | |
| }, | |
| { | |
| "epoch": 6.599540901502504, | |
| "grad_norm": 2.768091917037964, | |
| "learning_rate": 1.7002556343906512e-05, | |
| "loss": 1.3849, | |
| "num_input_tokens_seen": 73246278, | |
| "step": 126500, | |
| "train_runtime": 442.5677, | |
| "train_tokens_per_second": 165503.005 | |
| }, | |
| { | |
| "epoch": 6.625626043405676, | |
| "grad_norm": 2.1557633876800537, | |
| "learning_rate": 1.687213063439065e-05, | |
| "loss": 1.3692, | |
| "num_input_tokens_seen": 73532518, | |
| "step": 127000, | |
| "train_runtime": 461.2902, | |
| "train_tokens_per_second": 159406.192 | |
| }, | |
| { | |
| "epoch": 6.651711185308848, | |
| "grad_norm": 2.739330768585205, | |
| "learning_rate": 1.6741704924874794e-05, | |
| "loss": 1.3853, | |
| "num_input_tokens_seen": 73816374, | |
| "step": 127500, | |
| "train_runtime": 480.2569, | |
| "train_tokens_per_second": 153701.835 | |
| }, | |
| { | |
| "epoch": 6.67779632721202, | |
| "grad_norm": 2.28963303565979, | |
| "learning_rate": 1.6611279215358934e-05, | |
| "loss": 1.3539, | |
| "num_input_tokens_seen": 74103334, | |
| "step": 128000, | |
| "train_runtime": 499.0026, | |
| "train_tokens_per_second": 148502.901 | |
| }, | |
| { | |
| "epoch": 6.703881469115192, | |
| "grad_norm": 3.2728097438812256, | |
| "learning_rate": 1.6480853505843073e-05, | |
| "loss": 1.3519, | |
| "num_input_tokens_seen": 74392214, | |
| "step": 128500, | |
| "train_runtime": 517.9355, | |
| "train_tokens_per_second": 143632.196 | |
| }, | |
| { | |
| "epoch": 6.729966611018364, | |
| "grad_norm": 3.280041217803955, | |
| "learning_rate": 1.6350427796327213e-05, | |
| "loss": 1.3064, | |
| "num_input_tokens_seen": 74677654, | |
| "step": 129000, | |
| "train_runtime": 536.8375, | |
| "train_tokens_per_second": 139106.624 | |
| }, | |
| { | |
| "epoch": 6.756051752921536, | |
| "grad_norm": 3.9127538204193115, | |
| "learning_rate": 1.6220002086811352e-05, | |
| "loss": 1.3779, | |
| "num_input_tokens_seen": 74968646, | |
| "step": 129500, | |
| "train_runtime": 555.72, | |
| "train_tokens_per_second": 134903.621 | |
| }, | |
| { | |
| "epoch": 6.782136894824708, | |
| "grad_norm": 2.7960000038146973, | |
| "learning_rate": 1.6089576377295492e-05, | |
| "loss": 1.3327, | |
| "num_input_tokens_seen": 75257286, | |
| "step": 130000, | |
| "train_runtime": 574.6797, | |
| "train_tokens_per_second": 130955.186 | |
| }, | |
| { | |
| "epoch": 6.80822203672788, | |
| "grad_norm": 2.997286796569824, | |
| "learning_rate": 1.5959150667779635e-05, | |
| "loss": 1.3684, | |
| "num_input_tokens_seen": 75546398, | |
| "step": 130500, | |
| "train_runtime": 593.4532, | |
| "train_tokens_per_second": 127299.662 | |
| }, | |
| { | |
| "epoch": 6.834307178631052, | |
| "grad_norm": 2.6267356872558594, | |
| "learning_rate": 1.5828724958263774e-05, | |
| "loss": 1.3416, | |
| "num_input_tokens_seen": 75840662, | |
| "step": 131000, | |
| "train_runtime": 612.3615, | |
| "train_tokens_per_second": 123849.503 | |
| }, | |
| { | |
| "epoch": 6.860392320534224, | |
| "grad_norm": 2.1126062870025635, | |
| "learning_rate": 1.5698299248747914e-05, | |
| "loss": 1.3606, | |
| "num_input_tokens_seen": 76125694, | |
| "step": 131500, | |
| "train_runtime": 631.2618, | |
| "train_tokens_per_second": 120592.897 | |
| }, | |
| { | |
| "epoch": 6.886477462437396, | |
| "grad_norm": 2.9131317138671875, | |
| "learning_rate": 1.5567873539232053e-05, | |
| "loss": 1.3813, | |
| "num_input_tokens_seen": 76417118, | |
| "step": 132000, | |
| "train_runtime": 650.1892, | |
| "train_tokens_per_second": 117530.578 | |
| }, | |
| { | |
| "epoch": 6.912562604340567, | |
| "grad_norm": 3.5298712253570557, | |
| "learning_rate": 1.5437447829716196e-05, | |
| "loss": 1.3617, | |
| "num_input_tokens_seen": 76703430, | |
| "step": 132500, | |
| "train_runtime": 669.1223, | |
| "train_tokens_per_second": 114632.907 | |
| }, | |
| { | |
| "epoch": 6.938647746243739, | |
| "grad_norm": 2.850775718688965, | |
| "learning_rate": 1.5307022120200336e-05, | |
| "loss": 1.3672, | |
| "num_input_tokens_seen": 76992342, | |
| "step": 133000, | |
| "train_runtime": 687.9389, | |
| "train_tokens_per_second": 111917.419 | |
| }, | |
| { | |
| "epoch": 6.964732888146911, | |
| "grad_norm": 3.314821481704712, | |
| "learning_rate": 1.5176596410684474e-05, | |
| "loss": 1.3715, | |
| "num_input_tokens_seen": 77284374, | |
| "step": 133500, | |
| "train_runtime": 706.8708, | |
| "train_tokens_per_second": 109333.091 | |
| }, | |
| { | |
| "epoch": 6.990818030050083, | |
| "grad_norm": 3.3693618774414062, | |
| "learning_rate": 1.5046170701168617e-05, | |
| "loss": 1.3858, | |
| "num_input_tokens_seen": 77571966, | |
| "step": 134000, | |
| "train_runtime": 725.7267, | |
| "train_tokens_per_second": 106888.674 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 1.3148815631866455, | |
| "eval_runtime": 45.8848, | |
| "eval_samples_per_second": 835.462, | |
| "eval_steps_per_second": 104.435, | |
| "num_input_tokens_seen": 77673096, | |
| "step": 134176 | |
| }, | |
| { | |
| "epoch": 7.016903171953255, | |
| "grad_norm": 2.7694716453552246, | |
| "learning_rate": 1.4915744991652755e-05, | |
| "loss": 1.3419, | |
| "num_input_tokens_seen": 77861608, | |
| "step": 134500, | |
| "train_runtime": 791.8621, | |
| "train_tokens_per_second": 98327.231 | |
| }, | |
| { | |
| "epoch": 7.042988313856427, | |
| "grad_norm": 2.7334187030792236, | |
| "learning_rate": 1.4785319282136894e-05, | |
| "loss": 1.3308, | |
| "num_input_tokens_seen": 78149784, | |
| "step": 135000, | |
| "train_runtime": 810.7343, | |
| "train_tokens_per_second": 96393.825 | |
| }, | |
| { | |
| "epoch": 7.069073455759599, | |
| "grad_norm": 2.9365265369415283, | |
| "learning_rate": 1.4654893572621037e-05, | |
| "loss": 1.3525, | |
| "num_input_tokens_seen": 78438792, | |
| "step": 135500, | |
| "train_runtime": 829.7324, | |
| "train_tokens_per_second": 94535.049 | |
| }, | |
| { | |
| "epoch": 7.095158597662771, | |
| "grad_norm": 4.147580146789551, | |
| "learning_rate": 1.4524467863105177e-05, | |
| "loss": 1.3465, | |
| "num_input_tokens_seen": 78732384, | |
| "step": 136000, | |
| "train_runtime": 848.5615, | |
| "train_tokens_per_second": 92783.357 | |
| }, | |
| { | |
| "epoch": 7.121243739565943, | |
| "grad_norm": 2.915922164916992, | |
| "learning_rate": 1.4394042153589316e-05, | |
| "loss": 1.3614, | |
| "num_input_tokens_seen": 79016208, | |
| "step": 136500, | |
| "train_runtime": 867.5653, | |
| "train_tokens_per_second": 91078.111 | |
| }, | |
| { | |
| "epoch": 7.147328881469115, | |
| "grad_norm": 2.549786329269409, | |
| "learning_rate": 1.4263616444073457e-05, | |
| "loss": 1.318, | |
| "num_input_tokens_seen": 79301784, | |
| "step": 137000, | |
| "train_runtime": 886.659, | |
| "train_tokens_per_second": 89438.871 | |
| }, | |
| { | |
| "epoch": 7.173414023372287, | |
| "grad_norm": 2.5047004222869873, | |
| "learning_rate": 1.4133190734557597e-05, | |
| "loss": 1.368, | |
| "num_input_tokens_seen": 79590400, | |
| "step": 137500, | |
| "train_runtime": 905.5133, | |
| "train_tokens_per_second": 87895.338 | |
| }, | |
| { | |
| "epoch": 7.199499165275459, | |
| "grad_norm": 3.0781052112579346, | |
| "learning_rate": 1.4002765025041736e-05, | |
| "loss": 1.3653, | |
| "num_input_tokens_seen": 79879504, | |
| "step": 138000, | |
| "train_runtime": 924.4454, | |
| "train_tokens_per_second": 86408.029 | |
| }, | |
| { | |
| "epoch": 7.225584307178631, | |
| "grad_norm": 3.6476972103118896, | |
| "learning_rate": 1.387233931552588e-05, | |
| "loss": 1.3514, | |
| "num_input_tokens_seen": 80167640, | |
| "step": 138500, | |
| "train_runtime": 943.229, | |
| "train_tokens_per_second": 84992.766 | |
| }, | |
| { | |
| "epoch": 7.2516694490818026, | |
| "grad_norm": 5.114116191864014, | |
| "learning_rate": 1.3741913606010017e-05, | |
| "loss": 1.3413, | |
| "num_input_tokens_seen": 80456216, | |
| "step": 139000, | |
| "train_runtime": 962.1141, | |
| "train_tokens_per_second": 83624.399 | |
| }, | |
| { | |
| "epoch": 7.277754590984975, | |
| "grad_norm": 2.5727877616882324, | |
| "learning_rate": 1.3611487896494157e-05, | |
| "loss": 1.3414, | |
| "num_input_tokens_seen": 80747832, | |
| "step": 139500, | |
| "train_runtime": 980.9921, | |
| "train_tokens_per_second": 82312.418 | |
| }, | |
| { | |
| "epoch": 7.303839732888147, | |
| "grad_norm": 2.9491872787475586, | |
| "learning_rate": 1.3481062186978296e-05, | |
| "loss": 1.3412, | |
| "num_input_tokens_seen": 81043216, | |
| "step": 140000, | |
| "train_runtime": 999.9989, | |
| "train_tokens_per_second": 81043.309 | |
| }, | |
| { | |
| "epoch": 7.329924874791319, | |
| "grad_norm": 2.045164108276367, | |
| "learning_rate": 1.3350636477462439e-05, | |
| "loss": 1.3729, | |
| "num_input_tokens_seen": 81333232, | |
| "step": 140500, | |
| "train_runtime": 1018.8484, | |
| "train_tokens_per_second": 79828.588 | |
| }, | |
| { | |
| "epoch": 7.356010016694491, | |
| "grad_norm": 3.922563314437866, | |
| "learning_rate": 1.3220210767946579e-05, | |
| "loss": 1.3443, | |
| "num_input_tokens_seen": 81622416, | |
| "step": 141000, | |
| "train_runtime": 1037.8422, | |
| "train_tokens_per_second": 78646.268 | |
| }, | |
| { | |
| "epoch": 7.382095158597663, | |
| "grad_norm": 2.426223039627075, | |
| "learning_rate": 1.3089785058430718e-05, | |
| "loss": 1.3544, | |
| "num_input_tokens_seen": 81911608, | |
| "step": 141500, | |
| "train_runtime": 1056.8045, | |
| "train_tokens_per_second": 77508.763 | |
| }, | |
| { | |
| "epoch": 7.408180300500835, | |
| "grad_norm": 2.67075514793396, | |
| "learning_rate": 1.295935934891486e-05, | |
| "loss": 1.3246, | |
| "num_input_tokens_seen": 82202544, | |
| "step": 142000, | |
| "train_runtime": 1075.733, | |
| "train_tokens_per_second": 76415.38 | |
| }, | |
| { | |
| "epoch": 7.434265442404007, | |
| "grad_norm": 2.5923829078674316, | |
| "learning_rate": 1.2828933639398999e-05, | |
| "loss": 1.3388, | |
| "num_input_tokens_seen": 82493944, | |
| "step": 142500, | |
| "train_runtime": 1094.8433, | |
| "train_tokens_per_second": 75347.716 | |
| }, | |
| { | |
| "epoch": 7.460350584307179, | |
| "grad_norm": 2.602835178375244, | |
| "learning_rate": 1.2698507929883138e-05, | |
| "loss": 1.3423, | |
| "num_input_tokens_seen": 82784656, | |
| "step": 143000, | |
| "train_runtime": 1113.7237, | |
| "train_tokens_per_second": 74331.413 | |
| }, | |
| { | |
| "epoch": 7.486435726210351, | |
| "grad_norm": 3.1531965732574463, | |
| "learning_rate": 1.256808222036728e-05, | |
| "loss": 1.3452, | |
| "num_input_tokens_seen": 83068624, | |
| "step": 143500, | |
| "train_runtime": 1132.6794, | |
| "train_tokens_per_second": 73338.162 | |
| }, | |
| { | |
| "epoch": 7.512520868113523, | |
| "grad_norm": 2.2403712272644043, | |
| "learning_rate": 1.243765651085142e-05, | |
| "loss": 1.3618, | |
| "num_input_tokens_seen": 83351920, | |
| "step": 144000, | |
| "train_runtime": 1151.592, | |
| "train_tokens_per_second": 72379.733 | |
| }, | |
| { | |
| "epoch": 7.538606010016695, | |
| "grad_norm": 3.465223550796509, | |
| "learning_rate": 1.2307230801335559e-05, | |
| "loss": 1.3632, | |
| "num_input_tokens_seen": 83638888, | |
| "step": 144500, | |
| "train_runtime": 1170.6851, | |
| "train_tokens_per_second": 71444.392 | |
| }, | |
| { | |
| "epoch": 7.564691151919867, | |
| "grad_norm": 2.3392977714538574, | |
| "learning_rate": 1.21768050918197e-05, | |
| "loss": 1.3318, | |
| "num_input_tokens_seen": 83931992, | |
| "step": 145000, | |
| "train_runtime": 1189.5906, | |
| "train_tokens_per_second": 70555.362 | |
| }, | |
| { | |
| "epoch": 7.590776293823039, | |
| "grad_norm": 3.0218007564544678, | |
| "learning_rate": 1.2046379382303841e-05, | |
| "loss": 1.3636, | |
| "num_input_tokens_seen": 84220168, | |
| "step": 145500, | |
| "train_runtime": 1208.5932, | |
| "train_tokens_per_second": 69684.461 | |
| }, | |
| { | |
| "epoch": 7.616861435726211, | |
| "grad_norm": 3.329549789428711, | |
| "learning_rate": 1.191595367278798e-05, | |
| "loss": 1.355, | |
| "num_input_tokens_seen": 84509512, | |
| "step": 146000, | |
| "train_runtime": 1227.4823, | |
| "train_tokens_per_second": 68847.845 | |
| }, | |
| { | |
| "epoch": 7.642946577629383, | |
| "grad_norm": 6.515806198120117, | |
| "learning_rate": 1.178552796327212e-05, | |
| "loss": 1.3414, | |
| "num_input_tokens_seen": 84808104, | |
| "step": 146500, | |
| "train_runtime": 1246.4918, | |
| "train_tokens_per_second": 68037.434 | |
| }, | |
| { | |
| "epoch": 7.669031719532554, | |
| "grad_norm": 3.5463063716888428, | |
| "learning_rate": 1.1655102253756262e-05, | |
| "loss": 1.3617, | |
| "num_input_tokens_seen": 85099704, | |
| "step": 147000, | |
| "train_runtime": 1265.5528, | |
| "train_tokens_per_second": 67243.109 | |
| }, | |
| { | |
| "epoch": 7.695116861435726, | |
| "grad_norm": 2.877112627029419, | |
| "learning_rate": 1.1524676544240401e-05, | |
| "loss": 1.3524, | |
| "num_input_tokens_seen": 85387272, | |
| "step": 147500, | |
| "train_runtime": 1284.3298, | |
| "train_tokens_per_second": 66483.913 | |
| }, | |
| { | |
| "epoch": 7.721202003338898, | |
| "grad_norm": 2.8873534202575684, | |
| "learning_rate": 1.1394250834724542e-05, | |
| "loss": 1.3442, | |
| "num_input_tokens_seen": 85671272, | |
| "step": 148000, | |
| "train_runtime": 1303.2108, | |
| "train_tokens_per_second": 65738.615 | |
| }, | |
| { | |
| "epoch": 7.74728714524207, | |
| "grad_norm": 3.5610382556915283, | |
| "learning_rate": 1.126382512520868e-05, | |
| "loss": 1.3505, | |
| "num_input_tokens_seen": 85959168, | |
| "step": 148500, | |
| "train_runtime": 1322.1598, | |
| "train_tokens_per_second": 65014.207 | |
| }, | |
| { | |
| "epoch": 7.773372287145242, | |
| "grad_norm": 2.6103343963623047, | |
| "learning_rate": 1.1133399415692821e-05, | |
| "loss": 1.3616, | |
| "num_input_tokens_seen": 86255128, | |
| "step": 149000, | |
| "train_runtime": 1341.3928, | |
| "train_tokens_per_second": 64302.661 | |
| }, | |
| { | |
| "epoch": 7.799457429048414, | |
| "grad_norm": 2.5157065391540527, | |
| "learning_rate": 1.1002973706176963e-05, | |
| "loss": 1.3422, | |
| "num_input_tokens_seen": 86546848, | |
| "step": 149500, | |
| "train_runtime": 1360.2395, | |
| "train_tokens_per_second": 63626.184 | |
| }, | |
| { | |
| "epoch": 7.825542570951586, | |
| "grad_norm": 2.315091371536255, | |
| "learning_rate": 1.0872547996661102e-05, | |
| "loss": 1.3511, | |
| "num_input_tokens_seen": 86837440, | |
| "step": 150000, | |
| "train_runtime": 1379.3034, | |
| "train_tokens_per_second": 62957.46 | |
| }, | |
| { | |
| "epoch": 7.851627712854758, | |
| "grad_norm": 2.2483925819396973, | |
| "learning_rate": 1.0742122287145243e-05, | |
| "loss": 1.3355, | |
| "num_input_tokens_seen": 87120032, | |
| "step": 150500, | |
| "train_runtime": 1398.2422, | |
| "train_tokens_per_second": 62306.824 | |
| }, | |
| { | |
| "epoch": 7.87771285475793, | |
| "grad_norm": 2.340362071990967, | |
| "learning_rate": 1.0611696577629383e-05, | |
| "loss": 1.3537, | |
| "num_input_tokens_seen": 87415824, | |
| "step": 151000, | |
| "train_runtime": 1417.1731, | |
| "train_tokens_per_second": 61683.236 | |
| }, | |
| { | |
| "epoch": 7.903797996661102, | |
| "grad_norm": 2.813960552215576, | |
| "learning_rate": 1.0481270868113522e-05, | |
| "loss": 1.3479, | |
| "num_input_tokens_seen": 87701680, | |
| "step": 151500, | |
| "train_runtime": 1436.1799, | |
| "train_tokens_per_second": 61065.945 | |
| }, | |
| { | |
| "epoch": 7.929883138564274, | |
| "grad_norm": 2.2960751056671143, | |
| "learning_rate": 1.0350845158597664e-05, | |
| "loss": 1.3475, | |
| "num_input_tokens_seen": 87992448, | |
| "step": 152000, | |
| "train_runtime": 1455.0801, | |
| "train_tokens_per_second": 60472.578 | |
| }, | |
| { | |
| "epoch": 7.955968280467446, | |
| "grad_norm": 3.048780918121338, | |
| "learning_rate": 1.0220419449081803e-05, | |
| "loss": 1.3619, | |
| "num_input_tokens_seen": 88281416, | |
| "step": 152500, | |
| "train_runtime": 1474.0153, | |
| "train_tokens_per_second": 59891.791 | |
| }, | |
| { | |
| "epoch": 7.982053422370617, | |
| "grad_norm": 2.816805362701416, | |
| "learning_rate": 1.0089993739565943e-05, | |
| "loss": 1.357, | |
| "num_input_tokens_seen": 88572368, | |
| "step": 153000, | |
| "train_runtime": 1492.8602, | |
| "train_tokens_per_second": 59330.65 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 1.310753345489502, | |
| "eval_runtime": 45.8622, | |
| "eval_samples_per_second": 835.874, | |
| "eval_steps_per_second": 104.487, | |
| "num_input_tokens_seen": 88772850, | |
| "step": 153344 | |
| }, | |
| { | |
| "epoch": 8.00813856427379, | |
| "grad_norm": 2.93835186958313, | |
| "learning_rate": 9.959568030050084e-06, | |
| "loss": 1.3378, | |
| "num_input_tokens_seen": 88861818, | |
| "step": 153500, | |
| "train_runtime": 1558.6187, | |
| "train_tokens_per_second": 57013.187 | |
| }, | |
| { | |
| "epoch": 8.034223706176961, | |
| "grad_norm": 3.2679965496063232, | |
| "learning_rate": 9.829142320534224e-06, | |
| "loss": 1.3403, | |
| "num_input_tokens_seen": 89148626, | |
| "step": 154000, | |
| "train_runtime": 1577.6034, | |
| "train_tokens_per_second": 56508.897 | |
| }, | |
| { | |
| "epoch": 8.060308848080133, | |
| "grad_norm": 1.7137473821640015, | |
| "learning_rate": 9.698716611018365e-06, | |
| "loss": 1.3357, | |
| "num_input_tokens_seen": 89432242, | |
| "step": 154500, | |
| "train_runtime": 1596.5679, | |
| "train_tokens_per_second": 56015.306 | |
| }, | |
| { | |
| "epoch": 8.086393989983305, | |
| "grad_norm": 2.5696284770965576, | |
| "learning_rate": 9.568290901502506e-06, | |
| "loss": 1.3465, | |
| "num_input_tokens_seen": 89721890, | |
| "step": 155000, | |
| "train_runtime": 1615.4358, | |
| "train_tokens_per_second": 55540.364 | |
| }, | |
| { | |
| "epoch": 8.112479131886477, | |
| "grad_norm": 3.715364694595337, | |
| "learning_rate": 9.437865191986644e-06, | |
| "loss": 1.3407, | |
| "num_input_tokens_seen": 90009618, | |
| "step": 155500, | |
| "train_runtime": 1634.2407, | |
| "train_tokens_per_second": 55077.332 | |
| }, | |
| { | |
| "epoch": 8.13856427378965, | |
| "grad_norm": 2.7199196815490723, | |
| "learning_rate": 9.307439482470785e-06, | |
| "loss": 1.3444, | |
| "num_input_tokens_seen": 90299538, | |
| "step": 156000, | |
| "train_runtime": 1653.1786, | |
| "train_tokens_per_second": 54621.767 | |
| }, | |
| { | |
| "epoch": 8.164649415692821, | |
| "grad_norm": 2.546076774597168, | |
| "learning_rate": 9.177013772954925e-06, | |
| "loss": 1.3201, | |
| "num_input_tokens_seen": 90585634, | |
| "step": 156500, | |
| "train_runtime": 1671.9571, | |
| "train_tokens_per_second": 54179.401 | |
| }, | |
| { | |
| "epoch": 8.190734557595993, | |
| "grad_norm": 2.7355287075042725, | |
| "learning_rate": 9.046588063439066e-06, | |
| "loss": 1.3286, | |
| "num_input_tokens_seen": 90875986, | |
| "step": 157000, | |
| "train_runtime": 1690.8444, | |
| "train_tokens_per_second": 53745.919 | |
| }, | |
| { | |
| "epoch": 8.216819699499165, | |
| "grad_norm": 2.610476016998291, | |
| "learning_rate": 8.916162353923205e-06, | |
| "loss": 1.3624, | |
| "num_input_tokens_seen": 91165682, | |
| "step": 157500, | |
| "train_runtime": 1709.7962, | |
| "train_tokens_per_second": 53319.619 | |
| }, | |
| { | |
| "epoch": 8.242904841402337, | |
| "grad_norm": 3.424274444580078, | |
| "learning_rate": 8.785736644407345e-06, | |
| "loss": 1.3615, | |
| "num_input_tokens_seen": 91458162, | |
| "step": 158000, | |
| "train_runtime": 1728.5683, | |
| "train_tokens_per_second": 52909.776 | |
| }, | |
| { | |
| "epoch": 8.26898998330551, | |
| "grad_norm": 2.9222910404205322, | |
| "learning_rate": 8.655310934891486e-06, | |
| "loss": 1.3359, | |
| "num_input_tokens_seen": 91748050, | |
| "step": 158500, | |
| "train_runtime": 1747.5127, | |
| "train_tokens_per_second": 52502.078 | |
| }, | |
| { | |
| "epoch": 8.295075125208681, | |
| "grad_norm": 3.5217490196228027, | |
| "learning_rate": 8.524885225375627e-06, | |
| "loss": 1.3414, | |
| "num_input_tokens_seen": 92035050, | |
| "step": 159000, | |
| "train_runtime": 1766.4856, | |
| "train_tokens_per_second": 52100.651 | |
| }, | |
| { | |
| "epoch": 8.321160267111853, | |
| "grad_norm": 2.656613826751709, | |
| "learning_rate": 8.394459515859767e-06, | |
| "loss": 1.3436, | |
| "num_input_tokens_seen": 92326378, | |
| "step": 159500, | |
| "train_runtime": 1785.3077, | |
| "train_tokens_per_second": 51714.547 | |
| }, | |
| { | |
| "epoch": 8.347245409015025, | |
| "grad_norm": 2.8764595985412598, | |
| "learning_rate": 8.264033806343906e-06, | |
| "loss": 1.316, | |
| "num_input_tokens_seen": 92617586, | |
| "step": 160000, | |
| "train_runtime": 1804.1264, | |
| "train_tokens_per_second": 51336.529 | |
| }, | |
| { | |
| "epoch": 8.373330550918197, | |
| "grad_norm": 2.635450839996338, | |
| "learning_rate": 8.133608096828046e-06, | |
| "loss": 1.3745, | |
| "num_input_tokens_seen": 92904010, | |
| "step": 160500, | |
| "train_runtime": 1823.1613, | |
| "train_tokens_per_second": 50957.647 | |
| }, | |
| { | |
| "epoch": 8.39941569282137, | |
| "grad_norm": 3.4129796028137207, | |
| "learning_rate": 8.003182387312187e-06, | |
| "loss": 1.3278, | |
| "num_input_tokens_seen": 93189170, | |
| "step": 161000, | |
| "train_runtime": 1842.0126, | |
| "train_tokens_per_second": 50590.953 | |
| }, | |
| { | |
| "epoch": 8.425500834724541, | |
| "grad_norm": 3.2952401638031006, | |
| "learning_rate": 7.872756677796328e-06, | |
| "loss": 1.337, | |
| "num_input_tokens_seen": 93475210, | |
| "step": 161500, | |
| "train_runtime": 1861.0469, | |
| "train_tokens_per_second": 50227.218 | |
| }, | |
| { | |
| "epoch": 8.451585976627713, | |
| "grad_norm": 2.8078572750091553, | |
| "learning_rate": 7.742330968280468e-06, | |
| "loss": 1.3511, | |
| "num_input_tokens_seen": 93764458, | |
| "step": 162000, | |
| "train_runtime": 1880.0164, | |
| "train_tokens_per_second": 49874.278 | |
| }, | |
| { | |
| "epoch": 8.477671118530886, | |
| "grad_norm": 3.6334028244018555, | |
| "learning_rate": 7.611905258764608e-06, | |
| "loss": 1.3214, | |
| "num_input_tokens_seen": 94054690, | |
| "step": 162500, | |
| "train_runtime": 1898.9183, | |
| "train_tokens_per_second": 49530.666 | |
| }, | |
| { | |
| "epoch": 8.503756260434058, | |
| "grad_norm": 2.255051851272583, | |
| "learning_rate": 7.481479549248749e-06, | |
| "loss": 1.3181, | |
| "num_input_tokens_seen": 94342986, | |
| "step": 163000, | |
| "train_runtime": 1917.8935, | |
| "train_tokens_per_second": 49190.941 | |
| }, | |
| { | |
| "epoch": 8.52984140233723, | |
| "grad_norm": 2.2999086380004883, | |
| "learning_rate": 7.351053839732888e-06, | |
| "loss": 1.3468, | |
| "num_input_tokens_seen": 94628458, | |
| "step": 163500, | |
| "train_runtime": 1936.8254, | |
| "train_tokens_per_second": 48857.505 | |
| }, | |
| { | |
| "epoch": 8.5559265442404, | |
| "grad_norm": 2.8126626014709473, | |
| "learning_rate": 7.220628130217029e-06, | |
| "loss": 1.3442, | |
| "num_input_tokens_seen": 94916450, | |
| "step": 164000, | |
| "train_runtime": 1955.7716, | |
| "train_tokens_per_second": 48531.459 | |
| }, | |
| { | |
| "epoch": 8.582011686143572, | |
| "grad_norm": 3.6833460330963135, | |
| "learning_rate": 7.090202420701168e-06, | |
| "loss": 1.3097, | |
| "num_input_tokens_seen": 95209610, | |
| "step": 164500, | |
| "train_runtime": 1974.6708, | |
| "train_tokens_per_second": 48215.434 | |
| }, | |
| { | |
| "epoch": 8.608096828046744, | |
| "grad_norm": 2.2948975563049316, | |
| "learning_rate": 6.959776711185309e-06, | |
| "loss": 1.3158, | |
| "num_input_tokens_seen": 95500162, | |
| "step": 165000, | |
| "train_runtime": 1993.7313, | |
| "train_tokens_per_second": 47900.216 | |
| }, | |
| { | |
| "epoch": 8.634181969949916, | |
| "grad_norm": 2.677102565765381, | |
| "learning_rate": 6.82935100166945e-06, | |
| "loss": 1.3492, | |
| "num_input_tokens_seen": 95791218, | |
| "step": 165500, | |
| "train_runtime": 2012.6562, | |
| "train_tokens_per_second": 47594.428 | |
| }, | |
| { | |
| "epoch": 8.660267111853088, | |
| "grad_norm": 2.8302109241485596, | |
| "learning_rate": 6.698925292153589e-06, | |
| "loss": 1.3176, | |
| "num_input_tokens_seen": 96078250, | |
| "step": 166000, | |
| "train_runtime": 2031.6856, | |
| "train_tokens_per_second": 47289.919 | |
| }, | |
| { | |
| "epoch": 8.68635225375626, | |
| "grad_norm": 2.7552695274353027, | |
| "learning_rate": 6.56849958263773e-06, | |
| "loss": 1.3259, | |
| "num_input_tokens_seen": 96363322, | |
| "step": 166500, | |
| "train_runtime": 2050.6873, | |
| "train_tokens_per_second": 46990.744 | |
| }, | |
| { | |
| "epoch": 8.712437395659432, | |
| "grad_norm": 2.76167368888855, | |
| "learning_rate": 6.438073873121871e-06, | |
| "loss": 1.341, | |
| "num_input_tokens_seen": 96655826, | |
| "step": 167000, | |
| "train_runtime": 2069.5519, | |
| "train_tokens_per_second": 46703.746 | |
| }, | |
| { | |
| "epoch": 8.738522537562604, | |
| "grad_norm": 2.799135208129883, | |
| "learning_rate": 6.3076481636060104e-06, | |
| "loss": 1.3516, | |
| "num_input_tokens_seen": 96941474, | |
| "step": 167500, | |
| "train_runtime": 2088.6051, | |
| "train_tokens_per_second": 46414.458 | |
| }, | |
| { | |
| "epoch": 8.764607679465776, | |
| "grad_norm": 2.185119390487671, | |
| "learning_rate": 6.177222454090151e-06, | |
| "loss": 1.3495, | |
| "num_input_tokens_seen": 97236010, | |
| "step": 168000, | |
| "train_runtime": 2107.5825, | |
| "train_tokens_per_second": 46136.277 | |
| }, | |
| { | |
| "epoch": 8.790692821368948, | |
| "grad_norm": 2.787100315093994, | |
| "learning_rate": 6.046796744574291e-06, | |
| "loss": 1.3059, | |
| "num_input_tokens_seen": 97526826, | |
| "step": 168500, | |
| "train_runtime": 2126.4823, | |
| "train_tokens_per_second": 45862.984 | |
| }, | |
| { | |
| "epoch": 8.81677796327212, | |
| "grad_norm": 2.6303234100341797, | |
| "learning_rate": 5.916371035058431e-06, | |
| "loss": 1.3463, | |
| "num_input_tokens_seen": 97816378, | |
| "step": 169000, | |
| "train_runtime": 2145.3741, | |
| "train_tokens_per_second": 45594.088 | |
| }, | |
| { | |
| "epoch": 8.842863105175292, | |
| "grad_norm": 2.5196168422698975, | |
| "learning_rate": 5.785945325542571e-06, | |
| "loss": 1.3462, | |
| "num_input_tokens_seen": 98111226, | |
| "step": 169500, | |
| "train_runtime": 2164.4052, | |
| "train_tokens_per_second": 45329.417 | |
| }, | |
| { | |
| "epoch": 8.868948247078464, | |
| "grad_norm": 3.008777141571045, | |
| "learning_rate": 5.6555196160267115e-06, | |
| "loss": 1.3463, | |
| "num_input_tokens_seen": 98404994, | |
| "step": 170000, | |
| "train_runtime": 2183.2406, | |
| "train_tokens_per_second": 45072.904 | |
| }, | |
| { | |
| "epoch": 8.895033388981636, | |
| "grad_norm": 2.664883613586426, | |
| "learning_rate": 5.525093906510852e-06, | |
| "loss": 1.3505, | |
| "num_input_tokens_seen": 98691458, | |
| "step": 170500, | |
| "train_runtime": 2202.2373, | |
| "train_tokens_per_second": 44814.179 | |
| }, | |
| { | |
| "epoch": 8.921118530884808, | |
| "grad_norm": 3.8976974487304688, | |
| "learning_rate": 5.3946681969949914e-06, | |
| "loss": 1.3325, | |
| "num_input_tokens_seen": 98980730, | |
| "step": 171000, | |
| "train_runtime": 2221.2328, | |
| "train_tokens_per_second": 44561.169 | |
| }, | |
| { | |
| "epoch": 8.94720367278798, | |
| "grad_norm": 2.5917086601257324, | |
| "learning_rate": 5.264242487479132e-06, | |
| "loss": 1.333, | |
| "num_input_tokens_seen": 99265698, | |
| "step": 171500, | |
| "train_runtime": 2240.1093, | |
| "train_tokens_per_second": 44312.882 | |
| }, | |
| { | |
| "epoch": 8.973288814691152, | |
| "grad_norm": 3.012345314025879, | |
| "learning_rate": 5.133816777963272e-06, | |
| "loss": 1.3493, | |
| "num_input_tokens_seen": 99562818, | |
| "step": 172000, | |
| "train_runtime": 2259.2484, | |
| "train_tokens_per_second": 44069.001 | |
| }, | |
| { | |
| "epoch": 8.999373956594324, | |
| "grad_norm": 1.994488000869751, | |
| "learning_rate": 5.0033910684474126e-06, | |
| "loss": 1.3704, | |
| "num_input_tokens_seen": 99855026, | |
| "step": 172500, | |
| "train_runtime": 2278.2393, | |
| "train_tokens_per_second": 43829.912 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 1.3092994689941406, | |
| "eval_runtime": 45.6876, | |
| "eval_samples_per_second": 839.069, | |
| "eval_steps_per_second": 104.886, | |
| "num_input_tokens_seen": 99861888, | |
| "step": 172512 | |
| }, | |
| { | |
| "epoch": 9.025459098497496, | |
| "grad_norm": 3.0312609672546387, | |
| "learning_rate": 4.872965358931553e-06, | |
| "loss": 1.3012, | |
| "num_input_tokens_seen": 100137840, | |
| "step": 173000, | |
| "train_runtime": 2343.9004, | |
| "train_tokens_per_second": 42722.738 | |
| }, | |
| { | |
| "epoch": 9.051544240400668, | |
| "grad_norm": 2.9846737384796143, | |
| "learning_rate": 4.7425396494156925e-06, | |
| "loss": 1.3416, | |
| "num_input_tokens_seen": 100428752, | |
| "step": 173500, | |
| "train_runtime": 2362.9108, | |
| "train_tokens_per_second": 42502.134 | |
| }, | |
| { | |
| "epoch": 9.07762938230384, | |
| "grad_norm": 2.700178623199463, | |
| "learning_rate": 4.612113939899834e-06, | |
| "loss": 1.3509, | |
| "num_input_tokens_seen": 100714360, | |
| "step": 174000, | |
| "train_runtime": 2381.7919, | |
| "train_tokens_per_second": 42285.122 | |
| }, | |
| { | |
| "epoch": 9.103714524207012, | |
| "grad_norm": 2.5982463359832764, | |
| "learning_rate": 4.481688230383973e-06, | |
| "loss": 1.33, | |
| "num_input_tokens_seen": 101010096, | |
| "step": 174500, | |
| "train_runtime": 2400.7332, | |
| "train_tokens_per_second": 42074.685 | |
| }, | |
| { | |
| "epoch": 9.129799666110184, | |
| "grad_norm": 3.2345430850982666, | |
| "learning_rate": 4.351262520868114e-06, | |
| "loss": 1.3127, | |
| "num_input_tokens_seen": 101301448, | |
| "step": 175000, | |
| "train_runtime": 2419.7, | |
| "train_tokens_per_second": 41865.292 | |
| }, | |
| { | |
| "epoch": 9.155884808013356, | |
| "grad_norm": 2.8651511669158936, | |
| "learning_rate": 4.220836811352254e-06, | |
| "loss": 1.3198, | |
| "num_input_tokens_seen": 101583952, | |
| "step": 175500, | |
| "train_runtime": 2438.5641, | |
| "train_tokens_per_second": 41657.282 | |
| }, | |
| { | |
| "epoch": 9.181969949916528, | |
| "grad_norm": 2.723923921585083, | |
| "learning_rate": 4.090411101836394e-06, | |
| "loss": 1.3486, | |
| "num_input_tokens_seen": 101879904, | |
| "step": 176000, | |
| "train_runtime": 2457.6414, | |
| "train_tokens_per_second": 41454.341 | |
| }, | |
| { | |
| "epoch": 9.2080550918197, | |
| "grad_norm": 2.9765188694000244, | |
| "learning_rate": 3.959985392320535e-06, | |
| "loss": 1.3247, | |
| "num_input_tokens_seen": 102169192, | |
| "step": 176500, | |
| "train_runtime": 2476.4667, | |
| "train_tokens_per_second": 41256.033 | |
| }, | |
| { | |
| "epoch": 9.234140233722872, | |
| "grad_norm": 2.14411997795105, | |
| "learning_rate": 3.829559682804674e-06, | |
| "loss": 1.3542, | |
| "num_input_tokens_seen": 102454992, | |
| "step": 177000, | |
| "train_runtime": 2495.3856, | |
| "train_tokens_per_second": 41057.779 | |
| }, | |
| { | |
| "epoch": 9.260225375626044, | |
| "grad_norm": 2.7752788066864014, | |
| "learning_rate": 3.6991339732888147e-06, | |
| "loss": 1.3469, | |
| "num_input_tokens_seen": 102739160, | |
| "step": 177500, | |
| "train_runtime": 2514.175, | |
| "train_tokens_per_second": 40863.966 | |
| }, | |
| { | |
| "epoch": 9.286310517529216, | |
| "grad_norm": 2.3828213214874268, | |
| "learning_rate": 3.5687082637729555e-06, | |
| "loss": 1.3267, | |
| "num_input_tokens_seen": 103027896, | |
| "step": 178000, | |
| "train_runtime": 2533.1537, | |
| "train_tokens_per_second": 40671.791 | |
| }, | |
| { | |
| "epoch": 9.312395659432386, | |
| "grad_norm": 2.554948329925537, | |
| "learning_rate": 3.4382825542570955e-06, | |
| "loss": 1.3218, | |
| "num_input_tokens_seen": 103314672, | |
| "step": 178500, | |
| "train_runtime": 2552.0027, | |
| "train_tokens_per_second": 40483.762 | |
| }, | |
| { | |
| "epoch": 9.338480801335558, | |
| "grad_norm": 2.6806468963623047, | |
| "learning_rate": 3.3078568447412354e-06, | |
| "loss": 1.3384, | |
| "num_input_tokens_seen": 103602648, | |
| "step": 179000, | |
| "train_runtime": 2571.0574, | |
| "train_tokens_per_second": 40295.735 | |
| }, | |
| { | |
| "epoch": 9.36456594323873, | |
| "grad_norm": 3.95470929145813, | |
| "learning_rate": 3.1774311352253754e-06, | |
| "loss": 1.3187, | |
| "num_input_tokens_seen": 103892480, | |
| "step": 179500, | |
| "train_runtime": 2590.0953, | |
| "train_tokens_per_second": 40111.45 | |
| }, | |
| { | |
| "epoch": 9.390651085141902, | |
| "grad_norm": 2.708707332611084, | |
| "learning_rate": 3.0470054257095158e-06, | |
| "loss": 1.335, | |
| "num_input_tokens_seen": 104178104, | |
| "step": 180000, | |
| "train_runtime": 2608.9847, | |
| "train_tokens_per_second": 39930.515 | |
| }, | |
| { | |
| "epoch": 9.416736227045075, | |
| "grad_norm": 3.4441354274749756, | |
| "learning_rate": 2.916579716193656e-06, | |
| "loss": 1.3204, | |
| "num_input_tokens_seen": 104470488, | |
| "step": 180500, | |
| "train_runtime": 2627.985, | |
| "train_tokens_per_second": 39753.076 | |
| }, | |
| { | |
| "epoch": 9.442821368948247, | |
| "grad_norm": 3.5723414421081543, | |
| "learning_rate": 2.7861540066777965e-06, | |
| "loss": 1.3457, | |
| "num_input_tokens_seen": 104759104, | |
| "step": 181000, | |
| "train_runtime": 2646.9218, | |
| "train_tokens_per_second": 39577.71 | |
| }, | |
| { | |
| "epoch": 9.468906510851419, | |
| "grad_norm": 3.956160068511963, | |
| "learning_rate": 2.655728297161937e-06, | |
| "loss": 1.3717, | |
| "num_input_tokens_seen": 105044408, | |
| "step": 181500, | |
| "train_runtime": 2665.9278, | |
| "train_tokens_per_second": 39402.571 | |
| }, | |
| { | |
| "epoch": 9.49499165275459, | |
| "grad_norm": 2.565819025039673, | |
| "learning_rate": 2.525302587646077e-06, | |
| "loss": 1.3413, | |
| "num_input_tokens_seen": 105327088, | |
| "step": 182000, | |
| "train_runtime": 2684.8097, | |
| "train_tokens_per_second": 39230.746 | |
| }, | |
| { | |
| "epoch": 9.521076794657763, | |
| "grad_norm": 3.5526235103607178, | |
| "learning_rate": 2.3948768781302173e-06, | |
| "loss": 1.2786, | |
| "num_input_tokens_seen": 105615560, | |
| "step": 182500, | |
| "train_runtime": 2703.8493, | |
| "train_tokens_per_second": 39061.185 | |
| }, | |
| { | |
| "epoch": 9.547161936560935, | |
| "grad_norm": 2.816168785095215, | |
| "learning_rate": 2.264451168614357e-06, | |
| "loss": 1.3268, | |
| "num_input_tokens_seen": 105904984, | |
| "step": 183000, | |
| "train_runtime": 2722.6828, | |
| "train_tokens_per_second": 38897.29 | |
| }, | |
| { | |
| "epoch": 9.573247078464107, | |
| "grad_norm": 3.1430675983428955, | |
| "learning_rate": 2.1340254590984976e-06, | |
| "loss": 1.3181, | |
| "num_input_tokens_seen": 106197728, | |
| "step": 183500, | |
| "train_runtime": 2741.6799, | |
| "train_tokens_per_second": 38734.547 | |
| }, | |
| { | |
| "epoch": 9.599332220367279, | |
| "grad_norm": 3.099498748779297, | |
| "learning_rate": 2.0035997495826376e-06, | |
| "loss": 1.321, | |
| "num_input_tokens_seen": 106489536, | |
| "step": 184000, | |
| "train_runtime": 2760.6349, | |
| "train_tokens_per_second": 38574.292 | |
| }, | |
| { | |
| "epoch": 9.62541736227045, | |
| "grad_norm": 3.0963542461395264, | |
| "learning_rate": 1.8731740400667781e-06, | |
| "loss": 1.3177, | |
| "num_input_tokens_seen": 106779640, | |
| "step": 184500, | |
| "train_runtime": 2779.5118, | |
| "train_tokens_per_second": 38416.688 | |
| }, | |
| { | |
| "epoch": 9.651502504173623, | |
| "grad_norm": 2.6030497550964355, | |
| "learning_rate": 1.742748330550918e-06, | |
| "loss": 1.3052, | |
| "num_input_tokens_seen": 107073888, | |
| "step": 185000, | |
| "train_runtime": 2798.5054, | |
| "train_tokens_per_second": 38261.097 | |
| }, | |
| { | |
| "epoch": 9.677587646076795, | |
| "grad_norm": 3.022160768508911, | |
| "learning_rate": 1.6123226210350585e-06, | |
| "loss": 1.3436, | |
| "num_input_tokens_seen": 107364848, | |
| "step": 185500, | |
| "train_runtime": 2817.3417, | |
| "train_tokens_per_second": 38108.565 | |
| }, | |
| { | |
| "epoch": 9.703672787979967, | |
| "grad_norm": 2.626763105392456, | |
| "learning_rate": 1.4818969115191989e-06, | |
| "loss": 1.3352, | |
| "num_input_tokens_seen": 107659488, | |
| "step": 186000, | |
| "train_runtime": 2836.337, | |
| "train_tokens_per_second": 37957.227 | |
| }, | |
| { | |
| "epoch": 9.729757929883139, | |
| "grad_norm": 3.0171899795532227, | |
| "learning_rate": 1.351471202003339e-06, | |
| "loss": 1.317, | |
| "num_input_tokens_seen": 107949608, | |
| "step": 186500, | |
| "train_runtime": 2855.247, | |
| "train_tokens_per_second": 37807.45 | |
| }, | |
| { | |
| "epoch": 9.75584307178631, | |
| "grad_norm": 2.22269868850708, | |
| "learning_rate": 1.2210454924874792e-06, | |
| "loss": 1.3193, | |
| "num_input_tokens_seen": 108245936, | |
| "step": 187000, | |
| "train_runtime": 2874.2964, | |
| "train_tokens_per_second": 37659.977 | |
| }, | |
| { | |
| "epoch": 9.781928213689483, | |
| "grad_norm": 2.8673713207244873, | |
| "learning_rate": 1.0906197829716196e-06, | |
| "loss": 1.3392, | |
| "num_input_tokens_seen": 108539552, | |
| "step": 187500, | |
| "train_runtime": 2893.1693, | |
| "train_tokens_per_second": 37515.798 | |
| }, | |
| { | |
| "epoch": 9.808013355592655, | |
| "grad_norm": 2.645888566970825, | |
| "learning_rate": 9.601940734557598e-07, | |
| "loss": 1.3395, | |
| "num_input_tokens_seen": 108827736, | |
| "step": 188000, | |
| "train_runtime": 2912.2618, | |
| "train_tokens_per_second": 37368.802 | |
| }, | |
| { | |
| "epoch": 9.834098497495827, | |
| "grad_norm": 3.0480117797851562, | |
| "learning_rate": 8.297683639398999e-07, | |
| "loss": 1.3325, | |
| "num_input_tokens_seen": 109119720, | |
| "step": 188500, | |
| "train_runtime": 2931.2115, | |
| "train_tokens_per_second": 37226.832 | |
| }, | |
| { | |
| "epoch": 9.860183639398999, | |
| "grad_norm": 3.1074326038360596, | |
| "learning_rate": 6.993426544240401e-07, | |
| "loss": 1.3365, | |
| "num_input_tokens_seen": 109406600, | |
| "step": 189000, | |
| "train_runtime": 2950.1291, | |
| "train_tokens_per_second": 37085.361 | |
| }, | |
| { | |
| "epoch": 9.88626878130217, | |
| "grad_norm": 2.7331807613372803, | |
| "learning_rate": 5.689169449081803e-07, | |
| "loss": 1.346, | |
| "num_input_tokens_seen": 109694976, | |
| "step": 189500, | |
| "train_runtime": 2969.1526, | |
| "train_tokens_per_second": 36944.877 | |
| }, | |
| { | |
| "epoch": 9.912353923205343, | |
| "grad_norm": 2.5716543197631836, | |
| "learning_rate": 4.3849123539232055e-07, | |
| "loss": 1.3331, | |
| "num_input_tokens_seen": 109985584, | |
| "step": 190000, | |
| "train_runtime": 2988.0525, | |
| "train_tokens_per_second": 36808.451 | |
| }, | |
| { | |
| "epoch": 9.938439065108515, | |
| "grad_norm": 2.6166512966156006, | |
| "learning_rate": 3.080655258764608e-07, | |
| "loss": 1.3292, | |
| "num_input_tokens_seen": 110272368, | |
| "step": 190500, | |
| "train_runtime": 3007.0207, | |
| "train_tokens_per_second": 36671.636 | |
| }, | |
| { | |
| "epoch": 9.964524207011687, | |
| "grad_norm": 2.8893744945526123, | |
| "learning_rate": 1.77639816360601e-07, | |
| "loss": 1.3166, | |
| "num_input_tokens_seen": 110557664, | |
| "step": 191000, | |
| "train_runtime": 3025.9068, | |
| "train_tokens_per_second": 36537.035 | |
| }, | |
| { | |
| "epoch": 9.990609348914859, | |
| "grad_norm": 2.441220998764038, | |
| "learning_rate": 4.721410684474124e-08, | |
| "loss": 1.3429, | |
| "num_input_tokens_seen": 110851304, | |
| "step": 191500, | |
| "train_runtime": 3044.8849, | |
| "train_tokens_per_second": 36405.745 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.3094313144683838, | |
| "eval_runtime": 45.924, | |
| "eval_samples_per_second": 834.748, | |
| "eval_steps_per_second": 104.346, | |
| "num_input_tokens_seen": 110955972, | |
| "step": 191680 | |
| }, | |
| { | |
| "epoch": 10.01669449081803, | |
| "grad_norm": 3.2865073680877686, | |
| "learning_rate": 2.495839419866444e-05, | |
| "loss": 1.3149, | |
| "num_input_tokens_seen": 111137740, | |
| "step": 192000, | |
| "train_runtime": 12.5405, | |
| "train_tokens_per_second": 8862327.118 | |
| }, | |
| { | |
| "epoch": 10.042779632721203, | |
| "grad_norm": 1.945192813873291, | |
| "learning_rate": 2.4893181343906512e-05, | |
| "loss": 1.3105, | |
| "num_input_tokens_seen": 111433348, | |
| "step": 192500, | |
| "train_runtime": 31.7519, | |
| "train_tokens_per_second": 3509506.821 | |
| }, | |
| { | |
| "epoch": 10.068864774624373, | |
| "grad_norm": 2.5163190364837646, | |
| "learning_rate": 2.482796848914858e-05, | |
| "loss": 1.3183, | |
| "num_input_tokens_seen": 111720916, | |
| "step": 193000, | |
| "train_runtime": 52.2905, | |
| "train_tokens_per_second": 2136543.334 | |
| }, | |
| { | |
| "epoch": 10.094949916527545, | |
| "grad_norm": 2.6350646018981934, | |
| "learning_rate": 2.4762755634390652e-05, | |
| "loss": 1.3066, | |
| "num_input_tokens_seen": 112012948, | |
| "step": 193500, | |
| "train_runtime": 72.2858, | |
| "train_tokens_per_second": 1549584.477 | |
| }, | |
| { | |
| "epoch": 10.121035058430717, | |
| "grad_norm": 2.0416669845581055, | |
| "learning_rate": 2.4697542779632723e-05, | |
| "loss": 1.3383, | |
| "num_input_tokens_seen": 112299028, | |
| "step": 194000, | |
| "train_runtime": 92.8402, | |
| "train_tokens_per_second": 1209594.915 | |
| }, | |
| { | |
| "epoch": 10.14712020033389, | |
| "grad_norm": 2.219244956970215, | |
| "learning_rate": 2.463232992487479e-05, | |
| "loss": 1.3436, | |
| "num_input_tokens_seen": 112590044, | |
| "step": 194500, | |
| "train_runtime": 113.0832, | |
| "train_tokens_per_second": 995639.026 | |
| }, | |
| { | |
| "epoch": 10.173205342237061, | |
| "grad_norm": 3.015204429626465, | |
| "learning_rate": 2.4567117070116863e-05, | |
| "loss": 1.3393, | |
| "num_input_tokens_seen": 112885940, | |
| "step": 195000, | |
| "train_runtime": 133.1713, | |
| "train_tokens_per_second": 847674.91 | |
| }, | |
| { | |
| "epoch": 10.199290484140233, | |
| "grad_norm": 2.1486213207244873, | |
| "learning_rate": 2.4501904215358934e-05, | |
| "loss": 1.3463, | |
| "num_input_tokens_seen": 113169852, | |
| "step": 195500, | |
| "train_runtime": 153.2143, | |
| "train_tokens_per_second": 738637.549 | |
| }, | |
| { | |
| "epoch": 10.225375626043405, | |
| "grad_norm": 2.8701765537261963, | |
| "learning_rate": 2.4436691360601002e-05, | |
| "loss": 1.3125, | |
| "num_input_tokens_seen": 113459500, | |
| "step": 196000, | |
| "train_runtime": 173.4235, | |
| "train_tokens_per_second": 654233.54 | |
| }, | |
| { | |
| "epoch": 10.251460767946577, | |
| "grad_norm": 2.4410154819488525, | |
| "learning_rate": 2.4371478505843074e-05, | |
| "loss": 1.3423, | |
| "num_input_tokens_seen": 113754868, | |
| "step": 196500, | |
| "train_runtime": 193.1046, | |
| "train_tokens_per_second": 589084.328 | |
| }, | |
| { | |
| "epoch": 10.27754590984975, | |
| "grad_norm": 2.3649730682373047, | |
| "learning_rate": 2.4306265651085145e-05, | |
| "loss": 1.3583, | |
| "num_input_tokens_seen": 114041052, | |
| "step": 197000, | |
| "train_runtime": 211.6778, | |
| "train_tokens_per_second": 538748.365 | |
| }, | |
| { | |
| "epoch": 10.303631051752921, | |
| "grad_norm": 2.661882162094116, | |
| "learning_rate": 2.4241052796327213e-05, | |
| "loss": 1.3226, | |
| "num_input_tokens_seen": 114327300, | |
| "step": 197500, | |
| "train_runtime": 230.1549, | |
| "train_tokens_per_second": 496740.752 | |
| }, | |
| { | |
| "epoch": 10.329716193656093, | |
| "grad_norm": 3.2307496070861816, | |
| "learning_rate": 2.4175839941569285e-05, | |
| "loss": 1.3317, | |
| "num_input_tokens_seen": 114614836, | |
| "step": 198000, | |
| "train_runtime": 248.6123, | |
| "train_tokens_per_second": 461018.428 | |
| }, | |
| { | |
| "epoch": 10.355801335559265, | |
| "grad_norm": 2.0446155071258545, | |
| "learning_rate": 2.4110627086811353e-05, | |
| "loss": 1.3289, | |
| "num_input_tokens_seen": 114898460, | |
| "step": 198500, | |
| "train_runtime": 267.0794, | |
| "train_tokens_per_second": 430203.456 | |
| }, | |
| { | |
| "epoch": 10.381886477462437, | |
| "grad_norm": 2.149264335632324, | |
| "learning_rate": 2.4045414232053424e-05, | |
| "loss": 1.3479, | |
| "num_input_tokens_seen": 115190612, | |
| "step": 199000, | |
| "train_runtime": 285.5801, | |
| "train_tokens_per_second": 403356.593 | |
| }, | |
| { | |
| "epoch": 10.40797161936561, | |
| "grad_norm": 2.5007822513580322, | |
| "learning_rate": 2.3980201377295496e-05, | |
| "loss": 1.3398, | |
| "num_input_tokens_seen": 115480604, | |
| "step": 199500, | |
| "train_runtime": 304.0559, | |
| "train_tokens_per_second": 379800.589 | |
| }, | |
| { | |
| "epoch": 10.434056761268781, | |
| "grad_norm": 2.485358238220215, | |
| "learning_rate": 2.3914988522537564e-05, | |
| "loss": 1.3471, | |
| "num_input_tokens_seen": 115772396, | |
| "step": 200000, | |
| "train_runtime": 322.5534, | |
| "train_tokens_per_second": 358924.703 | |
| }, | |
| { | |
| "epoch": 10.460141903171953, | |
| "grad_norm": 3.0661306381225586, | |
| "learning_rate": 2.3849775667779635e-05, | |
| "loss": 1.3459, | |
| "num_input_tokens_seen": 116055028, | |
| "step": 200500, | |
| "train_runtime": 341.0251, | |
| "train_tokens_per_second": 340312.297 | |
| }, | |
| { | |
| "epoch": 10.486227045075125, | |
| "grad_norm": 3.0374038219451904, | |
| "learning_rate": 2.3784562813021703e-05, | |
| "loss": 1.3294, | |
| "num_input_tokens_seen": 116342956, | |
| "step": 201000, | |
| "train_runtime": 359.5564, | |
| "train_tokens_per_second": 323573.635 | |
| }, | |
| { | |
| "epoch": 10.512312186978297, | |
| "grad_norm": 2.4844298362731934, | |
| "learning_rate": 2.371934995826377e-05, | |
| "loss": 1.347, | |
| "num_input_tokens_seen": 116629444, | |
| "step": 201500, | |
| "train_runtime": 378.1517, | |
| "train_tokens_per_second": 308419.724 | |
| }, | |
| { | |
| "epoch": 10.53839732888147, | |
| "grad_norm": 3.5257129669189453, | |
| "learning_rate": 2.3654137103505843e-05, | |
| "loss": 1.3621, | |
| "num_input_tokens_seen": 116918476, | |
| "step": 202000, | |
| "train_runtime": 396.7415, | |
| "train_tokens_per_second": 294696.879 | |
| }, | |
| { | |
| "epoch": 10.564482470784641, | |
| "grad_norm": 2.989980936050415, | |
| "learning_rate": 2.3588924248747914e-05, | |
| "loss": 1.3474, | |
| "num_input_tokens_seen": 117203300, | |
| "step": 202500, | |
| "train_runtime": 415.3093, | |
| "train_tokens_per_second": 282207.249 | |
| }, | |
| { | |
| "epoch": 10.590567612687813, | |
| "grad_norm": 2.9134278297424316, | |
| "learning_rate": 2.3523711393989982e-05, | |
| "loss": 1.3293, | |
| "num_input_tokens_seen": 117490356, | |
| "step": 203000, | |
| "train_runtime": 433.8487, | |
| "train_tokens_per_second": 270809.506 | |
| }, | |
| { | |
| "epoch": 10.616652754590985, | |
| "grad_norm": 3.4408249855041504, | |
| "learning_rate": 2.3458498539232054e-05, | |
| "loss": 1.3323, | |
| "num_input_tokens_seen": 117778116, | |
| "step": 203500, | |
| "train_runtime": 452.4151, | |
| "train_tokens_per_second": 260331.947 | |
| }, | |
| { | |
| "epoch": 10.642737896494157, | |
| "grad_norm": 2.5976977348327637, | |
| "learning_rate": 2.3393285684474125e-05, | |
| "loss": 1.326, | |
| "num_input_tokens_seen": 118066028, | |
| "step": 204000, | |
| "train_runtime": 471.0506, | |
| "train_tokens_per_second": 250644.038 | |
| }, | |
| { | |
| "epoch": 10.66882303839733, | |
| "grad_norm": 2.8414862155914307, | |
| "learning_rate": 2.3328072829716193e-05, | |
| "loss": 1.3268, | |
| "num_input_tokens_seen": 118349812, | |
| "step": 204500, | |
| "train_runtime": 489.6127, | |
| "train_tokens_per_second": 241721.304 | |
| }, | |
| { | |
| "epoch": 10.694908180300501, | |
| "grad_norm": 2.9611923694610596, | |
| "learning_rate": 2.3262859974958265e-05, | |
| "loss": 1.3183, | |
| "num_input_tokens_seen": 118641012, | |
| "step": 205000, | |
| "train_runtime": 508.2873, | |
| "train_tokens_per_second": 233413.297 | |
| }, | |
| { | |
| "epoch": 10.720993322203674, | |
| "grad_norm": 3.3537490367889404, | |
| "learning_rate": 2.3197647120200336e-05, | |
| "loss": 1.3344, | |
| "num_input_tokens_seen": 118928020, | |
| "step": 205500, | |
| "train_runtime": 526.9474, | |
| "train_tokens_per_second": 225692.395 | |
| }, | |
| { | |
| "epoch": 10.747078464106846, | |
| "grad_norm": 2.557131290435791, | |
| "learning_rate": 2.3132434265442404e-05, | |
| "loss": 1.341, | |
| "num_input_tokens_seen": 119221628, | |
| "step": 206000, | |
| "train_runtime": 545.6837, | |
| "train_tokens_per_second": 218481.209 | |
| }, | |
| { | |
| "epoch": 10.773163606010016, | |
| "grad_norm": 3.0086355209350586, | |
| "learning_rate": 2.3067221410684476e-05, | |
| "loss": 1.3298, | |
| "num_input_tokens_seen": 119513436, | |
| "step": 206500, | |
| "train_runtime": 564.4783, | |
| "train_tokens_per_second": 211723.717 | |
| }, | |
| { | |
| "epoch": 10.79924874791319, | |
| "grad_norm": 3.600940227508545, | |
| "learning_rate": 2.3002008555926547e-05, | |
| "loss": 1.3572, | |
| "num_input_tokens_seen": 119801196, | |
| "step": 207000, | |
| "train_runtime": 583.2037, | |
| "train_tokens_per_second": 205419.144 | |
| }, | |
| { | |
| "epoch": 10.82533388981636, | |
| "grad_norm": 2.5225415229797363, | |
| "learning_rate": 2.2936795701168615e-05, | |
| "loss": 1.3173, | |
| "num_input_tokens_seen": 120090740, | |
| "step": 207500, | |
| "train_runtime": 601.9046, | |
| "train_tokens_per_second": 199517.884 | |
| }, | |
| { | |
| "epoch": 10.851419031719532, | |
| "grad_norm": 2.092555046081543, | |
| "learning_rate": 2.2871582846410687e-05, | |
| "loss": 1.3557, | |
| "num_input_tokens_seen": 120377796, | |
| "step": 208000, | |
| "train_runtime": 620.6064, | |
| "train_tokens_per_second": 193968.023 | |
| }, | |
| { | |
| "epoch": 10.877504173622704, | |
| "grad_norm": 2.5600435733795166, | |
| "learning_rate": 2.2806369991652758e-05, | |
| "loss": 1.3432, | |
| "num_input_tokens_seen": 120669548, | |
| "step": 208500, | |
| "train_runtime": 639.2536, | |
| "train_tokens_per_second": 188766.325 | |
| }, | |
| { | |
| "epoch": 10.903589315525876, | |
| "grad_norm": 2.583836793899536, | |
| "learning_rate": 2.2741157136894826e-05, | |
| "loss": 1.3376, | |
| "num_input_tokens_seen": 120961348, | |
| "step": 209000, | |
| "train_runtime": 657.9319, | |
| "train_tokens_per_second": 183850.86 | |
| }, | |
| { | |
| "epoch": 10.929674457429048, | |
| "grad_norm": 3.099386692047119, | |
| "learning_rate": 2.2675944282136898e-05, | |
| "loss": 1.3296, | |
| "num_input_tokens_seen": 121257580, | |
| "step": 209500, | |
| "train_runtime": 676.7145, | |
| "train_tokens_per_second": 179185.735 | |
| }, | |
| { | |
| "epoch": 10.95575959933222, | |
| "grad_norm": 3.329822063446045, | |
| "learning_rate": 2.2610731427378966e-05, | |
| "loss": 1.3424, | |
| "num_input_tokens_seen": 121550684, | |
| "step": 210000, | |
| "train_runtime": 695.457, | |
| "train_tokens_per_second": 174778.155 | |
| }, | |
| { | |
| "epoch": 10.981844741235392, | |
| "grad_norm": 2.160890817642212, | |
| "learning_rate": 2.2545518572621034e-05, | |
| "loss": 1.3391, | |
| "num_input_tokens_seen": 121840244, | |
| "step": 210500, | |
| "train_runtime": 714.1173, | |
| "train_tokens_per_second": 170616.572 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 1.3036798238754272, | |
| "eval_runtime": 45.5874, | |
| "eval_samples_per_second": 840.912, | |
| "eval_steps_per_second": 105.117, | |
| "num_input_tokens_seen": 122042976, | |
| "step": 210848 | |
| }, | |
| { | |
| "epoch": 11.007929883138564, | |
| "grad_norm": 2.8093433380126953, | |
| "learning_rate": 2.2480305717863105e-05, | |
| "loss": 1.3252, | |
| "num_input_tokens_seen": 122133808, | |
| "step": 211000, | |
| "train_runtime": 779.5622, | |
| "train_tokens_per_second": 156669.744 | |
| }, | |
| { | |
| "epoch": 11.034015025041736, | |
| "grad_norm": 2.5687525272369385, | |
| "learning_rate": 2.2415092863105177e-05, | |
| "loss": 1.3285, | |
| "num_input_tokens_seen": 122424408, | |
| "step": 211500, | |
| "train_runtime": 798.366, | |
| "train_tokens_per_second": 153343.713 | |
| }, | |
| { | |
| "epoch": 11.060100166944908, | |
| "grad_norm": 2.920220136642456, | |
| "learning_rate": 2.2349880008347245e-05, | |
| "loss": 1.2892, | |
| "num_input_tokens_seen": 122706872, | |
| "step": 212000, | |
| "train_runtime": 817.1043, | |
| "train_tokens_per_second": 150172.829 | |
| }, | |
| { | |
| "epoch": 11.08618530884808, | |
| "grad_norm": 2.7014081478118896, | |
| "learning_rate": 2.2284667153589316e-05, | |
| "loss": 1.3207, | |
| "num_input_tokens_seen": 122993992, | |
| "step": 212500, | |
| "train_runtime": 835.8914, | |
| "train_tokens_per_second": 147141.106 | |
| }, | |
| { | |
| "epoch": 11.112270450751252, | |
| "grad_norm": 2.6697499752044678, | |
| "learning_rate": 2.2219454298831388e-05, | |
| "loss": 1.3299, | |
| "num_input_tokens_seen": 123284616, | |
| "step": 213000, | |
| "train_runtime": 854.6172, | |
| "train_tokens_per_second": 144257.114 | |
| }, | |
| { | |
| "epoch": 11.138355592654424, | |
| "grad_norm": 3.0389206409454346, | |
| "learning_rate": 2.2154241444073456e-05, | |
| "loss": 1.3267, | |
| "num_input_tokens_seen": 123574760, | |
| "step": 213500, | |
| "train_runtime": 873.3482, | |
| "train_tokens_per_second": 141495.405 | |
| }, | |
| { | |
| "epoch": 11.164440734557596, | |
| "grad_norm": 2.5090649127960205, | |
| "learning_rate": 2.2089028589315527e-05, | |
| "loss": 1.3173, | |
| "num_input_tokens_seen": 123863512, | |
| "step": 214000, | |
| "train_runtime": 892.119, | |
| "train_tokens_per_second": 138841.92 | |
| }, | |
| { | |
| "epoch": 11.190525876460768, | |
| "grad_norm": 2.458717107772827, | |
| "learning_rate": 2.2023815734557595e-05, | |
| "loss": 1.3488, | |
| "num_input_tokens_seen": 124153280, | |
| "step": 214500, | |
| "train_runtime": 910.8704, | |
| "train_tokens_per_second": 136301.807 | |
| }, | |
| { | |
| "epoch": 11.21661101836394, | |
| "grad_norm": 2.2780613899230957, | |
| "learning_rate": 2.1958602879799667e-05, | |
| "loss": 1.3227, | |
| "num_input_tokens_seen": 124441304, | |
| "step": 215000, | |
| "train_runtime": 929.5347, | |
| "train_tokens_per_second": 133874.841 | |
| }, | |
| { | |
| "epoch": 11.242696160267112, | |
| "grad_norm": 2.2592554092407227, | |
| "learning_rate": 2.189339002504174e-05, | |
| "loss": 1.3417, | |
| "num_input_tokens_seen": 124732192, | |
| "step": 215500, | |
| "train_runtime": 948.4081, | |
| "train_tokens_per_second": 131517.428 | |
| }, | |
| { | |
| "epoch": 11.268781302170284, | |
| "grad_norm": 1.9092062711715698, | |
| "learning_rate": 2.1828177170283806e-05, | |
| "loss": 1.3168, | |
| "num_input_tokens_seen": 125026840, | |
| "step": 216000, | |
| "train_runtime": 967.1853, | |
| "train_tokens_per_second": 129268.756 | |
| }, | |
| { | |
| "epoch": 11.294866444073456, | |
| "grad_norm": 2.6668968200683594, | |
| "learning_rate": 2.1762964315525878e-05, | |
| "loss": 1.3158, | |
| "num_input_tokens_seen": 125322792, | |
| "step": 216500, | |
| "train_runtime": 985.9404, | |
| "train_tokens_per_second": 127109.902 | |
| }, | |
| { | |
| "epoch": 11.320951585976628, | |
| "grad_norm": 2.6406455039978027, | |
| "learning_rate": 2.169775146076795e-05, | |
| "loss": 1.3155, | |
| "num_input_tokens_seen": 125610912, | |
| "step": 217000, | |
| "train_runtime": 1004.7846, | |
| "train_tokens_per_second": 125012.78 | |
| }, | |
| { | |
| "epoch": 11.3470367278798, | |
| "grad_norm": 3.033663272857666, | |
| "learning_rate": 2.1632538606010017e-05, | |
| "loss": 1.3048, | |
| "num_input_tokens_seen": 125899904, | |
| "step": 217500, | |
| "train_runtime": 1023.5588, | |
| "train_tokens_per_second": 123002.125 | |
| }, | |
| { | |
| "epoch": 11.373121869782972, | |
| "grad_norm": 2.4079842567443848, | |
| "learning_rate": 2.156732575125209e-05, | |
| "loss": 1.3217, | |
| "num_input_tokens_seen": 126190608, | |
| "step": 218000, | |
| "train_runtime": 1042.2822, | |
| "train_tokens_per_second": 121071.437 | |
| }, | |
| { | |
| "epoch": 11.399207011686144, | |
| "grad_norm": 2.4821534156799316, | |
| "learning_rate": 2.150211289649416e-05, | |
| "loss": 1.3127, | |
| "num_input_tokens_seen": 126477736, | |
| "step": 218500, | |
| "train_runtime": 1060.9849, | |
| "train_tokens_per_second": 119207.852 | |
| }, | |
| { | |
| "epoch": 11.425292153589316, | |
| "grad_norm": 3.1184568405151367, | |
| "learning_rate": 2.143690004173623e-05, | |
| "loss": 1.3191, | |
| "num_input_tokens_seen": 126768304, | |
| "step": 219000, | |
| "train_runtime": 1079.744, | |
| "train_tokens_per_second": 117405.884 | |
| }, | |
| { | |
| "epoch": 11.451377295492488, | |
| "grad_norm": 2.4726860523223877, | |
| "learning_rate": 2.1371687186978297e-05, | |
| "loss": 1.3, | |
| "num_input_tokens_seen": 127057344, | |
| "step": 219500, | |
| "train_runtime": 1098.4724, | |
| "train_tokens_per_second": 115667.311 | |
| }, | |
| { | |
| "epoch": 11.47746243739566, | |
| "grad_norm": 2.8745577335357666, | |
| "learning_rate": 2.1306474332220368e-05, | |
| "loss": 1.3066, | |
| "num_input_tokens_seen": 127342264, | |
| "step": 220000, | |
| "train_runtime": 1117.2372, | |
| "train_tokens_per_second": 113979.609 | |
| }, | |
| { | |
| "epoch": 11.503547579298832, | |
| "grad_norm": 2.5106630325317383, | |
| "learning_rate": 2.1241261477462436e-05, | |
| "loss": 1.3081, | |
| "num_input_tokens_seen": 127636384, | |
| "step": 220500, | |
| "train_runtime": 1136.0017, | |
| "train_tokens_per_second": 112355.806 | |
| }, | |
| { | |
| "epoch": 11.529632721202002, | |
| "grad_norm": 2.9184515476226807, | |
| "learning_rate": 2.1176048622704508e-05, | |
| "loss": 1.3162, | |
| "num_input_tokens_seen": 127929168, | |
| "step": 221000, | |
| "train_runtime": 1154.8123, | |
| "train_tokens_per_second": 110779.183 | |
| }, | |
| { | |
| "epoch": 11.555717863105176, | |
| "grad_norm": 2.631758689880371, | |
| "learning_rate": 2.111083576794658e-05, | |
| "loss": 1.3154, | |
| "num_input_tokens_seen": 128214768, | |
| "step": 221500, | |
| "train_runtime": 1173.5738, | |
| "train_tokens_per_second": 109251.562 | |
| }, | |
| { | |
| "epoch": 11.581803005008346, | |
| "grad_norm": 3.0632224082946777, | |
| "learning_rate": 2.1045622913188647e-05, | |
| "loss": 1.3265, | |
| "num_input_tokens_seen": 128502040, | |
| "step": 222000, | |
| "train_runtime": 1192.3765, | |
| "train_tokens_per_second": 107769.681 | |
| }, | |
| { | |
| "epoch": 11.607888146911518, | |
| "grad_norm": 3.1149165630340576, | |
| "learning_rate": 2.098041005843072e-05, | |
| "loss": 1.321, | |
| "num_input_tokens_seen": 128788576, | |
| "step": 222500, | |
| "train_runtime": 1211.1873, | |
| "train_tokens_per_second": 106332.503 | |
| }, | |
| { | |
| "epoch": 11.63397328881469, | |
| "grad_norm": 3.4126601219177246, | |
| "learning_rate": 2.091519720367279e-05, | |
| "loss": 1.3089, | |
| "num_input_tokens_seen": 129075456, | |
| "step": 223000, | |
| "train_runtime": 1229.9696, | |
| "train_tokens_per_second": 104941.986 | |
| }, | |
| { | |
| "epoch": 11.660058430717863, | |
| "grad_norm": 2.5633208751678467, | |
| "learning_rate": 2.0849984348914858e-05, | |
| "loss": 1.3354, | |
| "num_input_tokens_seen": 129363864, | |
| "step": 223500, | |
| "train_runtime": 1248.7371, | |
| "train_tokens_per_second": 103595.756 | |
| }, | |
| { | |
| "epoch": 11.686143572621035, | |
| "grad_norm": 2.816091775894165, | |
| "learning_rate": 2.078477149415693e-05, | |
| "loss": 1.3338, | |
| "num_input_tokens_seen": 129649336, | |
| "step": 224000, | |
| "train_runtime": 1267.5029, | |
| "train_tokens_per_second": 102287.208 | |
| }, | |
| { | |
| "epoch": 11.712228714524207, | |
| "grad_norm": 3.5613439083099365, | |
| "learning_rate": 2.0719558639399e-05, | |
| "loss": 1.3199, | |
| "num_input_tokens_seen": 129942320, | |
| "step": 224500, | |
| "train_runtime": 1286.259, | |
| "train_tokens_per_second": 101023.451 | |
| }, | |
| { | |
| "epoch": 11.738313856427379, | |
| "grad_norm": 2.822772741317749, | |
| "learning_rate": 2.065434578464107e-05, | |
| "loss": 1.3044, | |
| "num_input_tokens_seen": 130232704, | |
| "step": 225000, | |
| "train_runtime": 1305.0245, | |
| "train_tokens_per_second": 99793.304 | |
| }, | |
| { | |
| "epoch": 11.76439899833055, | |
| "grad_norm": 2.610865592956543, | |
| "learning_rate": 2.058913292988314e-05, | |
| "loss": 1.3334, | |
| "num_input_tokens_seen": 130524424, | |
| "step": 225500, | |
| "train_runtime": 1323.7569, | |
| "train_tokens_per_second": 98601.505 | |
| }, | |
| { | |
| "epoch": 11.790484140233723, | |
| "grad_norm": 2.68410325050354, | |
| "learning_rate": 2.0523920075125212e-05, | |
| "loss": 1.3042, | |
| "num_input_tokens_seen": 130811008, | |
| "step": 226000, | |
| "train_runtime": 1342.504, | |
| "train_tokens_per_second": 97438.079 | |
| }, | |
| { | |
| "epoch": 11.816569282136895, | |
| "grad_norm": 2.4882125854492188, | |
| "learning_rate": 2.045870722036728e-05, | |
| "loss": 1.365, | |
| "num_input_tokens_seen": 131095640, | |
| "step": 226500, | |
| "train_runtime": 1361.2815, | |
| "train_tokens_per_second": 96303.109 | |
| }, | |
| { | |
| "epoch": 11.842654424040067, | |
| "grad_norm": 2.4496724605560303, | |
| "learning_rate": 2.039349436560935e-05, | |
| "loss": 1.3053, | |
| "num_input_tokens_seen": 131380824, | |
| "step": 227000, | |
| "train_runtime": 1380.0428, | |
| "train_tokens_per_second": 95200.546 | |
| }, | |
| { | |
| "epoch": 11.868739565943239, | |
| "grad_norm": 2.1208622455596924, | |
| "learning_rate": 2.032828151085142e-05, | |
| "loss": 1.3387, | |
| "num_input_tokens_seen": 131669800, | |
| "step": 227500, | |
| "train_runtime": 1398.7962, | |
| "train_tokens_per_second": 94130.797 | |
| }, | |
| { | |
| "epoch": 11.89482470784641, | |
| "grad_norm": 2.5656790733337402, | |
| "learning_rate": 2.026306865609349e-05, | |
| "loss": 1.3109, | |
| "num_input_tokens_seen": 131956504, | |
| "step": 228000, | |
| "train_runtime": 1417.5824, | |
| "train_tokens_per_second": 93085.598 | |
| }, | |
| { | |
| "epoch": 11.920909849749583, | |
| "grad_norm": 2.894057035446167, | |
| "learning_rate": 2.019785580133556e-05, | |
| "loss": 1.3385, | |
| "num_input_tokens_seen": 132249552, | |
| "step": 228500, | |
| "train_runtime": 1436.3884, | |
| "train_tokens_per_second": 92070.886 | |
| }, | |
| { | |
| "epoch": 11.946994991652755, | |
| "grad_norm": 4.0213446617126465, | |
| "learning_rate": 2.013264294657763e-05, | |
| "loss": 1.3252, | |
| "num_input_tokens_seen": 132541072, | |
| "step": 229000, | |
| "train_runtime": 1455.1937, | |
| "train_tokens_per_second": 91081.394 | |
| }, | |
| { | |
| "epoch": 11.973080133555927, | |
| "grad_norm": 2.279191255569458, | |
| "learning_rate": 2.00674300918197e-05, | |
| "loss": 1.3362, | |
| "num_input_tokens_seen": 132831104, | |
| "step": 229500, | |
| "train_runtime": 1473.9285, | |
| "train_tokens_per_second": 90120.453 | |
| }, | |
| { | |
| "epoch": 11.999165275459099, | |
| "grad_norm": 2.1568970680236816, | |
| "learning_rate": 2.000221723706177e-05, | |
| "loss": 1.293, | |
| "num_input_tokens_seen": 133123320, | |
| "step": 230000, | |
| "train_runtime": 1492.6888, | |
| "train_tokens_per_second": 89183.575 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 1.303634762763977, | |
| "eval_runtime": 45.533, | |
| "eval_samples_per_second": 841.917, | |
| "eval_steps_per_second": 105.242, | |
| "num_input_tokens_seen": 133131832, | |
| "step": 230016 | |
| }, | |
| { | |
| "epoch": 12.02525041736227, | |
| "grad_norm": 2.564668655395508, | |
| "learning_rate": 1.9937004382303838e-05, | |
| "loss": 1.2803, | |
| "num_input_tokens_seen": 133411856, | |
| "step": 230500, | |
| "train_runtime": 1558.1778, | |
| "train_tokens_per_second": 85620.431 | |
| }, | |
| { | |
| "epoch": 12.051335559265443, | |
| "grad_norm": 1.8836562633514404, | |
| "learning_rate": 1.987179152754591e-05, | |
| "loss": 1.3323, | |
| "num_input_tokens_seen": 133703544, | |
| "step": 231000, | |
| "train_runtime": 1576.9538, | |
| "train_tokens_per_second": 84785.959 | |
| }, | |
| { | |
| "epoch": 12.077420701168615, | |
| "grad_norm": 3.665679693222046, | |
| "learning_rate": 1.980657867278798e-05, | |
| "loss": 1.3101, | |
| "num_input_tokens_seen": 133990048, | |
| "step": 231500, | |
| "train_runtime": 1595.7021, | |
| "train_tokens_per_second": 83969.336 | |
| }, | |
| { | |
| "epoch": 12.103505843071787, | |
| "grad_norm": 2.481233596801758, | |
| "learning_rate": 1.974136581803005e-05, | |
| "loss": 1.3122, | |
| "num_input_tokens_seen": 134279720, | |
| "step": 232000, | |
| "train_runtime": 1614.4866, | |
| "train_tokens_per_second": 83171.778 | |
| }, | |
| { | |
| "epoch": 12.129590984974959, | |
| "grad_norm": 2.0712811946868896, | |
| "learning_rate": 1.967615296327212e-05, | |
| "loss": 1.3191, | |
| "num_input_tokens_seen": 134570152, | |
| "step": 232500, | |
| "train_runtime": 1633.2616, | |
| "train_tokens_per_second": 82393.51 | |
| }, | |
| { | |
| "epoch": 12.15567612687813, | |
| "grad_norm": 2.377253293991089, | |
| "learning_rate": 1.9610940108514192e-05, | |
| "loss": 1.303, | |
| "num_input_tokens_seen": 134859336, | |
| "step": 233000, | |
| "train_runtime": 1652.0277, | |
| "train_tokens_per_second": 81632.612 | |
| }, | |
| { | |
| "epoch": 12.181761268781303, | |
| "grad_norm": 2.749286651611328, | |
| "learning_rate": 1.954572725375626e-05, | |
| "loss": 1.3219, | |
| "num_input_tokens_seen": 135151088, | |
| "step": 233500, | |
| "train_runtime": 1670.9562, | |
| "train_tokens_per_second": 80882.482 | |
| }, | |
| { | |
| "epoch": 12.207846410684475, | |
| "grad_norm": 1.9715009927749634, | |
| "learning_rate": 1.948051439899833e-05, | |
| "loss": 1.3164, | |
| "num_input_tokens_seen": 135441304, | |
| "step": 234000, | |
| "train_runtime": 1689.8315, | |
| "train_tokens_per_second": 80150.776 | |
| }, | |
| { | |
| "epoch": 12.233931552587647, | |
| "grad_norm": 2.8835082054138184, | |
| "learning_rate": 1.9415301544240403e-05, | |
| "loss": 1.3164, | |
| "num_input_tokens_seen": 135728888, | |
| "step": 234500, | |
| "train_runtime": 1708.6414, | |
| "train_tokens_per_second": 79436.731 | |
| }, | |
| { | |
| "epoch": 12.260016694490819, | |
| "grad_norm": 2.7887117862701416, | |
| "learning_rate": 1.935008868948247e-05, | |
| "loss": 1.3003, | |
| "num_input_tokens_seen": 136016392, | |
| "step": 235000, | |
| "train_runtime": 1727.3834, | |
| "train_tokens_per_second": 78741.287 | |
| }, | |
| { | |
| "epoch": 12.28610183639399, | |
| "grad_norm": 2.219428777694702, | |
| "learning_rate": 1.9284875834724543e-05, | |
| "loss": 1.2853, | |
| "num_input_tokens_seen": 136304528, | |
| "step": 235500, | |
| "train_runtime": 1746.1346, | |
| "train_tokens_per_second": 78060.723 | |
| }, | |
| { | |
| "epoch": 12.312186978297161, | |
| "grad_norm": 2.7682409286499023, | |
| "learning_rate": 1.9219662979966614e-05, | |
| "loss": 1.3175, | |
| "num_input_tokens_seen": 136593504, | |
| "step": 236000, | |
| "train_runtime": 1764.8652, | |
| "train_tokens_per_second": 77395.999 | |
| }, | |
| { | |
| "epoch": 12.338272120200333, | |
| "grad_norm": 4.289463520050049, | |
| "learning_rate": 1.9154450125208682e-05, | |
| "loss": 1.2741, | |
| "num_input_tokens_seen": 136885144, | |
| "step": 236500, | |
| "train_runtime": 1783.6052, | |
| "train_tokens_per_second": 76746.323 | |
| }, | |
| { | |
| "epoch": 12.364357262103505, | |
| "grad_norm": 3.1798133850097656, | |
| "learning_rate": 1.9089237270450754e-05, | |
| "loss": 1.2896, | |
| "num_input_tokens_seen": 137168736, | |
| "step": 237000, | |
| "train_runtime": 1802.3604, | |
| "train_tokens_per_second": 76105.055 | |
| }, | |
| { | |
| "epoch": 12.390442404006677, | |
| "grad_norm": 3.9631903171539307, | |
| "learning_rate": 1.9024024415692822e-05, | |
| "loss": 1.3425, | |
| "num_input_tokens_seen": 137463960, | |
| "step": 237500, | |
| "train_runtime": 1821.2214, | |
| "train_tokens_per_second": 75478.997 | |
| }, | |
| { | |
| "epoch": 12.41652754590985, | |
| "grad_norm": 3.6029210090637207, | |
| "learning_rate": 1.8958811560934893e-05, | |
| "loss": 1.3134, | |
| "num_input_tokens_seen": 137751968, | |
| "step": 238000, | |
| "train_runtime": 1839.9397, | |
| "train_tokens_per_second": 74867.655 | |
| }, | |
| { | |
| "epoch": 12.442612687813021, | |
| "grad_norm": 2.178394317626953, | |
| "learning_rate": 1.889359870617696e-05, | |
| "loss": 1.2797, | |
| "num_input_tokens_seen": 138044520, | |
| "step": 238500, | |
| "train_runtime": 1858.64, | |
| "train_tokens_per_second": 74271.788 | |
| }, | |
| { | |
| "epoch": 12.468697829716193, | |
| "grad_norm": 2.5995266437530518, | |
| "learning_rate": 1.8828385851419033e-05, | |
| "loss": 1.3029, | |
| "num_input_tokens_seen": 138334136, | |
| "step": 239000, | |
| "train_runtime": 1877.3231, | |
| "train_tokens_per_second": 73686.909 | |
| }, | |
| { | |
| "epoch": 12.494782971619365, | |
| "grad_norm": 2.1378602981567383, | |
| "learning_rate": 1.87631729966611e-05, | |
| "loss": 1.3092, | |
| "num_input_tokens_seen": 138621760, | |
| "step": 239500, | |
| "train_runtime": 1895.9609, | |
| "train_tokens_per_second": 73114.252 | |
| }, | |
| { | |
| "epoch": 12.520868113522537, | |
| "grad_norm": 2.3101305961608887, | |
| "learning_rate": 1.8697960141903172e-05, | |
| "loss": 1.3457, | |
| "num_input_tokens_seen": 138914632, | |
| "step": 240000, | |
| "train_runtime": 1914.6876, | |
| "train_tokens_per_second": 72552.113 | |
| }, | |
| { | |
| "epoch": 12.54695325542571, | |
| "grad_norm": 2.8269946575164795, | |
| "learning_rate": 1.8632747287145244e-05, | |
| "loss": 1.3055, | |
| "num_input_tokens_seen": 139199064, | |
| "step": 240500, | |
| "train_runtime": 1933.403, | |
| "train_tokens_per_second": 71996.923 | |
| }, | |
| { | |
| "epoch": 12.573038397328881, | |
| "grad_norm": 4.536306858062744, | |
| "learning_rate": 1.8567534432387312e-05, | |
| "loss": 1.3104, | |
| "num_input_tokens_seen": 139488888, | |
| "step": 241000, | |
| "train_runtime": 1952.205, | |
| "train_tokens_per_second": 71451.969 | |
| }, | |
| { | |
| "epoch": 12.599123539232053, | |
| "grad_norm": 2.898843765258789, | |
| "learning_rate": 1.8502321577629383e-05, | |
| "loss": 1.2751, | |
| "num_input_tokens_seen": 139777560, | |
| "step": 241500, | |
| "train_runtime": 1970.9694, | |
| "train_tokens_per_second": 70918.18 | |
| }, | |
| { | |
| "epoch": 12.625208681135225, | |
| "grad_norm": 2.233572006225586, | |
| "learning_rate": 1.8437108722871455e-05, | |
| "loss": 1.2931, | |
| "num_input_tokens_seen": 140065240, | |
| "step": 242000, | |
| "train_runtime": 1989.7056, | |
| "train_tokens_per_second": 70394.956 | |
| }, | |
| { | |
| "epoch": 12.651293823038397, | |
| "grad_norm": 4.327518939971924, | |
| "learning_rate": 1.8371895868113523e-05, | |
| "loss": 1.2964, | |
| "num_input_tokens_seen": 140353912, | |
| "step": 242500, | |
| "train_runtime": 2008.5433, | |
| "train_tokens_per_second": 69878.458 | |
| }, | |
| { | |
| "epoch": 12.67737896494157, | |
| "grad_norm": 2.5169992446899414, | |
| "learning_rate": 1.8306683013355594e-05, | |
| "loss": 1.3056, | |
| "num_input_tokens_seen": 140643424, | |
| "step": 243000, | |
| "train_runtime": 2027.3392, | |
| "train_tokens_per_second": 69373.405 | |
| }, | |
| { | |
| "epoch": 12.703464106844741, | |
| "grad_norm": 2.1607372760772705, | |
| "learning_rate": 1.8241470158597666e-05, | |
| "loss": 1.2978, | |
| "num_input_tokens_seen": 140936888, | |
| "step": 243500, | |
| "train_runtime": 2046.1461, | |
| "train_tokens_per_second": 68879.19 | |
| }, | |
| { | |
| "epoch": 12.729549248747913, | |
| "grad_norm": 3.104569673538208, | |
| "learning_rate": 1.8176257303839734e-05, | |
| "loss": 1.3203, | |
| "num_input_tokens_seen": 141227736, | |
| "step": 244000, | |
| "train_runtime": 2064.9328, | |
| "train_tokens_per_second": 68393.38 | |
| }, | |
| { | |
| "epoch": 12.755634390651085, | |
| "grad_norm": 2.6793630123138428, | |
| "learning_rate": 1.8111044449081805e-05, | |
| "loss": 1.2928, | |
| "num_input_tokens_seen": 141513976, | |
| "step": 244500, | |
| "train_runtime": 2083.7183, | |
| "train_tokens_per_second": 67914.159 | |
| }, | |
| { | |
| "epoch": 12.781719532554257, | |
| "grad_norm": 2.779440402984619, | |
| "learning_rate": 1.8045831594323873e-05, | |
| "loss": 1.2963, | |
| "num_input_tokens_seen": 141801952, | |
| "step": 245000, | |
| "train_runtime": 2102.5169, | |
| "train_tokens_per_second": 67443.905 | |
| }, | |
| { | |
| "epoch": 12.80780467445743, | |
| "grad_norm": 2.685547351837158, | |
| "learning_rate": 1.7980618739565945e-05, | |
| "loss": 1.3113, | |
| "num_input_tokens_seen": 142087288, | |
| "step": 245500, | |
| "train_runtime": 2121.2928, | |
| "train_tokens_per_second": 66981.458 | |
| }, | |
| { | |
| "epoch": 12.833889816360601, | |
| "grad_norm": 3.5041792392730713, | |
| "learning_rate": 1.7915405884808016e-05, | |
| "loss": 1.3062, | |
| "num_input_tokens_seen": 142379312, | |
| "step": 246000, | |
| "train_runtime": 2140.0752, | |
| "train_tokens_per_second": 66530.051 | |
| }, | |
| { | |
| "epoch": 12.859974958263773, | |
| "grad_norm": 3.0701446533203125, | |
| "learning_rate": 1.7850193030050084e-05, | |
| "loss": 1.3036, | |
| "num_input_tokens_seen": 142666568, | |
| "step": 246500, | |
| "train_runtime": 2158.8062, | |
| "train_tokens_per_second": 66085.862 | |
| }, | |
| { | |
| "epoch": 12.886060100166945, | |
| "grad_norm": 1.8722320795059204, | |
| "learning_rate": 1.7784980175292152e-05, | |
| "loss": 1.3004, | |
| "num_input_tokens_seen": 142954624, | |
| "step": 247000, | |
| "train_runtime": 2177.6361, | |
| "train_tokens_per_second": 65646.701 | |
| }, | |
| { | |
| "epoch": 12.912145242070117, | |
| "grad_norm": 3.499333381652832, | |
| "learning_rate": 1.7719767320534224e-05, | |
| "loss": 1.3213, | |
| "num_input_tokens_seen": 143246680, | |
| "step": 247500, | |
| "train_runtime": 2196.4459, | |
| "train_tokens_per_second": 65217.486 | |
| }, | |
| { | |
| "epoch": 12.93823038397329, | |
| "grad_norm": 4.5629353523254395, | |
| "learning_rate": 1.7654554465776292e-05, | |
| "loss": 1.3231, | |
| "num_input_tokens_seen": 143537736, | |
| "step": 248000, | |
| "train_runtime": 2215.2759, | |
| "train_tokens_per_second": 64794.52 | |
| }, | |
| { | |
| "epoch": 12.964315525876462, | |
| "grad_norm": 3.0510342121124268, | |
| "learning_rate": 1.7589341611018363e-05, | |
| "loss": 1.2966, | |
| "num_input_tokens_seen": 143823008, | |
| "step": 248500, | |
| "train_runtime": 2233.9986, | |
| "train_tokens_per_second": 64379.186 | |
| }, | |
| { | |
| "epoch": 12.990400667779634, | |
| "grad_norm": 3.152311325073242, | |
| "learning_rate": 1.7524128756260435e-05, | |
| "loss": 1.2741, | |
| "num_input_tokens_seen": 144116976, | |
| "step": 249000, | |
| "train_runtime": 2252.7592, | |
| "train_tokens_per_second": 63973.537 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 1.3037497997283936, | |
| "eval_runtime": 45.363, | |
| "eval_samples_per_second": 845.072, | |
| "eval_steps_per_second": 105.637, | |
| "num_input_tokens_seen": 144224222, | |
| "step": 249184 | |
| }, | |
| { | |
| "epoch": 13.016485809682806, | |
| "grad_norm": 2.950641632080078, | |
| "learning_rate": 1.7458915901502503e-05, | |
| "loss": 1.2892, | |
| "num_input_tokens_seen": 144404846, | |
| "step": 249500, | |
| "train_runtime": 2317.872, | |
| "train_tokens_per_second": 62300.612 | |
| }, | |
| { | |
| "epoch": 13.042570951585976, | |
| "grad_norm": 3.1258602142333984, | |
| "learning_rate": 1.7393703046744574e-05, | |
| "loss": 1.279, | |
| "num_input_tokens_seen": 144697406, | |
| "step": 250000, | |
| "train_runtime": 2336.6661, | |
| "train_tokens_per_second": 61924.725 | |
| }, | |
| { | |
| "epoch": 13.068656093489148, | |
| "grad_norm": 2.8600733280181885, | |
| "learning_rate": 1.7328490191986646e-05, | |
| "loss": 1.2856, | |
| "num_input_tokens_seen": 144992526, | |
| "step": 250500, | |
| "train_runtime": 2355.4549, | |
| "train_tokens_per_second": 61556.062 | |
| }, | |
| { | |
| "epoch": 13.09474123539232, | |
| "grad_norm": 2.740837335586548, | |
| "learning_rate": 1.7263277337228714e-05, | |
| "loss": 1.2793, | |
| "num_input_tokens_seen": 145286206, | |
| "step": 251000, | |
| "train_runtime": 2374.3019, | |
| "train_tokens_per_second": 61191.125 | |
| }, | |
| { | |
| "epoch": 13.120826377295492, | |
| "grad_norm": 2.514106035232544, | |
| "learning_rate": 1.7198064482470785e-05, | |
| "loss": 1.2966, | |
| "num_input_tokens_seen": 145576638, | |
| "step": 251500, | |
| "train_runtime": 2393.1024, | |
| "train_tokens_per_second": 60831.763 | |
| }, | |
| { | |
| "epoch": 13.146911519198664, | |
| "grad_norm": 2.3407087326049805, | |
| "learning_rate": 1.7132851627712857e-05, | |
| "loss": 1.288, | |
| "num_input_tokens_seen": 145861950, | |
| "step": 252000, | |
| "train_runtime": 2411.8629, | |
| "train_tokens_per_second": 60476.884 | |
| }, | |
| { | |
| "epoch": 13.172996661101836, | |
| "grad_norm": 2.940520763397217, | |
| "learning_rate": 1.7067638772954925e-05, | |
| "loss": 1.2828, | |
| "num_input_tokens_seen": 146153318, | |
| "step": 252500, | |
| "train_runtime": 2430.6861, | |
| "train_tokens_per_second": 60128.423 | |
| }, | |
| { | |
| "epoch": 13.199081803005008, | |
| "grad_norm": 2.352440595626831, | |
| "learning_rate": 1.7002425918196996e-05, | |
| "loss": 1.3483, | |
| "num_input_tokens_seen": 146442846, | |
| "step": 253000, | |
| "train_runtime": 2449.4406, | |
| "train_tokens_per_second": 59786.24 | |
| }, | |
| { | |
| "epoch": 13.22516694490818, | |
| "grad_norm": 3.5476200580596924, | |
| "learning_rate": 1.6937213063439068e-05, | |
| "loss": 1.286, | |
| "num_input_tokens_seen": 146729830, | |
| "step": 253500, | |
| "train_runtime": 2468.227, | |
| "train_tokens_per_second": 59447.462 | |
| }, | |
| { | |
| "epoch": 13.251252086811352, | |
| "grad_norm": 3.1068811416625977, | |
| "learning_rate": 1.6872000208681136e-05, | |
| "loss": 1.2873, | |
| "num_input_tokens_seen": 147026030, | |
| "step": 254000, | |
| "train_runtime": 2486.9722, | |
| "train_tokens_per_second": 59118.484 | |
| }, | |
| { | |
| "epoch": 13.277337228714524, | |
| "grad_norm": 3.000011920928955, | |
| "learning_rate": 1.6806787353923207e-05, | |
| "loss": 1.2832, | |
| "num_input_tokens_seen": 147309830, | |
| "step": 254500, | |
| "train_runtime": 2505.7198, | |
| "train_tokens_per_second": 58789.428 | |
| }, | |
| { | |
| "epoch": 13.303422370617696, | |
| "grad_norm": 3.2478373050689697, | |
| "learning_rate": 1.674157449916528e-05, | |
| "loss": 1.3025, | |
| "num_input_tokens_seen": 147604054, | |
| "step": 255000, | |
| "train_runtime": 2524.5534, | |
| "train_tokens_per_second": 58467.393 | |
| }, | |
| { | |
| "epoch": 13.329507512520868, | |
| "grad_norm": 2.5078775882720947, | |
| "learning_rate": 1.6676361644407347e-05, | |
| "loss": 1.2669, | |
| "num_input_tokens_seen": 147894782, | |
| "step": 255500, | |
| "train_runtime": 2543.3336, | |
| "train_tokens_per_second": 58149.974 | |
| }, | |
| { | |
| "epoch": 13.35559265442404, | |
| "grad_norm": 2.6515934467315674, | |
| "learning_rate": 1.6611148789649415e-05, | |
| "loss": 1.2827, | |
| "num_input_tokens_seen": 148189078, | |
| "step": 256000, | |
| "train_runtime": 2562.0637, | |
| "train_tokens_per_second": 57839.731 | |
| }, | |
| { | |
| "epoch": 13.381677796327212, | |
| "grad_norm": 3.669487237930298, | |
| "learning_rate": 1.6545935934891486e-05, | |
| "loss": 1.3063, | |
| "num_input_tokens_seen": 148477710, | |
| "step": 256500, | |
| "train_runtime": 2580.8969, | |
| "train_tokens_per_second": 57529.5 | |
| }, | |
| { | |
| "epoch": 13.407762938230384, | |
| "grad_norm": 2.5362067222595215, | |
| "learning_rate": 1.6480723080133555e-05, | |
| "loss": 1.311, | |
| "num_input_tokens_seen": 148771438, | |
| "step": 257000, | |
| "train_runtime": 2599.7745, | |
| "train_tokens_per_second": 57224.747 | |
| }, | |
| { | |
| "epoch": 13.433848080133556, | |
| "grad_norm": 1.743450403213501, | |
| "learning_rate": 1.6415510225375626e-05, | |
| "loss": 1.2843, | |
| "num_input_tokens_seen": 149060526, | |
| "step": 257500, | |
| "train_runtime": 2618.5247, | |
| "train_tokens_per_second": 56925.386 | |
| }, | |
| { | |
| "epoch": 13.459933222036728, | |
| "grad_norm": 2.875257968902588, | |
| "learning_rate": 1.6350297370617697e-05, | |
| "loss": 1.2692, | |
| "num_input_tokens_seen": 149346974, | |
| "step": 258000, | |
| "train_runtime": 2637.3123, | |
| "train_tokens_per_second": 56628.474 | |
| }, | |
| { | |
| "epoch": 13.4860183639399, | |
| "grad_norm": 3.3050479888916016, | |
| "learning_rate": 1.6285084515859766e-05, | |
| "loss": 1.2869, | |
| "num_input_tokens_seen": 149633070, | |
| "step": 258500, | |
| "train_runtime": 2656.0943, | |
| "train_tokens_per_second": 56335.751 | |
| }, | |
| { | |
| "epoch": 13.512103505843072, | |
| "grad_norm": 2.2370221614837646, | |
| "learning_rate": 1.6219871661101837e-05, | |
| "loss": 1.3004, | |
| "num_input_tokens_seen": 149926758, | |
| "step": 259000, | |
| "train_runtime": 2674.8246, | |
| "train_tokens_per_second": 56051.06 | |
| }, | |
| { | |
| "epoch": 13.538188647746244, | |
| "grad_norm": 4.20009183883667, | |
| "learning_rate": 1.615465880634391e-05, | |
| "loss": 1.2629, | |
| "num_input_tokens_seen": 150212054, | |
| "step": 259500, | |
| "train_runtime": 2693.5708, | |
| "train_tokens_per_second": 55766.885 | |
| }, | |
| { | |
| "epoch": 13.564273789649416, | |
| "grad_norm": 2.247492551803589, | |
| "learning_rate": 1.6089445951585977e-05, | |
| "loss": 1.3251, | |
| "num_input_tokens_seen": 150502366, | |
| "step": 260000, | |
| "train_runtime": 2712.3292, | |
| "train_tokens_per_second": 55488.237 | |
| }, | |
| { | |
| "epoch": 13.590358931552588, | |
| "grad_norm": 2.1950037479400635, | |
| "learning_rate": 1.6024233096828048e-05, | |
| "loss": 1.2798, | |
| "num_input_tokens_seen": 150787110, | |
| "step": 260500, | |
| "train_runtime": 2731.1083, | |
| "train_tokens_per_second": 55210.959 | |
| }, | |
| { | |
| "epoch": 13.61644407345576, | |
| "grad_norm": 2.5948126316070557, | |
| "learning_rate": 1.5959020242070116e-05, | |
| "loss": 1.2685, | |
| "num_input_tokens_seen": 151072982, | |
| "step": 261000, | |
| "train_runtime": 2749.822, | |
| "train_tokens_per_second": 54939.185 | |
| }, | |
| { | |
| "epoch": 13.642529215358932, | |
| "grad_norm": 3.1042332649230957, | |
| "learning_rate": 1.5893807387312188e-05, | |
| "loss": 1.2917, | |
| "num_input_tokens_seen": 151366958, | |
| "step": 261500, | |
| "train_runtime": 2768.6641, | |
| "train_tokens_per_second": 54671.478 | |
| }, | |
| { | |
| "epoch": 13.668614357262104, | |
| "grad_norm": 2.2142746448516846, | |
| "learning_rate": 1.582859453255426e-05, | |
| "loss": 1.2928, | |
| "num_input_tokens_seen": 151651278, | |
| "step": 262000, | |
| "train_runtime": 2787.4176, | |
| "train_tokens_per_second": 54405.653 | |
| }, | |
| { | |
| "epoch": 13.694699499165276, | |
| "grad_norm": 2.406888008117676, | |
| "learning_rate": 1.5763381677796327e-05, | |
| "loss": 1.2888, | |
| "num_input_tokens_seen": 151940174, | |
| "step": 262500, | |
| "train_runtime": 2806.1465, | |
| "train_tokens_per_second": 54145.488 | |
| }, | |
| { | |
| "epoch": 13.720784641068448, | |
| "grad_norm": 2.989021062850952, | |
| "learning_rate": 1.56981688230384e-05, | |
| "loss": 1.3058, | |
| "num_input_tokens_seen": 152226926, | |
| "step": 263000, | |
| "train_runtime": 2824.9214, | |
| "train_tokens_per_second": 53887.137 | |
| }, | |
| { | |
| "epoch": 13.746869782971618, | |
| "grad_norm": 2.4519472122192383, | |
| "learning_rate": 1.563295596828047e-05, | |
| "loss": 1.3242, | |
| "num_input_tokens_seen": 152519390, | |
| "step": 263500, | |
| "train_runtime": 2843.6976, | |
| "train_tokens_per_second": 53634.181 | |
| }, | |
| { | |
| "epoch": 13.772954924874792, | |
| "grad_norm": 3.375582456588745, | |
| "learning_rate": 1.5567743113522538e-05, | |
| "loss": 1.2878, | |
| "num_input_tokens_seen": 152810446, | |
| "step": 264000, | |
| "train_runtime": 2862.4801, | |
| "train_tokens_per_second": 53383.932 | |
| }, | |
| { | |
| "epoch": 13.799040066777962, | |
| "grad_norm": 2.5288329124450684, | |
| "learning_rate": 1.550253025876461e-05, | |
| "loss": 1.279, | |
| "num_input_tokens_seen": 153100030, | |
| "step": 264500, | |
| "train_runtime": 2881.2536, | |
| "train_tokens_per_second": 53136.604 | |
| }, | |
| { | |
| "epoch": 13.825125208681134, | |
| "grad_norm": 2.273123025894165, | |
| "learning_rate": 1.5437317404006678e-05, | |
| "loss": 1.2912, | |
| "num_input_tokens_seen": 153385646, | |
| "step": 265000, | |
| "train_runtime": 2900.0148, | |
| "train_tokens_per_second": 52891.332 | |
| }, | |
| { | |
| "epoch": 13.851210350584306, | |
| "grad_norm": 5.488306522369385, | |
| "learning_rate": 1.537210454924875e-05, | |
| "loss": 1.3079, | |
| "num_input_tokens_seen": 153672086, | |
| "step": 265500, | |
| "train_runtime": 2918.7985, | |
| "train_tokens_per_second": 52649.091 | |
| }, | |
| { | |
| "epoch": 13.877295492487479, | |
| "grad_norm": 2.2071919441223145, | |
| "learning_rate": 1.5306891694490817e-05, | |
| "loss": 1.3046, | |
| "num_input_tokens_seen": 153960638, | |
| "step": 266000, | |
| "train_runtime": 2937.5127, | |
| "train_tokens_per_second": 52411.906 | |
| }, | |
| { | |
| "epoch": 13.90338063439065, | |
| "grad_norm": 3.046309471130371, | |
| "learning_rate": 1.524167883973289e-05, | |
| "loss": 1.2832, | |
| "num_input_tokens_seen": 154246150, | |
| "step": 266500, | |
| "train_runtime": 2956.2532, | |
| "train_tokens_per_second": 52176.231 | |
| }, | |
| { | |
| "epoch": 13.929465776293823, | |
| "grad_norm": 2.4747865200042725, | |
| "learning_rate": 1.5176465984974958e-05, | |
| "loss": 1.2976, | |
| "num_input_tokens_seen": 154534870, | |
| "step": 267000, | |
| "train_runtime": 2975.0146, | |
| "train_tokens_per_second": 51944.238 | |
| }, | |
| { | |
| "epoch": 13.955550918196995, | |
| "grad_norm": 2.148017168045044, | |
| "learning_rate": 1.511125313021703e-05, | |
| "loss": 1.3016, | |
| "num_input_tokens_seen": 154821518, | |
| "step": 267500, | |
| "train_runtime": 2993.7975, | |
| "train_tokens_per_second": 51714.091 | |
| }, | |
| { | |
| "epoch": 13.981636060100167, | |
| "grad_norm": 2.248180389404297, | |
| "learning_rate": 1.50460402754591e-05, | |
| "loss": 1.2983, | |
| "num_input_tokens_seen": 155115046, | |
| "step": 268000, | |
| "train_runtime": 3012.5625, | |
| "train_tokens_per_second": 51489.403 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 1.2995389699935913, | |
| "eval_runtime": 45.4147, | |
| "eval_samples_per_second": 844.109, | |
| "eval_steps_per_second": 105.516, | |
| "num_input_tokens_seen": 155319448, | |
| "step": 268352 | |
| }, | |
| { | |
| "epoch": 14.007721202003339, | |
| "grad_norm": 3.0312399864196777, | |
| "learning_rate": 1.4980827420701168e-05, | |
| "loss": 1.3027, | |
| "num_input_tokens_seen": 155408024, | |
| "step": 268500, | |
| "train_runtime": 3077.7818, | |
| "train_tokens_per_second": 50493.516 | |
| }, | |
| { | |
| "epoch": 14.03380634390651, | |
| "grad_norm": 4.309081077575684, | |
| "learning_rate": 1.4915614565943239e-05, | |
| "loss": 1.2652, | |
| "num_input_tokens_seen": 155690152, | |
| "step": 269000, | |
| "train_runtime": 3096.5771, | |
| "train_tokens_per_second": 50278.144 | |
| }, | |
| { | |
| "epoch": 14.059891485809683, | |
| "grad_norm": 2.96939754486084, | |
| "learning_rate": 1.485040171118531e-05, | |
| "loss": 1.271, | |
| "num_input_tokens_seen": 155981000, | |
| "step": 269500, | |
| "train_runtime": 3115.3826, | |
| "train_tokens_per_second": 50068.008 | |
| }, | |
| { | |
| "epoch": 14.085976627712855, | |
| "grad_norm": 2.4417145252227783, | |
| "learning_rate": 1.4785188856427379e-05, | |
| "loss": 1.2753, | |
| "num_input_tokens_seen": 156272536, | |
| "step": 270000, | |
| "train_runtime": 3134.181, | |
| "train_tokens_per_second": 49860.724 | |
| }, | |
| { | |
| "epoch": 14.112061769616027, | |
| "grad_norm": 3.6525328159332275, | |
| "learning_rate": 1.471997600166945e-05, | |
| "loss": 1.2708, | |
| "num_input_tokens_seen": 156564232, | |
| "step": 270500, | |
| "train_runtime": 3152.9933, | |
| "train_tokens_per_second": 49655.746 | |
| }, | |
| { | |
| "epoch": 14.138146911519199, | |
| "grad_norm": 2.702702045440674, | |
| "learning_rate": 1.4654763146911522e-05, | |
| "loss": 1.2644, | |
| "num_input_tokens_seen": 156847192, | |
| "step": 271000, | |
| "train_runtime": 3171.767, | |
| "train_tokens_per_second": 49451.045 | |
| }, | |
| { | |
| "epoch": 14.16423205342237, | |
| "grad_norm": 2.738504648208618, | |
| "learning_rate": 1.458955029215359e-05, | |
| "loss": 1.2735, | |
| "num_input_tokens_seen": 157138056, | |
| "step": 271500, | |
| "train_runtime": 3190.5858, | |
| "train_tokens_per_second": 49250.534 | |
| }, | |
| { | |
| "epoch": 14.190317195325543, | |
| "grad_norm": 2.680459976196289, | |
| "learning_rate": 1.4524337437395661e-05, | |
| "loss": 1.2923, | |
| "num_input_tokens_seen": 157427656, | |
| "step": 272000, | |
| "train_runtime": 3209.3923, | |
| "train_tokens_per_second": 49052.17 | |
| }, | |
| { | |
| "epoch": 14.216402337228715, | |
| "grad_norm": 2.5472817420959473, | |
| "learning_rate": 1.4459124582637731e-05, | |
| "loss": 1.2812, | |
| "num_input_tokens_seen": 157714904, | |
| "step": 272500, | |
| "train_runtime": 3228.1634, | |
| "train_tokens_per_second": 48855.924 | |
| }, | |
| { | |
| "epoch": 14.242487479131887, | |
| "grad_norm": 2.909809112548828, | |
| "learning_rate": 1.4393911727879799e-05, | |
| "loss": 1.3002, | |
| "num_input_tokens_seen": 158004216, | |
| "step": 273000, | |
| "train_runtime": 3246.9319, | |
| "train_tokens_per_second": 48662.621 | |
| }, | |
| { | |
| "epoch": 14.268572621035059, | |
| "grad_norm": 3.222720146179199, | |
| "learning_rate": 1.432869887312187e-05, | |
| "loss": 1.2887, | |
| "num_input_tokens_seen": 158292352, | |
| "step": 273500, | |
| "train_runtime": 3265.647, | |
| "train_tokens_per_second": 48471.973 | |
| }, | |
| { | |
| "epoch": 14.29465776293823, | |
| "grad_norm": 1.991113543510437, | |
| "learning_rate": 1.4263486018363942e-05, | |
| "loss": 1.2627, | |
| "num_input_tokens_seen": 158587024, | |
| "step": 274000, | |
| "train_runtime": 3284.4013, | |
| "train_tokens_per_second": 48284.91 | |
| }, | |
| { | |
| "epoch": 14.320742904841403, | |
| "grad_norm": 2.8505282402038574, | |
| "learning_rate": 1.419827316360601e-05, | |
| "loss": 1.2836, | |
| "num_input_tokens_seen": 158886520, | |
| "step": 274500, | |
| "train_runtime": 3303.3083, | |
| "train_tokens_per_second": 48099.209 | |
| }, | |
| { | |
| "epoch": 14.346828046744575, | |
| "grad_norm": 2.9469573497772217, | |
| "learning_rate": 1.4133060308848081e-05, | |
| "loss": 1.2749, | |
| "num_input_tokens_seen": 159177696, | |
| "step": 275000, | |
| "train_runtime": 3322.098, | |
| "train_tokens_per_second": 47914.811 | |
| }, | |
| { | |
| "epoch": 14.372913188647747, | |
| "grad_norm": 4.244631767272949, | |
| "learning_rate": 1.4067847454090153e-05, | |
| "loss": 1.2695, | |
| "num_input_tokens_seen": 159460280, | |
| "step": 275500, | |
| "train_runtime": 3340.8943, | |
| "train_tokens_per_second": 47729.819 | |
| }, | |
| { | |
| "epoch": 14.398998330550919, | |
| "grad_norm": 3.174166440963745, | |
| "learning_rate": 1.4002634599332221e-05, | |
| "loss": 1.2888, | |
| "num_input_tokens_seen": 159745000, | |
| "step": 276000, | |
| "train_runtime": 3359.6609, | |
| "train_tokens_per_second": 47547.953 | |
| }, | |
| { | |
| "epoch": 14.42508347245409, | |
| "grad_norm": 2.760267496109009, | |
| "learning_rate": 1.3937421744574292e-05, | |
| "loss": 1.2714, | |
| "num_input_tokens_seen": 160037624, | |
| "step": 276500, | |
| "train_runtime": 3378.4646, | |
| "train_tokens_per_second": 47369.928 | |
| }, | |
| { | |
| "epoch": 14.451168614357263, | |
| "grad_norm": 3.1717495918273926, | |
| "learning_rate": 1.387220888981636e-05, | |
| "loss": 1.2967, | |
| "num_input_tokens_seen": 160328736, | |
| "step": 277000, | |
| "train_runtime": 3397.3414, | |
| "train_tokens_per_second": 47192.412 | |
| }, | |
| { | |
| "epoch": 14.477253756260435, | |
| "grad_norm": 2.68973708152771, | |
| "learning_rate": 1.380699603505843e-05, | |
| "loss": 1.2688, | |
| "num_input_tokens_seen": 160619656, | |
| "step": 277500, | |
| "train_runtime": 3416.1542, | |
| "train_tokens_per_second": 47017.683 | |
| }, | |
| { | |
| "epoch": 14.503338898163605, | |
| "grad_norm": 2.4333648681640625, | |
| "learning_rate": 1.3741783180300502e-05, | |
| "loss": 1.2797, | |
| "num_input_tokens_seen": 160908592, | |
| "step": 278000, | |
| "train_runtime": 3434.8918, | |
| "train_tokens_per_second": 46845.316 | |
| }, | |
| { | |
| "epoch": 14.529424040066779, | |
| "grad_norm": 2.4637181758880615, | |
| "learning_rate": 1.367657032554257e-05, | |
| "loss": 1.2733, | |
| "num_input_tokens_seen": 161202600, | |
| "step": 278500, | |
| "train_runtime": 3453.6295, | |
| "train_tokens_per_second": 46676.287 | |
| }, | |
| { | |
| "epoch": 14.55550918196995, | |
| "grad_norm": 2.199878215789795, | |
| "learning_rate": 1.3611357470784641e-05, | |
| "loss": 1.2812, | |
| "num_input_tokens_seen": 161493960, | |
| "step": 279000, | |
| "train_runtime": 3472.3475, | |
| "train_tokens_per_second": 46508.583 | |
| }, | |
| { | |
| "epoch": 14.581594323873121, | |
| "grad_norm": 2.7561452388763428, | |
| "learning_rate": 1.3546144616026713e-05, | |
| "loss": 1.2981, | |
| "num_input_tokens_seen": 161780984, | |
| "step": 279500, | |
| "train_runtime": 3491.0873, | |
| "train_tokens_per_second": 46341.146 | |
| }, | |
| { | |
| "epoch": 14.607679465776293, | |
| "grad_norm": 2.5802223682403564, | |
| "learning_rate": 1.348093176126878e-05, | |
| "loss": 1.2772, | |
| "num_input_tokens_seen": 162067272, | |
| "step": 280000, | |
| "train_runtime": 3509.8281, | |
| "train_tokens_per_second": 46175.274 | |
| }, | |
| { | |
| "epoch": 14.633764607679465, | |
| "grad_norm": 2.8847203254699707, | |
| "learning_rate": 1.3415718906510852e-05, | |
| "loss": 1.2868, | |
| "num_input_tokens_seen": 162356640, | |
| "step": 280500, | |
| "train_runtime": 3528.574, | |
| "train_tokens_per_second": 46011.97 | |
| }, | |
| { | |
| "epoch": 14.659849749582637, | |
| "grad_norm": 2.8300564289093018, | |
| "learning_rate": 1.3350506051752924e-05, | |
| "loss": 1.3286, | |
| "num_input_tokens_seen": 162645952, | |
| "step": 281000, | |
| "train_runtime": 3547.3388, | |
| "train_tokens_per_second": 45850.132 | |
| }, | |
| { | |
| "epoch": 14.68593489148581, | |
| "grad_norm": 2.2055959701538086, | |
| "learning_rate": 1.3285293196994992e-05, | |
| "loss": 1.2874, | |
| "num_input_tokens_seen": 162937608, | |
| "step": 281500, | |
| "train_runtime": 3566.1498, | |
| "train_tokens_per_second": 45690.063 | |
| }, | |
| { | |
| "epoch": 14.712020033388981, | |
| "grad_norm": 2.794443368911743, | |
| "learning_rate": 1.3220080342237062e-05, | |
| "loss": 1.2976, | |
| "num_input_tokens_seen": 163226160, | |
| "step": 282000, | |
| "train_runtime": 3584.9392, | |
| "train_tokens_per_second": 45531.081 | |
| }, | |
| { | |
| "epoch": 14.738105175292153, | |
| "grad_norm": 2.3322718143463135, | |
| "learning_rate": 1.3154867487479133e-05, | |
| "loss": 1.3031, | |
| "num_input_tokens_seen": 163520392, | |
| "step": 282500, | |
| "train_runtime": 3603.7244, | |
| "train_tokens_per_second": 45375.388 | |
| }, | |
| { | |
| "epoch": 14.764190317195325, | |
| "grad_norm": 2.4972341060638428, | |
| "learning_rate": 1.3089654632721201e-05, | |
| "loss": 1.2688, | |
| "num_input_tokens_seen": 163814080, | |
| "step": 283000, | |
| "train_runtime": 3622.5289, | |
| "train_tokens_per_second": 45220.917 | |
| }, | |
| { | |
| "epoch": 14.790275459098497, | |
| "grad_norm": 2.5767734050750732, | |
| "learning_rate": 1.3024441777963273e-05, | |
| "loss": 1.2623, | |
| "num_input_tokens_seen": 164098944, | |
| "step": 283500, | |
| "train_runtime": 3641.3406, | |
| "train_tokens_per_second": 45065.531 | |
| }, | |
| { | |
| "epoch": 14.81636060100167, | |
| "grad_norm": 2.557332992553711, | |
| "learning_rate": 1.2959228923205344e-05, | |
| "loss": 1.2782, | |
| "num_input_tokens_seen": 164388472, | |
| "step": 284000, | |
| "train_runtime": 3660.0837, | |
| "train_tokens_per_second": 44913.856 | |
| }, | |
| { | |
| "epoch": 14.842445742904841, | |
| "grad_norm": 2.9156086444854736, | |
| "learning_rate": 1.2894016068447412e-05, | |
| "loss": 1.2929, | |
| "num_input_tokens_seen": 164678824, | |
| "step": 284500, | |
| "train_runtime": 3678.8815, | |
| "train_tokens_per_second": 44763.286 | |
| }, | |
| { | |
| "epoch": 14.868530884808013, | |
| "grad_norm": 2.550926685333252, | |
| "learning_rate": 1.2828803213689484e-05, | |
| "loss": 1.2843, | |
| "num_input_tokens_seen": 164964520, | |
| "step": 285000, | |
| "train_runtime": 3697.6895, | |
| "train_tokens_per_second": 44612.864 | |
| }, | |
| { | |
| "epoch": 14.894616026711185, | |
| "grad_norm": 3.0715761184692383, | |
| "learning_rate": 1.2763590358931555e-05, | |
| "loss": 1.2791, | |
| "num_input_tokens_seen": 165252424, | |
| "step": 285500, | |
| "train_runtime": 3716.4903, | |
| "train_tokens_per_second": 44464.646 | |
| }, | |
| { | |
| "epoch": 14.920701168614357, | |
| "grad_norm": 3.2298481464385986, | |
| "learning_rate": 1.2698377504173623e-05, | |
| "loss": 1.286, | |
| "num_input_tokens_seen": 165546752, | |
| "step": 286000, | |
| "train_runtime": 3735.2292, | |
| "train_tokens_per_second": 44320.373 | |
| }, | |
| { | |
| "epoch": 14.94678631051753, | |
| "grad_norm": 2.6789731979370117, | |
| "learning_rate": 1.2633164649415693e-05, | |
| "loss": 1.2922, | |
| "num_input_tokens_seen": 165831800, | |
| "step": 286500, | |
| "train_runtime": 3754.0295, | |
| "train_tokens_per_second": 44174.346 | |
| }, | |
| { | |
| "epoch": 14.972871452420701, | |
| "grad_norm": 2.6322739124298096, | |
| "learning_rate": 1.2567951794657764e-05, | |
| "loss": 1.2873, | |
| "num_input_tokens_seen": 166125192, | |
| "step": 287000, | |
| "train_runtime": 3772.8414, | |
| "train_tokens_per_second": 44031.852 | |
| }, | |
| { | |
| "epoch": 14.998956594323873, | |
| "grad_norm": 2.762434244155884, | |
| "learning_rate": 1.2502738939899832e-05, | |
| "loss": 1.2715, | |
| "num_input_tokens_seen": 166410264, | |
| "step": 287500, | |
| "train_runtime": 3791.6838, | |
| "train_tokens_per_second": 43888.222 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 1.2970120906829834, | |
| "eval_runtime": 45.5176, | |
| "eval_samples_per_second": 842.201, | |
| "eval_steps_per_second": 105.278, | |
| "num_input_tokens_seen": 166422516, | |
| "step": 287520 | |
| }, | |
| { | |
| "epoch": 15.025041736227045, | |
| "grad_norm": 2.177825927734375, | |
| "learning_rate": 1.2437526085141904e-05, | |
| "loss": 1.2801, | |
| "num_input_tokens_seen": 166697628, | |
| "step": 288000, | |
| "train_runtime": 3857.3769, | |
| "train_tokens_per_second": 43215.282 | |
| }, | |
| { | |
| "epoch": 15.051126878130217, | |
| "grad_norm": 3.206347703933716, | |
| "learning_rate": 1.2372313230383974e-05, | |
| "loss": 1.2709, | |
| "num_input_tokens_seen": 166992924, | |
| "step": 288500, | |
| "train_runtime": 3876.1711, | |
| "train_tokens_per_second": 43081.927 | |
| }, | |
| { | |
| "epoch": 15.07721202003339, | |
| "grad_norm": 2.4079601764678955, | |
| "learning_rate": 1.2307100375626043e-05, | |
| "loss": 1.2744, | |
| "num_input_tokens_seen": 167286132, | |
| "step": 289000, | |
| "train_runtime": 3895.0066, | |
| "train_tokens_per_second": 42948.869 | |
| }, | |
| { | |
| "epoch": 15.103297161936561, | |
| "grad_norm": 1.9692761898040771, | |
| "learning_rate": 1.2241887520868115e-05, | |
| "loss": 1.2559, | |
| "num_input_tokens_seen": 167572372, | |
| "step": 289500, | |
| "train_runtime": 3913.7506, | |
| "train_tokens_per_second": 42816.313 | |
| }, | |
| { | |
| "epoch": 15.129382303839733, | |
| "grad_norm": 2.694408416748047, | |
| "learning_rate": 1.2176674666110185e-05, | |
| "loss": 1.2661, | |
| "num_input_tokens_seen": 167863284, | |
| "step": 290000, | |
| "train_runtime": 3932.5501, | |
| "train_tokens_per_second": 42685.606 | |
| }, | |
| { | |
| "epoch": 15.155467445742905, | |
| "grad_norm": 2.9768283367156982, | |
| "learning_rate": 1.2111461811352254e-05, | |
| "loss": 1.2868, | |
| "num_input_tokens_seen": 168153292, | |
| "step": 290500, | |
| "train_runtime": 3951.2884, | |
| "train_tokens_per_second": 42556.573 | |
| }, | |
| { | |
| "epoch": 15.181552587646078, | |
| "grad_norm": 3.165743112564087, | |
| "learning_rate": 1.2046248956594324e-05, | |
| "loss": 1.2598, | |
| "num_input_tokens_seen": 168442780, | |
| "step": 291000, | |
| "train_runtime": 3970.108, | |
| "train_tokens_per_second": 42427.758 | |
| }, | |
| { | |
| "epoch": 15.20763772954925, | |
| "grad_norm": 2.1122047901153564, | |
| "learning_rate": 1.1981036101836394e-05, | |
| "loss": 1.2777, | |
| "num_input_tokens_seen": 168730764, | |
| "step": 291500, | |
| "train_runtime": 3989.0323, | |
| "train_tokens_per_second": 42298.671 | |
| }, | |
| { | |
| "epoch": 15.233722871452422, | |
| "grad_norm": 2.8908307552337646, | |
| "learning_rate": 1.1915823247078464e-05, | |
| "loss": 1.2524, | |
| "num_input_tokens_seen": 169023804, | |
| "step": 292000, | |
| "train_runtime": 4008.0188, | |
| "train_tokens_per_second": 42171.41 | |
| }, | |
| { | |
| "epoch": 15.259808013355592, | |
| "grad_norm": 5.693580627441406, | |
| "learning_rate": 1.1850610392320535e-05, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 169313124, | |
| "step": 292500, | |
| "train_runtime": 4028.3264, | |
| "train_tokens_per_second": 42030.637 | |
| }, | |
| { | |
| "epoch": 15.285893155258764, | |
| "grad_norm": 2.3008134365081787, | |
| "learning_rate": 1.1785397537562605e-05, | |
| "loss": 1.2828, | |
| "num_input_tokens_seen": 169601124, | |
| "step": 293000, | |
| "train_runtime": 4048.6666, | |
| "train_tokens_per_second": 41890.613 | |
| }, | |
| { | |
| "epoch": 15.311978297161936, | |
| "grad_norm": 2.8285107612609863, | |
| "learning_rate": 1.1720184682804675e-05, | |
| "loss": 1.2528, | |
| "num_input_tokens_seen": 169887028, | |
| "step": 293500, | |
| "train_runtime": 4068.1864, | |
| "train_tokens_per_second": 41759.893 | |
| }, | |
| { | |
| "epoch": 15.338063439065108, | |
| "grad_norm": 2.4193263053894043, | |
| "learning_rate": 1.1654971828046746e-05, | |
| "loss": 1.272, | |
| "num_input_tokens_seen": 170171812, | |
| "step": 294000, | |
| "train_runtime": 4087.6299, | |
| "train_tokens_per_second": 41630.925 | |
| }, | |
| { | |
| "epoch": 15.36414858096828, | |
| "grad_norm": 2.8411006927490234, | |
| "learning_rate": 1.1589758973288816e-05, | |
| "loss": 1.2846, | |
| "num_input_tokens_seen": 170459652, | |
| "step": 294500, | |
| "train_runtime": 4106.8845, | |
| "train_tokens_per_second": 41505.83 | |
| }, | |
| { | |
| "epoch": 15.390233722871452, | |
| "grad_norm": 3.2765908241271973, | |
| "learning_rate": 1.1524546118530886e-05, | |
| "loss": 1.283, | |
| "num_input_tokens_seen": 170746052, | |
| "step": 295000, | |
| "train_runtime": 4125.6554, | |
| "train_tokens_per_second": 41386.407 | |
| }, | |
| { | |
| "epoch": 15.416318864774624, | |
| "grad_norm": 4.315444469451904, | |
| "learning_rate": 1.1459333263772955e-05, | |
| "loss": 1.2499, | |
| "num_input_tokens_seen": 171039820, | |
| "step": 295500, | |
| "train_runtime": 4144.6159, | |
| "train_tokens_per_second": 41267.954 | |
| }, | |
| { | |
| "epoch": 15.442404006677796, | |
| "grad_norm": 2.635226249694824, | |
| "learning_rate": 1.1394120409015025e-05, | |
| "loss": 1.271, | |
| "num_input_tokens_seen": 171325612, | |
| "step": 296000, | |
| "train_runtime": 4164.0018, | |
| "train_tokens_per_second": 41144.461 | |
| }, | |
| { | |
| "epoch": 15.468489148580968, | |
| "grad_norm": 2.699335813522339, | |
| "learning_rate": 1.1328907554257095e-05, | |
| "loss": 1.276, | |
| "num_input_tokens_seen": 171612740, | |
| "step": 296500, | |
| "train_runtime": 4184.2714, | |
| "train_tokens_per_second": 41013.768 | |
| }, | |
| { | |
| "epoch": 15.49457429048414, | |
| "grad_norm": 2.0063083171844482, | |
| "learning_rate": 1.1263694699499165e-05, | |
| "loss": 1.2596, | |
| "num_input_tokens_seen": 171906348, | |
| "step": 297000, | |
| "train_runtime": 4203.2579, | |
| "train_tokens_per_second": 40898.358 | |
| }, | |
| { | |
| "epoch": 15.520659432387312, | |
| "grad_norm": 2.836402654647827, | |
| "learning_rate": 1.1198481844741236e-05, | |
| "loss": 1.2578, | |
| "num_input_tokens_seen": 172189356, | |
| "step": 297500, | |
| "train_runtime": 4222.1833, | |
| "train_tokens_per_second": 40782.066 | |
| }, | |
| { | |
| "epoch": 15.546744574290484, | |
| "grad_norm": 3.0927999019622803, | |
| "learning_rate": 1.1133268989983306e-05, | |
| "loss": 1.2973, | |
| "num_input_tokens_seen": 172482468, | |
| "step": 298000, | |
| "train_runtime": 4241.2002, | |
| "train_tokens_per_second": 40668.316 | |
| }, | |
| { | |
| "epoch": 15.572829716193656, | |
| "grad_norm": 3.955559492111206, | |
| "learning_rate": 1.1068056135225376e-05, | |
| "loss": 1.272, | |
| "num_input_tokens_seen": 172775212, | |
| "step": 298500, | |
| "train_runtime": 4260.8077, | |
| "train_tokens_per_second": 40549.873 | |
| }, | |
| { | |
| "epoch": 15.598914858096828, | |
| "grad_norm": 2.954066753387451, | |
| "learning_rate": 1.1002843280467447e-05, | |
| "loss": 1.2696, | |
| "num_input_tokens_seen": 173066468, | |
| "step": 299000, | |
| "train_runtime": 4279.6208, | |
| "train_tokens_per_second": 40439.674 | |
| }, | |
| { | |
| "epoch": 15.625, | |
| "grad_norm": 2.927549362182617, | |
| "learning_rate": 1.0937630425709517e-05, | |
| "loss": 1.2947, | |
| "num_input_tokens_seen": 173362372, | |
| "step": 299500, | |
| "train_runtime": 4298.4621, | |
| "train_tokens_per_second": 40331.255 | |
| }, | |
| { | |
| "epoch": 15.651085141903172, | |
| "grad_norm": 3.2571945190429688, | |
| "learning_rate": 1.0872417570951587e-05, | |
| "loss": 1.2612, | |
| "num_input_tokens_seen": 173657292, | |
| "step": 300000, | |
| "train_runtime": 4317.6857, | |
| "train_tokens_per_second": 40219.994 | |
| }, | |
| { | |
| "epoch": 15.677170283806344, | |
| "grad_norm": 4.016629695892334, | |
| "learning_rate": 1.0807204716193657e-05, | |
| "loss": 1.2903, | |
| "num_input_tokens_seen": 173953028, | |
| "step": 300500, | |
| "train_runtime": 4337.5188, | |
| "train_tokens_per_second": 40104.27 | |
| }, | |
| { | |
| "epoch": 15.703255425709516, | |
| "grad_norm": 3.677175998687744, | |
| "learning_rate": 1.0741991861435726e-05, | |
| "loss": 1.2654, | |
| "num_input_tokens_seen": 174243612, | |
| "step": 301000, | |
| "train_runtime": 4357.6686, | |
| "train_tokens_per_second": 39985.512 | |
| }, | |
| { | |
| "epoch": 15.729340567612688, | |
| "grad_norm": 2.5401861667633057, | |
| "learning_rate": 1.0676779006677796e-05, | |
| "loss": 1.2785, | |
| "num_input_tokens_seen": 174528492, | |
| "step": 301500, | |
| "train_runtime": 4377.8182, | |
| "train_tokens_per_second": 39866.546 | |
| }, | |
| { | |
| "epoch": 15.75542570951586, | |
| "grad_norm": 3.0386669635772705, | |
| "learning_rate": 1.0611566151919868e-05, | |
| "loss": 1.2672, | |
| "num_input_tokens_seen": 174824740, | |
| "step": 302000, | |
| "train_runtime": 4397.7063, | |
| "train_tokens_per_second": 39753.619 | |
| }, | |
| { | |
| "epoch": 15.781510851419032, | |
| "grad_norm": 2.869920253753662, | |
| "learning_rate": 1.0546353297161937e-05, | |
| "loss": 1.2971, | |
| "num_input_tokens_seen": 175115884, | |
| "step": 302500, | |
| "train_runtime": 4417.5927, | |
| "train_tokens_per_second": 39640.568 | |
| }, | |
| { | |
| "epoch": 15.807595993322204, | |
| "grad_norm": 2.551456928253174, | |
| "learning_rate": 1.0481140442404007e-05, | |
| "loss": 1.2603, | |
| "num_input_tokens_seen": 175404964, | |
| "step": 303000, | |
| "train_runtime": 4437.075, | |
| "train_tokens_per_second": 39531.665 | |
| }, | |
| { | |
| "epoch": 15.833681135225376, | |
| "grad_norm": 2.8451788425445557, | |
| "learning_rate": 1.0415927587646079e-05, | |
| "loss": 1.3059, | |
| "num_input_tokens_seen": 175694332, | |
| "step": 303500, | |
| "train_runtime": 4456.4315, | |
| "train_tokens_per_second": 39424.893 | |
| }, | |
| { | |
| "epoch": 15.859766277128548, | |
| "grad_norm": 3.364713668823242, | |
| "learning_rate": 1.0350714732888148e-05, | |
| "loss": 1.2669, | |
| "num_input_tokens_seen": 175983324, | |
| "step": 304000, | |
| "train_runtime": 4475.8992, | |
| "train_tokens_per_second": 39317.982 | |
| }, | |
| { | |
| "epoch": 15.88585141903172, | |
| "grad_norm": 3.5180881023406982, | |
| "learning_rate": 1.0285501878130218e-05, | |
| "loss": 1.2704, | |
| "num_input_tokens_seen": 176271988, | |
| "step": 304500, | |
| "train_runtime": 4494.9616, | |
| "train_tokens_per_second": 39215.46 | |
| }, | |
| { | |
| "epoch": 15.911936560934892, | |
| "grad_norm": 3.1893362998962402, | |
| "learning_rate": 1.0220289023372288e-05, | |
| "loss": 1.2689, | |
| "num_input_tokens_seen": 176565276, | |
| "step": 305000, | |
| "train_runtime": 4513.98, | |
| "train_tokens_per_second": 39115.21 | |
| }, | |
| { | |
| "epoch": 15.938021702838064, | |
| "grad_norm": 3.272306442260742, | |
| "learning_rate": 1.0155076168614358e-05, | |
| "loss": 1.27, | |
| "num_input_tokens_seen": 176847788, | |
| "step": 305500, | |
| "train_runtime": 4533.1414, | |
| "train_tokens_per_second": 39012.193 | |
| }, | |
| { | |
| "epoch": 15.964106844741236, | |
| "grad_norm": 2.6090383529663086, | |
| "learning_rate": 1.0089863313856427e-05, | |
| "loss": 1.2684, | |
| "num_input_tokens_seen": 177132460, | |
| "step": 306000, | |
| "train_runtime": 4551.9653, | |
| "train_tokens_per_second": 38913.403 | |
| }, | |
| { | |
| "epoch": 15.990191986644408, | |
| "grad_norm": 2.874281644821167, | |
| "learning_rate": 1.0024650459098497e-05, | |
| "loss": 1.2839, | |
| "num_input_tokens_seen": 177417428, | |
| "step": 306500, | |
| "train_runtime": 4571.3016, | |
| "train_tokens_per_second": 38811.141 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 1.2972913980484009, | |
| "eval_runtime": 46.7515, | |
| "eval_samples_per_second": 819.974, | |
| "eval_steps_per_second": 102.499, | |
| "num_input_tokens_seen": 177522072, | |
| "step": 306688 | |
| }, | |
| { | |
| "epoch": 16.01627712854758, | |
| "grad_norm": 2.4503226280212402, | |
| "learning_rate": 9.959437604340569e-06, | |
| "loss": 1.2666, | |
| "num_input_tokens_seen": 177704312, | |
| "step": 307000, | |
| "train_runtime": 4639.2676, | |
| "train_tokens_per_second": 38304.389 | |
| }, | |
| { | |
| "epoch": 16.042362270450752, | |
| "grad_norm": 2.57148814201355, | |
| "learning_rate": 9.894224749582638e-06, | |
| "loss": 1.2827, | |
| "num_input_tokens_seen": 177987984, | |
| "step": 307500, | |
| "train_runtime": 4658.9243, | |
| "train_tokens_per_second": 38203.665 | |
| }, | |
| { | |
| "epoch": 16.068447412353922, | |
| "grad_norm": 2.241555690765381, | |
| "learning_rate": 9.829011894824708e-06, | |
| "loss": 1.2417, | |
| "num_input_tokens_seen": 178276096, | |
| "step": 308000, | |
| "train_runtime": 4678.6926, | |
| "train_tokens_per_second": 38103.828 | |
| }, | |
| { | |
| "epoch": 16.094532554257096, | |
| "grad_norm": 3.140139579772949, | |
| "learning_rate": 9.76379904006678e-06, | |
| "loss": 1.2696, | |
| "num_input_tokens_seen": 178568312, | |
| "step": 308500, | |
| "train_runtime": 4698.0151, | |
| "train_tokens_per_second": 38009.31 | |
| }, | |
| { | |
| "epoch": 16.120617696160267, | |
| "grad_norm": 2.9327456951141357, | |
| "learning_rate": 9.69858618530885e-06, | |
| "loss": 1.2835, | |
| "num_input_tokens_seen": 178856160, | |
| "step": 309000, | |
| "train_runtime": 4717.2135, | |
| "train_tokens_per_second": 37915.638 | |
| }, | |
| { | |
| "epoch": 16.14670283806344, | |
| "grad_norm": 3.2067556381225586, | |
| "learning_rate": 9.633373330550919e-06, | |
| "loss": 1.2688, | |
| "num_input_tokens_seen": 179143944, | |
| "step": 309500, | |
| "train_runtime": 4736.2008, | |
| "train_tokens_per_second": 37824.398 | |
| }, | |
| { | |
| "epoch": 16.17278797996661, | |
| "grad_norm": 2.4767651557922363, | |
| "learning_rate": 9.568160475792989e-06, | |
| "loss": 1.2721, | |
| "num_input_tokens_seen": 179434664, | |
| "step": 310000, | |
| "train_runtime": 4755.3615, | |
| "train_tokens_per_second": 37733.128 | |
| }, | |
| { | |
| "epoch": 16.198873121869784, | |
| "grad_norm": 2.9996862411499023, | |
| "learning_rate": 9.502947621035059e-06, | |
| "loss": 1.2569, | |
| "num_input_tokens_seen": 179724792, | |
| "step": 310500, | |
| "train_runtime": 4774.7367, | |
| "train_tokens_per_second": 37640.776 | |
| }, | |
| { | |
| "epoch": 16.224958263772955, | |
| "grad_norm": 2.587339162826538, | |
| "learning_rate": 9.437734766277128e-06, | |
| "loss": 1.2562, | |
| "num_input_tokens_seen": 180020736, | |
| "step": 311000, | |
| "train_runtime": 4794.2064, | |
| "train_tokens_per_second": 37549.642 | |
| }, | |
| { | |
| "epoch": 16.25104340567613, | |
| "grad_norm": 2.425332546234131, | |
| "learning_rate": 9.3725219115192e-06, | |
| "loss": 1.2859, | |
| "num_input_tokens_seen": 180308088, | |
| "step": 311500, | |
| "train_runtime": 4813.4723, | |
| "train_tokens_per_second": 37459.048 | |
| }, | |
| { | |
| "epoch": 16.2771285475793, | |
| "grad_norm": 3.213170289993286, | |
| "learning_rate": 9.30730905676127e-06, | |
| "loss": 1.2648, | |
| "num_input_tokens_seen": 180593256, | |
| "step": 312000, | |
| "train_runtime": 4832.7472, | |
| "train_tokens_per_second": 37368.653 | |
| }, | |
| { | |
| "epoch": 16.303213689482472, | |
| "grad_norm": 2.971393346786499, | |
| "learning_rate": 9.24209620200334e-06, | |
| "loss": 1.2565, | |
| "num_input_tokens_seen": 180883912, | |
| "step": 312500, | |
| "train_runtime": 4853.0289, | |
| "train_tokens_per_second": 37272.375 | |
| }, | |
| { | |
| "epoch": 16.329298831385643, | |
| "grad_norm": 3.2865586280822754, | |
| "learning_rate": 9.17688334724541e-06, | |
| "loss": 1.2695, | |
| "num_input_tokens_seen": 181172920, | |
| "step": 313000, | |
| "train_runtime": 4872.3486, | |
| "train_tokens_per_second": 37183.899 | |
| }, | |
| { | |
| "epoch": 16.355383973288816, | |
| "grad_norm": 2.691861867904663, | |
| "learning_rate": 9.11167049248748e-06, | |
| "loss": 1.2742, | |
| "num_input_tokens_seen": 181457952, | |
| "step": 313500, | |
| "train_runtime": 4891.6907, | |
| "train_tokens_per_second": 37095.14 | |
| }, | |
| { | |
| "epoch": 16.381469115191987, | |
| "grad_norm": 3.302048444747925, | |
| "learning_rate": 9.04645763772955e-06, | |
| "loss": 1.261, | |
| "num_input_tokens_seen": 181746184, | |
| "step": 314000, | |
| "train_runtime": 4911.0159, | |
| "train_tokens_per_second": 37007.859 | |
| }, | |
| { | |
| "epoch": 16.407554257095157, | |
| "grad_norm": 3.427002191543579, | |
| "learning_rate": 8.981244782971618e-06, | |
| "loss": 1.2763, | |
| "num_input_tokens_seen": 182036728, | |
| "step": 314500, | |
| "train_runtime": 4930.339, | |
| "train_tokens_per_second": 36921.747 | |
| }, | |
| { | |
| "epoch": 16.43363939899833, | |
| "grad_norm": 2.194302558898926, | |
| "learning_rate": 8.91603192821369e-06, | |
| "loss": 1.2347, | |
| "num_input_tokens_seen": 182327360, | |
| "step": 315000, | |
| "train_runtime": 4949.6263, | |
| "train_tokens_per_second": 36836.591 | |
| }, | |
| { | |
| "epoch": 16.4597245409015, | |
| "grad_norm": 2.6108365058898926, | |
| "learning_rate": 8.85081907345576e-06, | |
| "loss": 1.3033, | |
| "num_input_tokens_seen": 182614776, | |
| "step": 315500, | |
| "train_runtime": 4968.861, | |
| "train_tokens_per_second": 36751.839 | |
| }, | |
| { | |
| "epoch": 16.485809682804675, | |
| "grad_norm": 3.398846387863159, | |
| "learning_rate": 8.78560621869783e-06, | |
| "loss": 1.231, | |
| "num_input_tokens_seen": 182898920, | |
| "step": 316000, | |
| "train_runtime": 4988.2986, | |
| "train_tokens_per_second": 36665.592 | |
| }, | |
| { | |
| "epoch": 16.511894824707845, | |
| "grad_norm": 3.175825357437134, | |
| "learning_rate": 8.720393363939901e-06, | |
| "loss": 1.2653, | |
| "num_input_tokens_seen": 183194016, | |
| "step": 316500, | |
| "train_runtime": 5007.4717, | |
| "train_tokens_per_second": 36584.134 | |
| }, | |
| { | |
| "epoch": 16.53797996661102, | |
| "grad_norm": 3.3755290508270264, | |
| "learning_rate": 8.65518050918197e-06, | |
| "loss": 1.2382, | |
| "num_input_tokens_seen": 183486192, | |
| "step": 317000, | |
| "train_runtime": 5026.7596, | |
| "train_tokens_per_second": 36501.883 | |
| }, | |
| { | |
| "epoch": 16.56406510851419, | |
| "grad_norm": 3.120741128921509, | |
| "learning_rate": 8.58996765442404e-06, | |
| "loss": 1.2661, | |
| "num_input_tokens_seen": 183774000, | |
| "step": 317500, | |
| "train_runtime": 5045.8839, | |
| "train_tokens_per_second": 36420.577 | |
| }, | |
| { | |
| "epoch": 16.590150250417363, | |
| "grad_norm": 4.2182440757751465, | |
| "learning_rate": 8.524754799666112e-06, | |
| "loss": 1.254, | |
| "num_input_tokens_seen": 184064816, | |
| "step": 318000, | |
| "train_runtime": 5065.2521, | |
| "train_tokens_per_second": 36338.727 | |
| }, | |
| { | |
| "epoch": 16.616235392320533, | |
| "grad_norm": 3.3010435104370117, | |
| "learning_rate": 8.459541944908182e-06, | |
| "loss": 1.2621, | |
| "num_input_tokens_seen": 184350480, | |
| "step": 318500, | |
| "train_runtime": 5084.6874, | |
| "train_tokens_per_second": 36256.011 | |
| }, | |
| { | |
| "epoch": 16.642320534223707, | |
| "grad_norm": 3.2120778560638428, | |
| "learning_rate": 8.39432909015025e-06, | |
| "loss": 1.2563, | |
| "num_input_tokens_seen": 184642440, | |
| "step": 319000, | |
| "train_runtime": 5103.9372, | |
| "train_tokens_per_second": 36176.472 | |
| }, | |
| { | |
| "epoch": 16.668405676126877, | |
| "grad_norm": 2.9939897060394287, | |
| "learning_rate": 8.329116235392321e-06, | |
| "loss": 1.2594, | |
| "num_input_tokens_seen": 184928112, | |
| "step": 319500, | |
| "train_runtime": 5123.0191, | |
| "train_tokens_per_second": 36097.486 | |
| }, | |
| { | |
| "epoch": 16.69449081803005, | |
| "grad_norm": 3.710550308227539, | |
| "learning_rate": 8.263903380634391e-06, | |
| "loss": 1.2634, | |
| "num_input_tokens_seen": 185211440, | |
| "step": 320000, | |
| "train_runtime": 5142.5889, | |
| "train_tokens_per_second": 36015.214 | |
| }, | |
| { | |
| "epoch": 16.72057595993322, | |
| "grad_norm": 2.5137531757354736, | |
| "learning_rate": 8.19869052587646e-06, | |
| "loss": 1.2601, | |
| "num_input_tokens_seen": 185506864, | |
| "step": 320500, | |
| "train_runtime": 5162.7072, | |
| "train_tokens_per_second": 35932.091 | |
| }, | |
| { | |
| "epoch": 16.746661101836395, | |
| "grad_norm": 4.654266834259033, | |
| "learning_rate": 8.13347767111853e-06, | |
| "loss": 1.282, | |
| "num_input_tokens_seen": 185792944, | |
| "step": 321000, | |
| "train_runtime": 5181.946, | |
| "train_tokens_per_second": 35853.894 | |
| }, | |
| { | |
| "epoch": 16.772746243739565, | |
| "grad_norm": 2.9473636150360107, | |
| "learning_rate": 8.068264816360602e-06, | |
| "loss": 1.2839, | |
| "num_input_tokens_seen": 186086024, | |
| "step": 321500, | |
| "train_runtime": 5201.2933, | |
| "train_tokens_per_second": 35776.876 | |
| }, | |
| { | |
| "epoch": 16.79883138564274, | |
| "grad_norm": 2.2345118522644043, | |
| "learning_rate": 8.003051961602672e-06, | |
| "loss": 1.249, | |
| "num_input_tokens_seen": 186378104, | |
| "step": 322000, | |
| "train_runtime": 5221.502, | |
| "train_tokens_per_second": 35694.347 | |
| }, | |
| { | |
| "epoch": 16.82491652754591, | |
| "grad_norm": 2.1228227615356445, | |
| "learning_rate": 7.937839106844742e-06, | |
| "loss": 1.2856, | |
| "num_input_tokens_seen": 186672776, | |
| "step": 322500, | |
| "train_runtime": 5242.0802, | |
| "train_tokens_per_second": 35610.438 | |
| }, | |
| { | |
| "epoch": 16.851001669449083, | |
| "grad_norm": 3.548326253890991, | |
| "learning_rate": 7.872626252086811e-06, | |
| "loss": 1.2777, | |
| "num_input_tokens_seen": 186964952, | |
| "step": 323000, | |
| "train_runtime": 5262.7553, | |
| "train_tokens_per_second": 35526.058 | |
| }, | |
| { | |
| "epoch": 16.877086811352253, | |
| "grad_norm": 3.222048044204712, | |
| "learning_rate": 7.807413397328881e-06, | |
| "loss": 1.288, | |
| "num_input_tokens_seen": 187250864, | |
| "step": 323500, | |
| "train_runtime": 5283.1207, | |
| "train_tokens_per_second": 35443.23 | |
| }, | |
| { | |
| "epoch": 16.903171953255427, | |
| "grad_norm": 3.267969846725464, | |
| "learning_rate": 7.74220054257095e-06, | |
| "loss": 1.2746, | |
| "num_input_tokens_seen": 187543856, | |
| "step": 324000, | |
| "train_runtime": 5303.6214, | |
| "train_tokens_per_second": 35361.471 | |
| }, | |
| { | |
| "epoch": 16.929257095158597, | |
| "grad_norm": 2.1591436862945557, | |
| "learning_rate": 7.676987687813022e-06, | |
| "loss": 1.2524, | |
| "num_input_tokens_seen": 187833368, | |
| "step": 324500, | |
| "train_runtime": 5324.0933, | |
| "train_tokens_per_second": 35279.879 | |
| }, | |
| { | |
| "epoch": 16.95534223706177, | |
| "grad_norm": 5.07979154586792, | |
| "learning_rate": 7.611774833055092e-06, | |
| "loss": 1.2888, | |
| "num_input_tokens_seen": 188120672, | |
| "step": 325000, | |
| "train_runtime": 5344.7171, | |
| "train_tokens_per_second": 35197.499 | |
| }, | |
| { | |
| "epoch": 16.98142737896494, | |
| "grad_norm": 3.134291410446167, | |
| "learning_rate": 7.546561978297162e-06, | |
| "loss": 1.2575, | |
| "num_input_tokens_seen": 188407336, | |
| "step": 325500, | |
| "train_runtime": 5365.0524, | |
| "train_tokens_per_second": 35117.52 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 1.2976926565170288, | |
| "eval_runtime": 49.7121, | |
| "eval_samples_per_second": 771.141, | |
| "eval_steps_per_second": 96.395, | |
| "num_input_tokens_seen": 188612114, | |
| "step": 325856 | |
| }, | |
| { | |
| "epoch": 17.007512520868115, | |
| "grad_norm": 2.3629326820373535, | |
| "learning_rate": 7.481349123539233e-06, | |
| "loss": 1.2388, | |
| "num_input_tokens_seen": 188700266, | |
| "step": 326000, | |
| "train_runtime": 5436.6369, | |
| "train_tokens_per_second": 34709.007 | |
| }, | |
| { | |
| "epoch": 17.033597662771285, | |
| "grad_norm": 2.8408102989196777, | |
| "learning_rate": 7.416136268781303e-06, | |
| "loss": 1.2502, | |
| "num_input_tokens_seen": 188990786, | |
| "step": 326500, | |
| "train_runtime": 5458.7072, | |
| "train_tokens_per_second": 34621.894 | |
| }, | |
| { | |
| "epoch": 17.05968280467446, | |
| "grad_norm": 3.5564496517181396, | |
| "learning_rate": 7.350923414023372e-06, | |
| "loss": 1.2586, | |
| "num_input_tokens_seen": 189289114, | |
| "step": 327000, | |
| "train_runtime": 5481.0865, | |
| "train_tokens_per_second": 34534.962 | |
| }, | |
| { | |
| "epoch": 17.08576794657763, | |
| "grad_norm": 2.573309898376465, | |
| "learning_rate": 7.2857105592654434e-06, | |
| "loss": 1.255, | |
| "num_input_tokens_seen": 189582338, | |
| "step": 327500, | |
| "train_runtime": 5502.6097, | |
| "train_tokens_per_second": 34453.168 | |
| }, | |
| { | |
| "epoch": 17.1118530884808, | |
| "grad_norm": 2.900810718536377, | |
| "learning_rate": 7.220497704507513e-06, | |
| "loss": 1.2625, | |
| "num_input_tokens_seen": 189873506, | |
| "step": 328000, | |
| "train_runtime": 5523.9124, | |
| "train_tokens_per_second": 34373.012 | |
| }, | |
| { | |
| "epoch": 17.137938230383973, | |
| "grad_norm": 2.80328106880188, | |
| "learning_rate": 7.155284849749583e-06, | |
| "loss": 1.2621, | |
| "num_input_tokens_seen": 190163986, | |
| "step": 328500, | |
| "train_runtime": 5545.2522, | |
| "train_tokens_per_second": 34293.117 | |
| }, | |
| { | |
| "epoch": 17.164023372287144, | |
| "grad_norm": 2.8359973430633545, | |
| "learning_rate": 7.090071994991653e-06, | |
| "loss": 1.2276, | |
| "num_input_tokens_seen": 190454602, | |
| "step": 329000, | |
| "train_runtime": 5566.6958, | |
| "train_tokens_per_second": 34213.223 | |
| }, | |
| { | |
| "epoch": 17.190108514190317, | |
| "grad_norm": 2.6880123615264893, | |
| "learning_rate": 7.024859140233723e-06, | |
| "loss": 1.2414, | |
| "num_input_tokens_seen": 190749178, | |
| "step": 329500, | |
| "train_runtime": 5587.661, | |
| "train_tokens_per_second": 34137.572 | |
| }, | |
| { | |
| "epoch": 17.216193656093488, | |
| "grad_norm": 2.2190914154052734, | |
| "learning_rate": 6.959646285475793e-06, | |
| "loss": 1.2697, | |
| "num_input_tokens_seen": 191041514, | |
| "step": 330000, | |
| "train_runtime": 5608.8953, | |
| "train_tokens_per_second": 34060.453 | |
| }, | |
| { | |
| "epoch": 17.24227879799666, | |
| "grad_norm": 2.855161428451538, | |
| "learning_rate": 6.894433430717863e-06, | |
| "loss": 1.2656, | |
| "num_input_tokens_seen": 191333666, | |
| "step": 330500, | |
| "train_runtime": 5629.9424, | |
| "train_tokens_per_second": 33985.013 | |
| }, | |
| { | |
| "epoch": 17.26836393989983, | |
| "grad_norm": 2.8625779151916504, | |
| "learning_rate": 6.829220575959934e-06, | |
| "loss": 1.2595, | |
| "num_input_tokens_seen": 191622570, | |
| "step": 331000, | |
| "train_runtime": 5650.9098, | |
| "train_tokens_per_second": 33910.039 | |
| }, | |
| { | |
| "epoch": 17.294449081803005, | |
| "grad_norm": 2.630918502807617, | |
| "learning_rate": 6.764007721202003e-06, | |
| "loss": 1.2521, | |
| "num_input_tokens_seen": 191911738, | |
| "step": 331500, | |
| "train_runtime": 5671.7949, | |
| "train_tokens_per_second": 33836.156 | |
| }, | |
| { | |
| "epoch": 17.320534223706176, | |
| "grad_norm": 2.7609314918518066, | |
| "learning_rate": 6.698794866444073e-06, | |
| "loss": 1.2586, | |
| "num_input_tokens_seen": 192200466, | |
| "step": 332000, | |
| "train_runtime": 5692.6673, | |
| "train_tokens_per_second": 33762.814 | |
| }, | |
| { | |
| "epoch": 17.34661936560935, | |
| "grad_norm": 2.250659465789795, | |
| "learning_rate": 6.6335820116861445e-06, | |
| "loss": 1.2388, | |
| "num_input_tokens_seen": 192489178, | |
| "step": 332500, | |
| "train_runtime": 5713.6569, | |
| "train_tokens_per_second": 33689.313 | |
| }, | |
| { | |
| "epoch": 17.37270450751252, | |
| "grad_norm": 3.1896932125091553, | |
| "learning_rate": 6.568369156928214e-06, | |
| "loss": 1.2559, | |
| "num_input_tokens_seen": 192778922, | |
| "step": 333000, | |
| "train_runtime": 5734.7257, | |
| "train_tokens_per_second": 33616.067 | |
| }, | |
| { | |
| "epoch": 17.398789649415694, | |
| "grad_norm": 3.3856568336486816, | |
| "learning_rate": 6.503156302170284e-06, | |
| "loss": 1.267, | |
| "num_input_tokens_seen": 193066674, | |
| "step": 333500, | |
| "train_runtime": 5755.6678, | |
| "train_tokens_per_second": 33543.748 | |
| }, | |
| { | |
| "epoch": 17.424874791318864, | |
| "grad_norm": 2.031611919403076, | |
| "learning_rate": 6.437943447412355e-06, | |
| "loss": 1.2624, | |
| "num_input_tokens_seen": 193346250, | |
| "step": 334000, | |
| "train_runtime": 5776.7111, | |
| "train_tokens_per_second": 33469.953 | |
| }, | |
| { | |
| "epoch": 17.450959933222038, | |
| "grad_norm": 6.999661922454834, | |
| "learning_rate": 6.3727305926544244e-06, | |
| "loss": 1.25, | |
| "num_input_tokens_seen": 193636658, | |
| "step": 334500, | |
| "train_runtime": 5797.5543, | |
| "train_tokens_per_second": 33399.715 | |
| }, | |
| { | |
| "epoch": 17.477045075125208, | |
| "grad_norm": 3.335151433944702, | |
| "learning_rate": 6.307517737896494e-06, | |
| "loss": 1.2646, | |
| "num_input_tokens_seen": 193927418, | |
| "step": 335000, | |
| "train_runtime": 5818.5663, | |
| "train_tokens_per_second": 33329.072 | |
| }, | |
| { | |
| "epoch": 17.50313021702838, | |
| "grad_norm": 3.0118470191955566, | |
| "learning_rate": 6.242304883138565e-06, | |
| "loss": 1.2595, | |
| "num_input_tokens_seen": 194220626, | |
| "step": 335500, | |
| "train_runtime": 5839.6078, | |
| "train_tokens_per_second": 33259.19 | |
| }, | |
| { | |
| "epoch": 17.529215358931552, | |
| "grad_norm": 2.819512128829956, | |
| "learning_rate": 6.177092028380635e-06, | |
| "loss": 1.2604, | |
| "num_input_tokens_seen": 194507882, | |
| "step": 336000, | |
| "train_runtime": 5860.9533, | |
| "train_tokens_per_second": 33187.072 | |
| }, | |
| { | |
| "epoch": 17.555300500834726, | |
| "grad_norm": 2.87508225440979, | |
| "learning_rate": 6.111879173622704e-06, | |
| "loss": 1.2855, | |
| "num_input_tokens_seen": 194796762, | |
| "step": 336500, | |
| "train_runtime": 5882.4096, | |
| "train_tokens_per_second": 33115.13 | |
| }, | |
| { | |
| "epoch": 17.581385642737896, | |
| "grad_norm": 2.2459728717803955, | |
| "learning_rate": 6.046666318864775e-06, | |
| "loss": 1.2522, | |
| "num_input_tokens_seen": 195084282, | |
| "step": 337000, | |
| "train_runtime": 5904.0114, | |
| "train_tokens_per_second": 33042.667 | |
| }, | |
| { | |
| "epoch": 17.60747078464107, | |
| "grad_norm": 2.935845375061035, | |
| "learning_rate": 5.981453464106846e-06, | |
| "loss": 1.2545, | |
| "num_input_tokens_seen": 195375162, | |
| "step": 337500, | |
| "train_runtime": 5925.5481, | |
| "train_tokens_per_second": 32971.661 | |
| }, | |
| { | |
| "epoch": 17.63355592654424, | |
| "grad_norm": 3.0520784854888916, | |
| "learning_rate": 5.916240609348915e-06, | |
| "loss": 1.2587, | |
| "num_input_tokens_seen": 195666498, | |
| "step": 338000, | |
| "train_runtime": 5946.9563, | |
| "train_tokens_per_second": 32901.957 | |
| }, | |
| { | |
| "epoch": 17.659641068447414, | |
| "grad_norm": 1.9762933254241943, | |
| "learning_rate": 5.851027754590985e-06, | |
| "loss": 1.2714, | |
| "num_input_tokens_seen": 195952418, | |
| "step": 338500, | |
| "train_runtime": 5968.684, | |
| "train_tokens_per_second": 32830.087 | |
| }, | |
| { | |
| "epoch": 17.685726210350584, | |
| "grad_norm": 3.0459036827087402, | |
| "learning_rate": 5.785814899833055e-06, | |
| "loss": 1.2819, | |
| "num_input_tokens_seen": 196243738, | |
| "step": 339000, | |
| "train_runtime": 5990.4534, | |
| "train_tokens_per_second": 32759.413 | |
| }, | |
| { | |
| "epoch": 17.711811352253758, | |
| "grad_norm": 2.7781834602355957, | |
| "learning_rate": 5.7206020450751255e-06, | |
| "loss": 1.253, | |
| "num_input_tokens_seen": 196532034, | |
| "step": 339500, | |
| "train_runtime": 6011.9799, | |
| "train_tokens_per_second": 32690.068 | |
| }, | |
| { | |
| "epoch": 17.737896494156928, | |
| "grad_norm": 3.383931875228882, | |
| "learning_rate": 5.655389190317196e-06, | |
| "loss": 1.2521, | |
| "num_input_tokens_seen": 196822202, | |
| "step": 340000, | |
| "train_runtime": 6033.5216, | |
| "train_tokens_per_second": 32621.446 | |
| }, | |
| { | |
| "epoch": 17.7639816360601, | |
| "grad_norm": 2.72835373878479, | |
| "learning_rate": 5.590176335559266e-06, | |
| "loss": 1.2494, | |
| "num_input_tokens_seen": 197110802, | |
| "step": 340500, | |
| "train_runtime": 6054.9604, | |
| "train_tokens_per_second": 32553.607 | |
| }, | |
| { | |
| "epoch": 17.790066777963272, | |
| "grad_norm": 2.868680000305176, | |
| "learning_rate": 5.524963480801336e-06, | |
| "loss": 1.2436, | |
| "num_input_tokens_seen": 197396914, | |
| "step": 341000, | |
| "train_runtime": 6076.2211, | |
| "train_tokens_per_second": 32486.789 | |
| }, | |
| { | |
| "epoch": 17.816151919866446, | |
| "grad_norm": 2.985006809234619, | |
| "learning_rate": 5.459750626043405e-06, | |
| "loss": 1.269, | |
| "num_input_tokens_seen": 197687178, | |
| "step": 341500, | |
| "train_runtime": 6097.2778, | |
| "train_tokens_per_second": 32422.203 | |
| }, | |
| { | |
| "epoch": 17.842237061769616, | |
| "grad_norm": 2.457155704498291, | |
| "learning_rate": 5.394537771285476e-06, | |
| "loss": 1.2725, | |
| "num_input_tokens_seen": 197978106, | |
| "step": 342000, | |
| "train_runtime": 6118.3065, | |
| "train_tokens_per_second": 32358.318 | |
| }, | |
| { | |
| "epoch": 17.86832220367279, | |
| "grad_norm": 2.6323978900909424, | |
| "learning_rate": 5.329324916527547e-06, | |
| "loss": 1.2691, | |
| "num_input_tokens_seen": 198267826, | |
| "step": 342500, | |
| "train_runtime": 6138.9907, | |
| "train_tokens_per_second": 32296.486 | |
| }, | |
| { | |
| "epoch": 17.89440734557596, | |
| "grad_norm": 2.9683570861816406, | |
| "learning_rate": 5.264112061769616e-06, | |
| "loss": 1.2606, | |
| "num_input_tokens_seen": 198555794, | |
| "step": 343000, | |
| "train_runtime": 6159.8347, | |
| "train_tokens_per_second": 32233.948 | |
| }, | |
| { | |
| "epoch": 17.92049248747913, | |
| "grad_norm": 2.6426734924316406, | |
| "learning_rate": 5.198899207011686e-06, | |
| "loss": 1.2572, | |
| "num_input_tokens_seen": 198837802, | |
| "step": 343500, | |
| "train_runtime": 6180.7096, | |
| "train_tokens_per_second": 32170.708 | |
| }, | |
| { | |
| "epoch": 17.946577629382304, | |
| "grad_norm": 2.743959426879883, | |
| "learning_rate": 5.133686352253757e-06, | |
| "loss": 1.2584, | |
| "num_input_tokens_seen": 199125674, | |
| "step": 344000, | |
| "train_runtime": 6201.5223, | |
| "train_tokens_per_second": 32109.16 | |
| }, | |
| { | |
| "epoch": 17.972662771285474, | |
| "grad_norm": 2.5115082263946533, | |
| "learning_rate": 5.0684734974958266e-06, | |
| "loss": 1.2496, | |
| "num_input_tokens_seen": 199418034, | |
| "step": 344500, | |
| "train_runtime": 6222.8136, | |
| "train_tokens_per_second": 32046.281 | |
| }, | |
| { | |
| "epoch": 17.998747913188648, | |
| "grad_norm": 2.3742177486419678, | |
| "learning_rate": 5.003260642737897e-06, | |
| "loss": 1.2601, | |
| "num_input_tokens_seen": 199702842, | |
| "step": 345000, | |
| "train_runtime": 6244.2476, | |
| "train_tokens_per_second": 31981.89 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 1.296248197555542, | |
| "eval_runtime": 50.1469, | |
| "eval_samples_per_second": 764.454, | |
| "eval_steps_per_second": 95.559, | |
| "num_input_tokens_seen": 199715854, | |
| "step": 345024 | |
| }, | |
| { | |
| "epoch": 18.02483305509182, | |
| "grad_norm": 2.2281575202941895, | |
| "learning_rate": 4.938047787979966e-06, | |
| "loss": 1.2228, | |
| "num_input_tokens_seen": 199990102, | |
| "step": 345500, | |
| "train_runtime": 6316.4242, | |
| "train_tokens_per_second": 31661.917 | |
| }, | |
| { | |
| "epoch": 18.050918196994992, | |
| "grad_norm": 2.840803384780884, | |
| "learning_rate": 4.872834933222037e-06, | |
| "loss": 1.2581, | |
| "num_input_tokens_seen": 200279302, | |
| "step": 346000, | |
| "train_runtime": 6337.2304, | |
| "train_tokens_per_second": 31603.601 | |
| }, | |
| { | |
| "epoch": 18.077003338898162, | |
| "grad_norm": 2.4082562923431396, | |
| "learning_rate": 4.807622078464107e-06, | |
| "loss": 1.2566, | |
| "num_input_tokens_seen": 200566038, | |
| "step": 346500, | |
| "train_runtime": 6358.142, | |
| "train_tokens_per_second": 31544.756 | |
| }, | |
| { | |
| "epoch": 18.103088480801336, | |
| "grad_norm": 3.136262893676758, | |
| "learning_rate": 4.742409223706177e-06, | |
| "loss": 1.2631, | |
| "num_input_tokens_seen": 200854406, | |
| "step": 347000, | |
| "train_runtime": 6379.2543, | |
| "train_tokens_per_second": 31485.562 | |
| }, | |
| { | |
| "epoch": 18.129173622704506, | |
| "grad_norm": 2.251553535461426, | |
| "learning_rate": 4.677196368948248e-06, | |
| "loss": 1.2434, | |
| "num_input_tokens_seen": 201141734, | |
| "step": 347500, | |
| "train_runtime": 6400.5038, | |
| "train_tokens_per_second": 31425.922 | |
| }, | |
| { | |
| "epoch": 18.15525876460768, | |
| "grad_norm": 2.587162971496582, | |
| "learning_rate": 4.6119835141903175e-06, | |
| "loss": 1.2481, | |
| "num_input_tokens_seen": 201429926, | |
| "step": 348000, | |
| "train_runtime": 6421.5455, | |
| "train_tokens_per_second": 31367.827 | |
| }, | |
| { | |
| "epoch": 18.18134390651085, | |
| "grad_norm": 2.8229830265045166, | |
| "learning_rate": 4.546770659432387e-06, | |
| "loss": 1.2536, | |
| "num_input_tokens_seen": 201720902, | |
| "step": 348500, | |
| "train_runtime": 6442.673, | |
| "train_tokens_per_second": 31310.126 | |
| }, | |
| { | |
| "epoch": 18.207429048414024, | |
| "grad_norm": 2.943593740463257, | |
| "learning_rate": 4.481557804674458e-06, | |
| "loss": 1.2687, | |
| "num_input_tokens_seen": 202015494, | |
| "step": 349000, | |
| "train_runtime": 6463.8115, | |
| "train_tokens_per_second": 31253.308 | |
| }, | |
| { | |
| "epoch": 18.233514190317194, | |
| "grad_norm": 2.8468620777130127, | |
| "learning_rate": 4.416344949916528e-06, | |
| "loss": 1.2475, | |
| "num_input_tokens_seen": 202301734, | |
| "step": 349500, | |
| "train_runtime": 6484.7729, | |
| "train_tokens_per_second": 31196.426 | |
| }, | |
| { | |
| "epoch": 18.25959933222037, | |
| "grad_norm": 2.5584495067596436, | |
| "learning_rate": 4.351132095158597e-06, | |
| "loss": 1.2464, | |
| "num_input_tokens_seen": 202582798, | |
| "step": 350000, | |
| "train_runtime": 6505.8233, | |
| "train_tokens_per_second": 31138.688 | |
| }, | |
| { | |
| "epoch": 18.28568447412354, | |
| "grad_norm": 3.42409348487854, | |
| "learning_rate": 4.285919240400668e-06, | |
| "loss": 1.2696, | |
| "num_input_tokens_seen": 202872662, | |
| "step": 350500, | |
| "train_runtime": 6526.8475, | |
| "train_tokens_per_second": 31082.795 | |
| }, | |
| { | |
| "epoch": 18.311769616026712, | |
| "grad_norm": 2.7311031818389893, | |
| "learning_rate": 4.220706385642738e-06, | |
| "loss": 1.249, | |
| "num_input_tokens_seen": 203159246, | |
| "step": 351000, | |
| "train_runtime": 6547.7257, | |
| "train_tokens_per_second": 31027.452 | |
| }, | |
| { | |
| "epoch": 18.337854757929883, | |
| "grad_norm": 3.2200024127960205, | |
| "learning_rate": 4.155493530884808e-06, | |
| "loss": 1.2766, | |
| "num_input_tokens_seen": 203449598, | |
| "step": 351500, | |
| "train_runtime": 6568.6802, | |
| "train_tokens_per_second": 30972.675 | |
| }, | |
| { | |
| "epoch": 18.363939899833056, | |
| "grad_norm": 3.4853382110595703, | |
| "learning_rate": 4.090280676126879e-06, | |
| "loss": 1.2478, | |
| "num_input_tokens_seen": 203737350, | |
| "step": 352000, | |
| "train_runtime": 6589.7847, | |
| "train_tokens_per_second": 30917.148 | |
| }, | |
| { | |
| "epoch": 18.390025041736227, | |
| "grad_norm": 2.6248600482940674, | |
| "learning_rate": 4.025067821368948e-06, | |
| "loss": 1.2461, | |
| "num_input_tokens_seen": 204033470, | |
| "step": 352500, | |
| "train_runtime": 6610.8585, | |
| "train_tokens_per_second": 30863.385 | |
| }, | |
| { | |
| "epoch": 18.4161101836394, | |
| "grad_norm": 3.1528148651123047, | |
| "learning_rate": 3.9598549666110185e-06, | |
| "loss": 1.2487, | |
| "num_input_tokens_seen": 204320822, | |
| "step": 353000, | |
| "train_runtime": 6632.008, | |
| "train_tokens_per_second": 30808.289 | |
| }, | |
| { | |
| "epoch": 18.44219532554257, | |
| "grad_norm": 2.4708855152130127, | |
| "learning_rate": 3.894642111853088e-06, | |
| "loss": 1.2493, | |
| "num_input_tokens_seen": 204615126, | |
| "step": 353500, | |
| "train_runtime": 6653.2853, | |
| "train_tokens_per_second": 30753.998 | |
| }, | |
| { | |
| "epoch": 18.468280467445744, | |
| "grad_norm": 2.8539340496063232, | |
| "learning_rate": 3.829429257095159e-06, | |
| "loss": 1.2469, | |
| "num_input_tokens_seen": 204908238, | |
| "step": 354000, | |
| "train_runtime": 6674.3465, | |
| "train_tokens_per_second": 30700.869 | |
| }, | |
| { | |
| "epoch": 18.494365609348915, | |
| "grad_norm": 3.047869920730591, | |
| "learning_rate": 3.764216402337229e-06, | |
| "loss": 1.2571, | |
| "num_input_tokens_seen": 205200078, | |
| "step": 354500, | |
| "train_runtime": 6695.2164, | |
| "train_tokens_per_second": 30648.759 | |
| }, | |
| { | |
| "epoch": 18.52045075125209, | |
| "grad_norm": 3.70831298828125, | |
| "learning_rate": 3.699003547579299e-06, | |
| "loss": 1.2544, | |
| "num_input_tokens_seen": 205493198, | |
| "step": 355000, | |
| "train_runtime": 6716.2607, | |
| "train_tokens_per_second": 30596.37 | |
| }, | |
| { | |
| "epoch": 18.54653589315526, | |
| "grad_norm": 2.9419515132904053, | |
| "learning_rate": 3.633790692821369e-06, | |
| "loss": 1.2406, | |
| "num_input_tokens_seen": 205782654, | |
| "step": 355500, | |
| "train_runtime": 6737.4279, | |
| "train_tokens_per_second": 30543.207 | |
| }, | |
| { | |
| "epoch": 18.572621035058432, | |
| "grad_norm": 3.3979151248931885, | |
| "learning_rate": 3.5685778380634397e-06, | |
| "loss": 1.2387, | |
| "num_input_tokens_seen": 206078310, | |
| "step": 356000, | |
| "train_runtime": 6758.4267, | |
| "train_tokens_per_second": 30492.054 | |
| }, | |
| { | |
| "epoch": 18.598706176961603, | |
| "grad_norm": 2.5537753105163574, | |
| "learning_rate": 3.503364983305509e-06, | |
| "loss": 1.2454, | |
| "num_input_tokens_seen": 206364678, | |
| "step": 356500, | |
| "train_runtime": 6779.3225, | |
| "train_tokens_per_second": 30440.31 | |
| }, | |
| { | |
| "epoch": 18.624791318864773, | |
| "grad_norm": 3.0519020557403564, | |
| "learning_rate": 3.4381521285475796e-06, | |
| "loss": 1.2617, | |
| "num_input_tokens_seen": 206651694, | |
| "step": 357000, | |
| "train_runtime": 6800.2161, | |
| "train_tokens_per_second": 30388.989 | |
| }, | |
| { | |
| "epoch": 18.650876460767947, | |
| "grad_norm": 2.832632541656494, | |
| "learning_rate": 3.3729392737896494e-06, | |
| "loss": 1.2594, | |
| "num_input_tokens_seen": 206935862, | |
| "step": 357500, | |
| "train_runtime": 6821.3364, | |
| "train_tokens_per_second": 30336.557 | |
| }, | |
| { | |
| "epoch": 18.676961602671117, | |
| "grad_norm": 3.5510575771331787, | |
| "learning_rate": 3.3077264190317196e-06, | |
| "loss": 1.2576, | |
| "num_input_tokens_seen": 207225006, | |
| "step": 358000, | |
| "train_runtime": 6842.5612, | |
| "train_tokens_per_second": 30284.713 | |
| }, | |
| { | |
| "epoch": 18.70304674457429, | |
| "grad_norm": 2.7018370628356934, | |
| "learning_rate": 3.24251356427379e-06, | |
| "loss": 1.2524, | |
| "num_input_tokens_seen": 207518494, | |
| "step": 358500, | |
| "train_runtime": 6863.6965, | |
| "train_tokens_per_second": 30234.218 | |
| }, | |
| { | |
| "epoch": 18.72913188647746, | |
| "grad_norm": 2.3896238803863525, | |
| "learning_rate": 3.1773007095158596e-06, | |
| "loss": 1.2787, | |
| "num_input_tokens_seen": 207806854, | |
| "step": 359000, | |
| "train_runtime": 6884.7293, | |
| "train_tokens_per_second": 30183.736 | |
| }, | |
| { | |
| "epoch": 18.755217028380635, | |
| "grad_norm": 2.3457329273223877, | |
| "learning_rate": 3.11208785475793e-06, | |
| "loss": 1.2612, | |
| "num_input_tokens_seen": 208104358, | |
| "step": 359500, | |
| "train_runtime": 6906.0499, | |
| "train_tokens_per_second": 30133.631 | |
| }, | |
| { | |
| "epoch": 18.781302170283805, | |
| "grad_norm": 3.7799017429351807, | |
| "learning_rate": 3.046875e-06, | |
| "loss": 1.2278, | |
| "num_input_tokens_seen": 208395230, | |
| "step": 360000, | |
| "train_runtime": 6927.2417, | |
| "train_tokens_per_second": 30083.436 | |
| }, | |
| { | |
| "epoch": 18.80738731218698, | |
| "grad_norm": 2.9162731170654297, | |
| "learning_rate": 2.98166214524207e-06, | |
| "loss": 1.2495, | |
| "num_input_tokens_seen": 208684190, | |
| "step": 360500, | |
| "train_runtime": 6948.3051, | |
| "train_tokens_per_second": 30033.826 | |
| }, | |
| { | |
| "epoch": 18.83347245409015, | |
| "grad_norm": 3.2956576347351074, | |
| "learning_rate": 2.9164492904841403e-06, | |
| "loss": 1.2556, | |
| "num_input_tokens_seen": 208972206, | |
| "step": 361000, | |
| "train_runtime": 6969.2518, | |
| "train_tokens_per_second": 29984.884 | |
| }, | |
| { | |
| "epoch": 18.859557595993323, | |
| "grad_norm": 2.974874496459961, | |
| "learning_rate": 2.8512364357262105e-06, | |
| "loss": 1.2433, | |
| "num_input_tokens_seen": 209260382, | |
| "step": 361500, | |
| "train_runtime": 6990.1944, | |
| "train_tokens_per_second": 29936.275 | |
| }, | |
| { | |
| "epoch": 18.885642737896493, | |
| "grad_norm": 2.385434150695801, | |
| "learning_rate": 2.7860235809682807e-06, | |
| "loss": 1.2529, | |
| "num_input_tokens_seen": 209544430, | |
| "step": 362000, | |
| "train_runtime": 7011.1006, | |
| "train_tokens_per_second": 29887.523 | |
| }, | |
| { | |
| "epoch": 18.911727879799667, | |
| "grad_norm": 2.289966344833374, | |
| "learning_rate": 2.7208107262103505e-06, | |
| "loss": 1.262, | |
| "num_input_tokens_seen": 209834774, | |
| "step": 362500, | |
| "train_runtime": 7032.3451, | |
| "train_tokens_per_second": 29838.521 | |
| }, | |
| { | |
| "epoch": 18.937813021702837, | |
| "grad_norm": 2.8906939029693604, | |
| "learning_rate": 2.655597871452421e-06, | |
| "loss": 1.2716, | |
| "num_input_tokens_seen": 210123054, | |
| "step": 363000, | |
| "train_runtime": 7053.702, | |
| "train_tokens_per_second": 29789.046 | |
| }, | |
| { | |
| "epoch": 18.96389816360601, | |
| "grad_norm": 3.4153401851654053, | |
| "learning_rate": 2.590385016694491e-06, | |
| "loss": 1.2774, | |
| "num_input_tokens_seen": 210412382, | |
| "step": 363500, | |
| "train_runtime": 7075.1001, | |
| "train_tokens_per_second": 29739.845 | |
| }, | |
| { | |
| "epoch": 18.98998330550918, | |
| "grad_norm": 3.0862789154052734, | |
| "learning_rate": 2.525172161936561e-06, | |
| "loss": 1.2665, | |
| "num_input_tokens_seen": 210705166, | |
| "step": 364000, | |
| "train_runtime": 7096.4466, | |
| "train_tokens_per_second": 29691.644 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 1.296281337738037, | |
| "eval_runtime": 51.4225, | |
| "eval_samples_per_second": 745.49, | |
| "eval_steps_per_second": 93.189, | |
| "num_input_tokens_seen": 210813428, | |
| "step": 364192 | |
| }, | |
| { | |
| "epoch": 19.016068447412355, | |
| "grad_norm": 2.282921314239502, | |
| "learning_rate": 2.459959307178631e-06, | |
| "loss": 1.2246, | |
| "num_input_tokens_seen": 210992604, | |
| "step": 364500, | |
| "train_runtime": 7170.5972, | |
| "train_tokens_per_second": 29424.69 | |
| }, | |
| { | |
| "epoch": 19.042153589315525, | |
| "grad_norm": 2.1377789974212646, | |
| "learning_rate": 2.3947464524207014e-06, | |
| "loss": 1.2377, | |
| "num_input_tokens_seen": 211280204, | |
| "step": 365000, | |
| "train_runtime": 7192.2041, | |
| "train_tokens_per_second": 29376.28 | |
| }, | |
| { | |
| "epoch": 19.0682387312187, | |
| "grad_norm": 3.454662799835205, | |
| "learning_rate": 2.3295335976627716e-06, | |
| "loss": 1.2658, | |
| "num_input_tokens_seen": 211569500, | |
| "step": 365500, | |
| "train_runtime": 7213.663, | |
| "train_tokens_per_second": 29328.997 | |
| }, | |
| { | |
| "epoch": 19.09432387312187, | |
| "grad_norm": 2.45365309715271, | |
| "learning_rate": 2.2643207429048414e-06, | |
| "loss": 1.2296, | |
| "num_input_tokens_seen": 211851156, | |
| "step": 366000, | |
| "train_runtime": 7235.0182, | |
| "train_tokens_per_second": 29281.358 | |
| }, | |
| { | |
| "epoch": 19.120409015025043, | |
| "grad_norm": 2.841344118118286, | |
| "learning_rate": 2.1991078881469116e-06, | |
| "loss": 1.2817, | |
| "num_input_tokens_seen": 212137660, | |
| "step": 366500, | |
| "train_runtime": 7256.1778, | |
| "train_tokens_per_second": 29235.455 | |
| }, | |
| { | |
| "epoch": 19.146494156928213, | |
| "grad_norm": 2.386323928833008, | |
| "learning_rate": 2.1338950333889818e-06, | |
| "loss": 1.2336, | |
| "num_input_tokens_seen": 212425948, | |
| "step": 367000, | |
| "train_runtime": 7277.2221, | |
| "train_tokens_per_second": 29190.527 | |
| }, | |
| { | |
| "epoch": 19.172579298831387, | |
| "grad_norm": 3.1663670539855957, | |
| "learning_rate": 2.068682178631052e-06, | |
| "loss": 1.2755, | |
| "num_input_tokens_seen": 212713028, | |
| "step": 367500, | |
| "train_runtime": 7298.2567, | |
| "train_tokens_per_second": 29145.731 | |
| }, | |
| { | |
| "epoch": 19.198664440734557, | |
| "grad_norm": 2.1720612049102783, | |
| "learning_rate": 2.0034693238731217e-06, | |
| "loss": 1.2636, | |
| "num_input_tokens_seen": 213002716, | |
| "step": 368000, | |
| "train_runtime": 7318.2656, | |
| "train_tokens_per_second": 29105.628 | |
| }, | |
| { | |
| "epoch": 19.22474958263773, | |
| "grad_norm": 2.9212682247161865, | |
| "learning_rate": 1.938256469115192e-06, | |
| "loss": 1.2423, | |
| "num_input_tokens_seen": 213288196, | |
| "step": 368500, | |
| "train_runtime": 7337.9518, | |
| "train_tokens_per_second": 29066.448 | |
| }, | |
| { | |
| "epoch": 19.2508347245409, | |
| "grad_norm": 2.7475364208221436, | |
| "learning_rate": 1.8730436143572623e-06, | |
| "loss": 1.2443, | |
| "num_input_tokens_seen": 213574692, | |
| "step": 369000, | |
| "train_runtime": 7356.9693, | |
| "train_tokens_per_second": 29030.255 | |
| }, | |
| { | |
| "epoch": 19.276919866444075, | |
| "grad_norm": 2.422600030899048, | |
| "learning_rate": 1.8078307595993323e-06, | |
| "loss": 1.2201, | |
| "num_input_tokens_seen": 213864116, | |
| "step": 369500, | |
| "train_runtime": 7375.605, | |
| "train_tokens_per_second": 28996.146 | |
| }, | |
| { | |
| "epoch": 19.303005008347245, | |
| "grad_norm": 2.7195160388946533, | |
| "learning_rate": 1.7426179048414023e-06, | |
| "loss": 1.2481, | |
| "num_input_tokens_seen": 214150676, | |
| "step": 370000, | |
| "train_runtime": 7396.4144, | |
| "train_tokens_per_second": 28953.31 | |
| }, | |
| { | |
| "epoch": 19.32909015025042, | |
| "grad_norm": 2.50443172454834, | |
| "learning_rate": 1.6774050500834725e-06, | |
| "loss": 1.2302, | |
| "num_input_tokens_seen": 214440244, | |
| "step": 370500, | |
| "train_runtime": 7416.7831, | |
| "train_tokens_per_second": 28912.837 | |
| }, | |
| { | |
| "epoch": 19.35517529215359, | |
| "grad_norm": 2.887474775314331, | |
| "learning_rate": 1.6121921953255427e-06, | |
| "loss": 1.2449, | |
| "num_input_tokens_seen": 214730132, | |
| "step": 371000, | |
| "train_runtime": 7437.2553, | |
| "train_tokens_per_second": 28872.228 | |
| }, | |
| { | |
| "epoch": 19.38126043405676, | |
| "grad_norm": 2.5884950160980225, | |
| "learning_rate": 1.5469793405676129e-06, | |
| "loss": 1.2521, | |
| "num_input_tokens_seen": 215019220, | |
| "step": 371500, | |
| "train_runtime": 7457.6502, | |
| "train_tokens_per_second": 28832.033 | |
| }, | |
| { | |
| "epoch": 19.407345575959933, | |
| "grad_norm": 2.357685089111328, | |
| "learning_rate": 1.4817664858096828e-06, | |
| "loss": 1.2443, | |
| "num_input_tokens_seen": 215310132, | |
| "step": 372000, | |
| "train_runtime": 7478.2575, | |
| "train_tokens_per_second": 28791.484 | |
| }, | |
| { | |
| "epoch": 19.433430717863104, | |
| "grad_norm": 2.3335018157958984, | |
| "learning_rate": 1.416553631051753e-06, | |
| "loss": 1.2469, | |
| "num_input_tokens_seen": 215600084, | |
| "step": 372500, | |
| "train_runtime": 7498.6623, | |
| "train_tokens_per_second": 28751.806 | |
| }, | |
| { | |
| "epoch": 19.459515859766277, | |
| "grad_norm": 2.7641124725341797, | |
| "learning_rate": 1.351340776293823e-06, | |
| "loss": 1.228, | |
| "num_input_tokens_seen": 215888340, | |
| "step": 373000, | |
| "train_runtime": 7519.0798, | |
| "train_tokens_per_second": 28712.069 | |
| }, | |
| { | |
| "epoch": 19.485601001669448, | |
| "grad_norm": 2.7597529888153076, | |
| "learning_rate": 1.2861279215358932e-06, | |
| "loss": 1.2499, | |
| "num_input_tokens_seen": 216178932, | |
| "step": 373500, | |
| "train_runtime": 7539.4463, | |
| "train_tokens_per_second": 28673.051 | |
| }, | |
| { | |
| "epoch": 19.51168614357262, | |
| "grad_norm": 2.3733975887298584, | |
| "learning_rate": 1.2209150667779632e-06, | |
| "loss": 1.2484, | |
| "num_input_tokens_seen": 216470580, | |
| "step": 374000, | |
| "train_runtime": 7559.9641, | |
| "train_tokens_per_second": 28633.811 | |
| }, | |
| { | |
| "epoch": 19.53777128547579, | |
| "grad_norm": 2.3238165378570557, | |
| "learning_rate": 1.1557022120200334e-06, | |
| "loss": 1.2364, | |
| "num_input_tokens_seen": 216763740, | |
| "step": 374500, | |
| "train_runtime": 7579.7748, | |
| "train_tokens_per_second": 28597.649 | |
| }, | |
| { | |
| "epoch": 19.563856427378965, | |
| "grad_norm": 2.8229446411132812, | |
| "learning_rate": 1.0904893572621036e-06, | |
| "loss": 1.2358, | |
| "num_input_tokens_seen": 217053292, | |
| "step": 375000, | |
| "train_runtime": 7598.7817, | |
| "train_tokens_per_second": 28564.223 | |
| }, | |
| { | |
| "epoch": 19.589941569282136, | |
| "grad_norm": 2.4836158752441406, | |
| "learning_rate": 1.0252765025041738e-06, | |
| "loss": 1.2606, | |
| "num_input_tokens_seen": 217344428, | |
| "step": 375500, | |
| "train_runtime": 7618.304, | |
| "train_tokens_per_second": 28529.241 | |
| }, | |
| { | |
| "epoch": 19.61602671118531, | |
| "grad_norm": 2.7675931453704834, | |
| "learning_rate": 9.600636477462437e-07, | |
| "loss": 1.2629, | |
| "num_input_tokens_seen": 217634524, | |
| "step": 376000, | |
| "train_runtime": 7637.288, | |
| "train_tokens_per_second": 28496.31 | |
| }, | |
| { | |
| "epoch": 19.64211185308848, | |
| "grad_norm": 2.331380844116211, | |
| "learning_rate": 8.948507929883139e-07, | |
| "loss": 1.2521, | |
| "num_input_tokens_seen": 217924508, | |
| "step": 376500, | |
| "train_runtime": 7656.4955, | |
| "train_tokens_per_second": 28462.697 | |
| }, | |
| { | |
| "epoch": 19.668196994991654, | |
| "grad_norm": 3.3577489852905273, | |
| "learning_rate": 8.29637938230384e-07, | |
| "loss": 1.2571, | |
| "num_input_tokens_seen": 218217084, | |
| "step": 377000, | |
| "train_runtime": 7675.4197, | |
| "train_tokens_per_second": 28430.639 | |
| }, | |
| { | |
| "epoch": 19.694282136894824, | |
| "grad_norm": 2.872344970703125, | |
| "learning_rate": 7.644250834724542e-07, | |
| "loss": 1.271, | |
| "num_input_tokens_seen": 218508180, | |
| "step": 377500, | |
| "train_runtime": 7694.4779, | |
| "train_tokens_per_second": 28398.052 | |
| }, | |
| { | |
| "epoch": 19.720367278797998, | |
| "grad_norm": 2.9395909309387207, | |
| "learning_rate": 6.992122287145243e-07, | |
| "loss": 1.25, | |
| "num_input_tokens_seen": 218798076, | |
| "step": 378000, | |
| "train_runtime": 7712.7627, | |
| "train_tokens_per_second": 28368.314 | |
| }, | |
| { | |
| "epoch": 19.746452420701168, | |
| "grad_norm": 2.5424513816833496, | |
| "learning_rate": 6.339993739565944e-07, | |
| "loss": 1.2817, | |
| "num_input_tokens_seen": 219089308, | |
| "step": 378500, | |
| "train_runtime": 7731.9549, | |
| "train_tokens_per_second": 28335.565 | |
| }, | |
| { | |
| "epoch": 19.77253756260434, | |
| "grad_norm": 2.9725682735443115, | |
| "learning_rate": 5.687865191986645e-07, | |
| "loss": 1.2418, | |
| "num_input_tokens_seen": 219383604, | |
| "step": 379000, | |
| "train_runtime": 7751.6984, | |
| "train_tokens_per_second": 28301.36 | |
| }, | |
| { | |
| "epoch": 19.798622704507512, | |
| "grad_norm": 3.3688950538635254, | |
| "learning_rate": 5.035736644407346e-07, | |
| "loss": 1.2449, | |
| "num_input_tokens_seen": 219679124, | |
| "step": 379500, | |
| "train_runtime": 7771.8118, | |
| "train_tokens_per_second": 28266.14 | |
| }, | |
| { | |
| "epoch": 19.824707846410686, | |
| "grad_norm": 2.398789882659912, | |
| "learning_rate": 4.3836080968280473e-07, | |
| "loss": 1.2362, | |
| "num_input_tokens_seen": 219963660, | |
| "step": 380000, | |
| "train_runtime": 7790.6642, | |
| "train_tokens_per_second": 28234.263 | |
| }, | |
| { | |
| "epoch": 19.850792988313856, | |
| "grad_norm": 2.845128059387207, | |
| "learning_rate": 3.731479549248748e-07, | |
| "loss": 1.2803, | |
| "num_input_tokens_seen": 220255900, | |
| "step": 380500, | |
| "train_runtime": 7809.3731, | |
| "train_tokens_per_second": 28204.044 | |
| }, | |
| { | |
| "epoch": 19.87687813021703, | |
| "grad_norm": 2.6180248260498047, | |
| "learning_rate": 3.079351001669449e-07, | |
| "loss": 1.2634, | |
| "num_input_tokens_seen": 220547100, | |
| "step": 381000, | |
| "train_runtime": 7827.8518, | |
| "train_tokens_per_second": 28174.665 | |
| }, | |
| { | |
| "epoch": 19.9029632721202, | |
| "grad_norm": 2.5833303928375244, | |
| "learning_rate": 2.4272224540901504e-07, | |
| "loss": 1.2482, | |
| "num_input_tokens_seen": 220835100, | |
| "step": 381500, | |
| "train_runtime": 7847.7813, | |
| "train_tokens_per_second": 28139.813 | |
| }, | |
| { | |
| "epoch": 19.929048414023374, | |
| "grad_norm": 2.800402879714966, | |
| "learning_rate": 1.7750939065108515e-07, | |
| "loss": 1.2335, | |
| "num_input_tokens_seen": 221122004, | |
| "step": 382000, | |
| "train_runtime": 7866.365, | |
| "train_tokens_per_second": 28109.807 | |
| }, | |
| { | |
| "epoch": 19.955133555926544, | |
| "grad_norm": 2.8612380027770996, | |
| "learning_rate": 1.1229653589315525e-07, | |
| "loss": 1.2251, | |
| "num_input_tokens_seen": 221409964, | |
| "step": 382500, | |
| "train_runtime": 7884.8107, | |
| "train_tokens_per_second": 28080.568 | |
| }, | |
| { | |
| "epoch": 19.981218697829718, | |
| "grad_norm": 3.3842055797576904, | |
| "learning_rate": 4.7083681135225376e-08, | |
| "loss": 1.2888, | |
| "num_input_tokens_seen": 221700476, | |
| "step": 383000, | |
| "train_runtime": 7903.5539, | |
| "train_tokens_per_second": 28050.732 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 1.2961275577545166, | |
| "eval_runtime": 46.2863, | |
| "eval_samples_per_second": 828.215, | |
| "eval_steps_per_second": 103.53, | |
| "num_input_tokens_seen": 221910640, | |
| "step": 383360 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "num_input_tokens_seen": 221910640, | |
| "step": 383360, | |
| "total_flos": 8.056851732185088e+16, | |
| "train_loss": 0.641815161904031, | |
| "train_runtime": 7964.4512, | |
| "train_samples_per_second": 385.056, | |
| "train_steps_per_second": 48.134, | |
| "train_tokens_per_second": 27853.103 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 383360, | |
| "num_input_tokens_seen": 221910640, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.056851732185088e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
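
The fields above (`log_history`, `global_step`, `stateful_callbacks.TrainerControl`, `total_flos`) match the `trainer_state.json` that Hugging Face's `Trainer` saves alongside each checkpoint. Below is a minimal sketch of reading this state back out, assuming the object is saved under its usual name `trainer_state.json` (the path is an assumption; adjust it to your checkpoint directory). It separates the per-500-step training logs from the evaluation and summary entries by keying on which fields each entry carries:

```python
# Minimal sketch: load a Trainer state file and summarize its log history.
# Assumes the JSON object above is stored as "trainer_state.json" (path is
# an assumption; Trainer writes this file into each checkpoint directory).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Entries with a "loss" key are the periodic training logs (every
# `logging_steps` = 500 steps here); the entry with "eval_loss" is the
# end-of-training evaluation; the final entry is the run summary.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged training steps: {len(train_logs)}")
print(f"final global step:     {state['global_step']}")
print(f"last train loss:       {train_logs[-1]['loss']}")
if eval_logs:
    print(f"final eval loss:       {eval_logs[-1]['eval_loss']}")
```

With this file, the script would report 766 logged training steps, a final global step of 383360, a last logged train loss of 1.2888, and a final eval loss of about 1.2961. The same `train_logs` list can feed a loss-versus-step plot, since each entry carries both `step` and `loss`.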