{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9998179584034954, "eval_steps": 500, "global_step": 21972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045510399126200335, "grad_norm": 2.6165335178375244, "learning_rate": 9.887118798361402e-05, "loss": 0.8444, "step": 250 }, { "epoch": 0.09102079825240067, "grad_norm": 2.7013211250305176, "learning_rate": 9.773327264451525e-05, "loss": 0.5413, "step": 500 }, { "epoch": 0.136531197378601, "grad_norm": 2.3657588958740234, "learning_rate": 9.659535730541647e-05, "loss": 0.4992, "step": 750 }, { "epoch": 0.18204159650480134, "grad_norm": 2.571484088897705, "learning_rate": 9.54574419663177e-05, "loss": 0.4771, "step": 1000 }, { "epoch": 0.22755199563100167, "grad_norm": 2.439297676086426, "learning_rate": 9.431952662721894e-05, "loss": 0.4604, "step": 1250 }, { "epoch": 0.273062394757202, "grad_norm": 2.3595223426818848, "learning_rate": 9.318161128812017e-05, "loss": 0.452, "step": 1500 }, { "epoch": 0.31857279388340237, "grad_norm": 2.526773452758789, "learning_rate": 9.204369594902141e-05, "loss": 0.4438, "step": 1750 }, { "epoch": 0.3640831930096027, "grad_norm": 2.1642537117004395, "learning_rate": 9.090578060992263e-05, "loss": 0.4338, "step": 2000 }, { "epoch": 0.40959359213580304, "grad_norm": 2.5757973194122314, "learning_rate": 8.976786527082386e-05, "loss": 0.4277, "step": 2250 }, { "epoch": 0.45510399126200335, "grad_norm": 2.473083734512329, "learning_rate": 8.862994993172509e-05, "loss": 0.4175, "step": 2500 }, { "epoch": 0.5006143903882037, "grad_norm": 2.8206324577331543, "learning_rate": 8.749203459262631e-05, "loss": 0.4039, "step": 2750 }, { "epoch": 0.546124789514404, "grad_norm": 2.784949779510498, "learning_rate": 8.635411925352754e-05, "loss": 0.3923, "step": 3000 }, { "epoch": 0.5916351886406044, "grad_norm": 3.3137335777282715, "learning_rate": 8.521620391442878e-05, "loss": 0.3802, "step": 3250 }, { "epoch": 0.6371455877668047, "grad_norm": 3.6405413150787354, "learning_rate": 8.407828857533e-05, "loss": 0.3678, "step": 3500 }, { "epoch": 0.6826559868930051, "grad_norm": 3.3154547214508057, "learning_rate": 8.294037323623123e-05, "loss": 0.3623, "step": 3750 }, { "epoch": 0.7281663860192054, "grad_norm": 3.19486141204834, "learning_rate": 8.180245789713246e-05, "loss": 0.3499, "step": 4000 }, { "epoch": 0.7736767851454057, "grad_norm": 3.61464524269104, "learning_rate": 8.066454255803368e-05, "loss": 0.3429, "step": 4250 }, { "epoch": 0.8191871842716061, "grad_norm": 3.3924710750579834, "learning_rate": 7.952662721893491e-05, "loss": 0.3346, "step": 4500 }, { "epoch": 0.8646975833978064, "grad_norm": 3.7511956691741943, "learning_rate": 7.838871187983614e-05, "loss": 0.3238, "step": 4750 }, { "epoch": 0.9102079825240067, "grad_norm": 3.344393253326416, "learning_rate": 7.725079654073737e-05, "loss": 0.3186, "step": 5000 }, { "epoch": 0.9557183816502071, "grad_norm": 3.1730661392211914, "learning_rate": 7.61128812016386e-05, "loss": 0.3119, "step": 5250 }, { "epoch": 1.0012287807764073, "grad_norm": 3.959838628768921, "learning_rate": 7.497496586253983e-05, "loss": 0.3085, "step": 5500 }, { "epoch": 1.0467391799026078, "grad_norm": 3.2908735275268555, "learning_rate": 7.383705052344107e-05, "loss": 0.2975, "step": 5750 }, { "epoch": 1.092249579028808, "grad_norm": 3.441880941390991, "learning_rate": 7.26991351843423e-05, "loss": 0.2926, "step": 6000 }, { "epoch": 1.1377599781550085, "grad_norm": 4.100038051605225, "learning_rate": 7.156121984524352e-05, "loss": 0.2859, "step": 6250 }, { "epoch": 1.1832703772812088, "grad_norm": 3.5992090702056885, "learning_rate": 7.042330450614475e-05, "loss": 0.2808, "step": 6500 }, { "epoch": 1.228780776407409, "grad_norm": 3.3500804901123047, "learning_rate": 6.928538916704597e-05, "loss": 0.2791, "step": 6750 }, { "epoch": 1.2742911755336095, "grad_norm": 3.3923401832580566, "learning_rate": 6.814747382794721e-05, "loss": 0.2749, "step": 7000 }, { "epoch": 1.3198015746598097, "grad_norm": 3.785520315170288, "learning_rate": 6.700955848884844e-05, "loss": 0.2713, "step": 7250 }, { "epoch": 1.3653119737860102, "grad_norm": 3.4256818294525146, "learning_rate": 6.587164314974966e-05, "loss": 0.2668, "step": 7500 }, { "epoch": 1.4108223729122105, "grad_norm": 3.892718553543091, "learning_rate": 6.473372781065089e-05, "loss": 0.265, "step": 7750 }, { "epoch": 1.4563327720384107, "grad_norm": 3.272980213165283, "learning_rate": 6.359581247155212e-05, "loss": 0.2602, "step": 8000 }, { "epoch": 1.501843171164611, "grad_norm": 3.327261209487915, "learning_rate": 6.245789713245334e-05, "loss": 0.2604, "step": 8250 }, { "epoch": 1.5473535702908114, "grad_norm": 3.1805334091186523, "learning_rate": 6.131998179335457e-05, "loss": 0.2572, "step": 8500 }, { "epoch": 1.592863969417012, "grad_norm": 3.5480244159698486, "learning_rate": 6.01820664542558e-05, "loss": 0.2523, "step": 8750 }, { "epoch": 1.6383743685432122, "grad_norm": 3.761888265609741, "learning_rate": 5.904415111515703e-05, "loss": 0.2491, "step": 9000 }, { "epoch": 1.6838847676694124, "grad_norm": 3.0907113552093506, "learning_rate": 5.790623577605826e-05, "loss": 0.2458, "step": 9250 }, { "epoch": 1.7293951667956127, "grad_norm": 3.0482497215270996, "learning_rate": 5.676832043695949e-05, "loss": 0.2444, "step": 9500 }, { "epoch": 1.7749055659218131, "grad_norm": 3.088454008102417, "learning_rate": 5.563040509786073e-05, "loss": 0.2442, "step": 9750 }, { "epoch": 1.8204159650480136, "grad_norm": 3.492504119873047, "learning_rate": 5.4492489758761954e-05, "loss": 0.24, "step": 10000 }, { "epoch": 1.8659263641742139, "grad_norm": 3.4256038665771484, "learning_rate": 5.3354574419663187e-05, "loss": 0.2381, "step": 10250 }, { "epoch": 1.9114367633004141, "grad_norm": 3.163264036178589, "learning_rate": 5.221665908056441e-05, "loss": 0.2356, "step": 10500 }, { "epoch": 1.9569471624266144, "grad_norm": 3.4566102027893066, "learning_rate": 5.107874374146564e-05, "loss": 0.2356, "step": 10750 }, { "epoch": 2.0024575615528146, "grad_norm": 3.404165744781494, "learning_rate": 4.9940828402366865e-05, "loss": 0.2318, "step": 11000 }, { "epoch": 2.0479679606790153, "grad_norm": 3.004925012588501, "learning_rate": 4.88029130632681e-05, "loss": 0.2259, "step": 11250 }, { "epoch": 2.0934783598052156, "grad_norm": 3.1267077922821045, "learning_rate": 4.7664997724169324e-05, "loss": 0.2253, "step": 11500 }, { "epoch": 2.138988758931416, "grad_norm": 3.2789225578308105, "learning_rate": 4.652708238507055e-05, "loss": 0.2211, "step": 11750 }, { "epoch": 2.184499158057616, "grad_norm": 3.55497407913208, "learning_rate": 4.5389167045971784e-05, "loss": 0.2237, "step": 12000 }, { "epoch": 2.2300095571838163, "grad_norm": 3.159508228302002, "learning_rate": 4.425125170687301e-05, "loss": 0.219, "step": 12250 }, { "epoch": 2.275519956310017, "grad_norm": 3.4701318740844727, "learning_rate": 4.3113336367774236e-05, "loss": 0.2204, "step": 12500 }, { "epoch": 2.3210303554362173, "grad_norm": 3.129274606704712, "learning_rate": 4.197542102867547e-05, "loss": 0.2192, "step": 12750 }, { "epoch": 2.3665407545624175, "grad_norm": 3.207860231399536, "learning_rate": 4.08375056895767e-05, "loss": 0.2179, "step": 13000 }, { "epoch": 2.4120511536886178, "grad_norm": 2.966536283493042, "learning_rate": 3.969959035047793e-05, "loss": 0.2149, "step": 13250 }, { "epoch": 2.457561552814818, "grad_norm": 3.199531078338623, "learning_rate": 3.8561675011379155e-05, "loss": 0.2149, "step": 13500 }, { "epoch": 2.5030719519410187, "grad_norm": 2.925417423248291, "learning_rate": 3.742375967228038e-05, "loss": 0.2127, "step": 13750 }, { "epoch": 2.548582351067219, "grad_norm": 3.013047218322754, "learning_rate": 3.6285844333181614e-05, "loss": 0.21, "step": 14000 }, { "epoch": 2.5940927501934192, "grad_norm": 3.139263153076172, "learning_rate": 3.514792899408284e-05, "loss": 0.2117, "step": 14250 }, { "epoch": 2.6396031493196195, "grad_norm": 3.118286371231079, "learning_rate": 3.4010013654984066e-05, "loss": 0.2113, "step": 14500 }, { "epoch": 2.6851135484458197, "grad_norm": 3.226870536804199, "learning_rate": 3.28720983158853e-05, "loss": 0.2087, "step": 14750 }, { "epoch": 2.7306239475720204, "grad_norm": 3.2298526763916016, "learning_rate": 3.173418297678653e-05, "loss": 0.2083, "step": 15000 }, { "epoch": 2.7761343466982207, "grad_norm": 2.83184814453125, "learning_rate": 3.059626763768776e-05, "loss": 0.2089, "step": 15250 }, { "epoch": 2.821644745824421, "grad_norm": 3.196331262588501, "learning_rate": 2.9458352298588988e-05, "loss": 0.2051, "step": 15500 }, { "epoch": 2.867155144950621, "grad_norm": 2.892674446105957, "learning_rate": 2.8320436959490214e-05, "loss": 0.2062, "step": 15750 }, { "epoch": 2.9126655440768214, "grad_norm": 2.7058119773864746, "learning_rate": 2.7182521620391444e-05, "loss": 0.2044, "step": 16000 }, { "epoch": 2.958175943203022, "grad_norm": 2.6910922527313232, "learning_rate": 2.604460628129267e-05, "loss": 0.2027, "step": 16250 }, { "epoch": 3.0036863423292224, "grad_norm": 2.9895167350769043, "learning_rate": 2.4906690942193903e-05, "loss": 0.2025, "step": 16500 }, { "epoch": 3.0491967414554226, "grad_norm": 2.6071224212646484, "learning_rate": 2.376877560309513e-05, "loss": 0.1976, "step": 16750 }, { "epoch": 3.094707140581623, "grad_norm": 2.265737533569336, "learning_rate": 2.263086026399636e-05, "loss": 0.1969, "step": 17000 }, { "epoch": 3.140217539707823, "grad_norm": 2.356515645980835, "learning_rate": 2.149294492489759e-05, "loss": 0.1967, "step": 17250 }, { "epoch": 3.1857279388340234, "grad_norm": 2.7601630687713623, "learning_rate": 2.0355029585798818e-05, "loss": 0.1972, "step": 17500 }, { "epoch": 3.231238337960224, "grad_norm": 3.170037031173706, "learning_rate": 1.9217114246700048e-05, "loss": 0.197, "step": 17750 }, { "epoch": 3.2767487370864243, "grad_norm": 2.934422016143799, "learning_rate": 1.8079198907601274e-05, "loss": 0.1933, "step": 18000 }, { "epoch": 3.3222591362126246, "grad_norm": 2.9783575534820557, "learning_rate": 1.6941283568502504e-05, "loss": 0.1947, "step": 18250 }, { "epoch": 3.367769535338825, "grad_norm": 2.80299711227417, "learning_rate": 1.5803368229403733e-05, "loss": 0.1938, "step": 18500 }, { "epoch": 3.413279934465025, "grad_norm": 2.343477487564087, "learning_rate": 1.4665452890304963e-05, "loss": 0.1934, "step": 18750 }, { "epoch": 3.458790333591226, "grad_norm": 3.13242769241333, "learning_rate": 1.352753755120619e-05, "loss": 0.1913, "step": 19000 }, { "epoch": 3.504300732717426, "grad_norm": 2.3755791187286377, "learning_rate": 1.238962221210742e-05, "loss": 0.1916, "step": 19250 }, { "epoch": 3.5498111318436263, "grad_norm": 3.209934949874878, "learning_rate": 1.1251706873008648e-05, "loss": 0.1899, "step": 19500 }, { "epoch": 3.5953215309698265, "grad_norm": 2.815525531768799, "learning_rate": 1.0113791533909878e-05, "loss": 0.1911, "step": 19750 }, { "epoch": 3.640831930096027, "grad_norm": 3.1206352710723877, "learning_rate": 8.975876194811106e-06, "loss": 0.1895, "step": 20000 }, { "epoch": 3.686342329222227, "grad_norm": 3.1889894008636475, "learning_rate": 7.837960855712335e-06, "loss": 0.1889, "step": 20250 }, { "epoch": 3.7318527283484277, "grad_norm": 3.095888376235962, "learning_rate": 6.700045516613563e-06, "loss": 0.1895, "step": 20500 }, { "epoch": 3.777363127474628, "grad_norm": 2.9188358783721924, "learning_rate": 5.562130177514793e-06, "loss": 0.1882, "step": 20750 }, { "epoch": 3.8228735266008282, "grad_norm": 2.6947951316833496, "learning_rate": 4.4242148384160225e-06, "loss": 0.1871, "step": 21000 }, { "epoch": 3.8683839257270285, "grad_norm": 2.88930606842041, "learning_rate": 3.286299499317251e-06, "loss": 0.1884, "step": 21250 }, { "epoch": 3.9138943248532287, "grad_norm": 3.069244146347046, "learning_rate": 2.14838416021848e-06, "loss": 0.1888, "step": 21500 }, { "epoch": 3.9594047239794294, "grad_norm": 2.7428455352783203, "learning_rate": 1.0104688211197086e-06, "loss": 0.1869, "step": 21750 }, { "epoch": 3.9998179584034954, "step": 21972, "total_flos": 1.737238850670485e+18, "train_loss": 0.2707560721716333, "train_runtime": 41232.1952, "train_samples_per_second": 51.159, "train_steps_per_second": 0.533 } ], "logging_steps": 250, "max_steps": 21972, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.737238850670485e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }