{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 50, "global_step": 942, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3194888178913738, "grad_norm": 0.4640214145183563, "learning_rate": 1.0315789473684213e-05, "loss": 2.3153, "mean_token_accuracy": 0.5581120517104864, "num_tokens": 936278.0, "step": 50 }, { "epoch": 0.3194888178913738, "eval_loss": 2.3463408946990967, "eval_mean_token_accuracy": 0.5574123318922721, "eval_num_tokens": 936278.0, "eval_runtime": 849.9086, "eval_samples_per_second": 1.105, "eval_steps_per_second": 0.139, "step": 50 }, { "epoch": 0.6389776357827476, "grad_norm": 0.14180870354175568, "learning_rate": 1.999889943690545e-05, "loss": 2.2111, "mean_token_accuracy": 0.5665408112108707, "num_tokens": 1858244.0, "step": 100 }, { "epoch": 0.6389776357827476, "eval_loss": 2.161623239517212, "eval_mean_token_accuracy": 0.5832038072206206, "eval_num_tokens": 1858244.0, "eval_runtime": 849.6915, "eval_samples_per_second": 1.105, "eval_steps_per_second": 0.139, "step": 100 }, { "epoch": 0.9584664536741214, "grad_norm": 0.1999247670173645, "learning_rate": 1.9800088348404778e-05, "loss": 2.0427, "mean_token_accuracy": 0.595615528896451, "num_tokens": 2787448.0, "step": 150 }, { "epoch": 0.9584664536741214, "eval_loss": 1.964790940284729, "eval_mean_token_accuracy": 0.6102252491449905, "eval_num_tokens": 2787448.0, "eval_runtime": 850.5137, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 150 }, { "epoch": 1.2747603833865815, "grad_norm": 0.18980443477630615, "learning_rate": 1.9265185523130156e-05, "loss": 1.9083, "mean_token_accuracy": 0.6146050283704141, "num_tokens": 3708402.0, "step": 200 }, { "epoch": 1.2747603833865815, "eval_loss": 1.8844295740127563, "eval_mean_token_accuracy": 0.6211647886340901, "eval_num_tokens": 3708402.0, "eval_runtime": 850.6392, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 200 }, { "epoch": 1.5942492012779552, "grad_norm": 0.14579655230045319, "learning_rate": 1.8412535328311813e-05, "loss": 1.8207, "mean_token_accuracy": 0.6267321394383907, "num_tokens": 4634196.0, "step": 250 }, { "epoch": 1.5942492012779552, "eval_loss": 1.8456555604934692, "eval_mean_token_accuracy": 0.6270800980470949, "eval_num_tokens": 4634196.0, "eval_runtime": 851.8063, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.139, "step": 250 }, { "epoch": 1.9137380191693292, "grad_norm": 0.14154069125652313, "learning_rate": 1.7271379202868394e-05, "loss": 1.7851, "mean_token_accuracy": 0.6324610809981823, "num_tokens": 5563181.0, "step": 300 }, { "epoch": 1.9137380191693292, "eval_loss": 1.8308707475662231, "eval_mean_token_accuracy": 0.6285602904982486, "eval_num_tokens": 5563181.0, "eval_runtime": 850.7358, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 300 }, { "epoch": 2.230031948881789, "grad_norm": 0.16205650568008423, "learning_rate": 1.5880852829101464e-05, "loss": 1.7711, "mean_token_accuracy": 0.636140936327101, "num_tokens": 6483800.0, "step": 350 }, { "epoch": 2.230031948881789, "eval_loss": 1.8208913803100586, "eval_mean_token_accuracy": 0.6294448860621048, "eval_num_tokens": 6483800.0, "eval_runtime": 851.4097, "eval_samples_per_second": 1.103, "eval_steps_per_second": 0.139, "step": 350 }, { "epoch": 2.549520766773163, "grad_norm": 0.20008337497711182, "learning_rate": 1.428864398362841e-05, "loss": 1.7823, "mean_token_accuracy": 0.6310028441995382, "num_tokens": 7408532.0, "step": 400 }, { "epoch": 2.549520766773163, "eval_loss": 1.8133732080459595, "eval_mean_token_accuracy": 0.630439654750339, "eval_num_tokens": 7408532.0, "eval_runtime": 850.3072, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 400 }, { "epoch": 2.8690095846645365, "grad_norm": 0.1719253808259964, "learning_rate": 1.2549357096241841e-05, "loss": 1.7909, "mean_token_accuracy": 0.628285994976759, "num_tokens": 8329291.0, "step": 450 }, { "epoch": 2.8690095846645365, "eval_loss": 1.8075919151306152, "eval_mean_token_accuracy": 0.63100187152119, "eval_num_tokens": 8329291.0, "eval_runtime": 849.8047, "eval_samples_per_second": 1.105, "eval_steps_per_second": 0.139, "step": 450 }, { "epoch": 3.1853035143769968, "grad_norm": 0.173483207821846, "learning_rate": 1.0722640603766825e-05, "loss": 1.7433, "mean_token_accuracy": 0.6376601189675958, "num_tokens": 9252324.0, "step": 500 }, { "epoch": 3.1853035143769968, "eval_loss": 1.8033778667449951, "eval_mean_token_accuracy": 0.631462009781498, "eval_num_tokens": 9252324.0, "eval_runtime": 849.6602, "eval_samples_per_second": 1.105, "eval_steps_per_second": 0.139, "step": 500 }, { "epoch": 3.5047923322683707, "grad_norm": 0.16342906653881073, "learning_rate": 8.871141320877181e-06, "loss": 1.768, "mean_token_accuracy": 0.6327251829206944, "num_tokens": 10178542.0, "step": 550 }, { "epoch": 3.5047923322683707, "eval_loss": 1.799792766571045, "eval_mean_token_accuracy": 0.6317491682909303, "eval_num_tokens": 10178542.0, "eval_runtime": 850.4015, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 550 }, { "epoch": 3.8242811501597442, "grad_norm": 0.1763872355222702, "learning_rate": 7.058355982245038e-06, "loss": 1.7466, "mean_token_accuracy": 0.6369322521239519, "num_tokens": 11110223.0, "step": 600 }, { "epoch": 3.8242811501597442, "eval_loss": 1.7971426248550415, "eval_mean_token_accuracy": 0.6322182533094438, "eval_num_tokens": 11110223.0, "eval_runtime": 851.1474, "eval_samples_per_second": 1.103, "eval_steps_per_second": 0.139, "step": 600 }, { "epoch": 4.140575079872204, "grad_norm": 0.1760365068912506, "learning_rate": 5.346453636882939e-06, "loss": 1.7488, "mean_token_accuracy": 0.635444945683985, "num_tokens": 12027841.0, "step": 650 }, { "epoch": 4.140575079872204, "eval_loss": 1.7952942848205566, "eval_mean_token_accuracy": 0.6321543343996597, "eval_num_tokens": 12027841.0, "eval_runtime": 850.8638, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 650 }, { "epoch": 4.460063897763578, "grad_norm": 0.18568162620067596, "learning_rate": 3.794143575154964e-06, "loss": 1.7644, "mean_token_accuracy": 0.6322370621562005, "num_tokens": 12951270.0, "step": 700 }, { "epoch": 4.460063897763578, "eval_loss": 1.7938051223754883, "eval_mean_token_accuracy": 0.6323889074689251, "eval_num_tokens": 12951270.0, "eval_runtime": 850.6954, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 700 }, { "epoch": 4.779552715654952, "grad_norm": 0.17006878554821014, "learning_rate": 2.4546619074011603e-06, "loss": 1.7434, "mean_token_accuracy": 0.63632001131773, "num_tokens": 13879907.0, "step": 750 }, { "epoch": 4.779552715654952, "eval_loss": 1.7928369045257568, "eval_mean_token_accuracy": 0.6325142757367279, "eval_num_tokens": 13879907.0, "eval_runtime": 851.6054, "eval_samples_per_second": 1.103, "eval_steps_per_second": 0.139, "step": 750 }, { "epoch": 5.095846645367412, "grad_norm": 0.16964676976203918, "learning_rate": 1.373945843990192e-06, "loss": 1.7534, "mean_token_accuracy": 0.6362755319686851, "num_tokens": 14805495.0, "step": 800 }, { "epoch": 5.095846645367412, "eval_loss": 1.7922581434249878, "eval_mean_token_accuracy": 0.6325970463833567, "eval_num_tokens": 14805495.0, "eval_runtime": 852.0987, "eval_samples_per_second": 1.102, "eval_steps_per_second": 0.138, "step": 800 }, { "epoch": 5.415335463258786, "grad_norm": 0.15671594440937042, "learning_rate": 5.890582894386798e-07, "loss": 1.7308, "mean_token_accuracy": 0.6389838096499443, "num_tokens": 15729518.0, "step": 850 }, { "epoch": 5.415335463258786, "eval_loss": 1.7919801473617554, "eval_mean_token_accuracy": 0.6326618699704186, "eval_num_tokens": 15729518.0, "eval_runtime": 850.7319, "eval_samples_per_second": 1.104, "eval_steps_per_second": 0.139, "step": 850 }, { "epoch": 5.73482428115016, "grad_norm": 0.17389754951000214, "learning_rate": 1.269167787731662e-07, "loss": 1.7591, "mean_token_accuracy": 0.6318698778748513, "num_tokens": 16657397.0, "step": 900 }, { "epoch": 5.73482428115016, "eval_loss": 1.7918750047683716, "eval_mean_token_accuracy": 0.6326669464677067, "eval_num_tokens": 16657397.0, "eval_runtime": 851.3734, "eval_samples_per_second": 1.103, "eval_steps_per_second": 0.139, "step": 900 } ], "logging_steps": 50, "max_steps": 942, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.882764527567749e+17, "train_batch_size": 3, "trial_name": null, "trial_params": null }