{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.187560425394779,
  "eval_steps": 100,
  "global_step": 2400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12890750886239125,
      "grad_norm": 5.744519798404323,
      "learning_rate": 1e-05,
      "loss": 3.8898,
      "step": 50
    },
    {
      "epoch": 0.2578150177247825,
      "grad_norm": 7.663336118255639,
      "learning_rate": 9.999766822485166e-06,
      "loss": 2.7367,
      "step": 100
    },
    {
      "epoch": 0.2578150177247825,
      "eval_loss": 2.547008991241455,
      "eval_runtime": 13.6928,
      "eval_samples_per_second": 73.031,
      "eval_steps_per_second": 2.337,
      "step": 100
    },
    {
      "epoch": 0.3867225265871737,
      "grad_norm": 6.687747513820881,
      "learning_rate": 9.999067314105889e-06,
      "loss": 2.4915,
      "step": 150
    },
    {
      "epoch": 0.515630035449565,
      "grad_norm": 3.3619995730848027,
      "learning_rate": 9.997901547355329e-06,
      "loss": 2.3844,
      "step": 200
    },
    {
      "epoch": 0.515630035449565,
      "eval_loss": 2.2610087394714355,
      "eval_runtime": 13.0766,
      "eval_samples_per_second": 76.472,
      "eval_steps_per_second": 2.447,
      "step": 200
    },
    {
      "epoch": 0.6445375443119562,
      "grad_norm": 5.032691180052785,
      "learning_rate": 9.996269643047091e-06,
      "loss": 2.2534,
      "step": 250
    },
    {
      "epoch": 0.7734450531743474,
      "grad_norm": 3.975968794985624,
      "learning_rate": 9.99417177030268e-06,
      "loss": 2.1781,
      "step": 300
    },
    {
      "epoch": 0.7734450531743474,
      "eval_loss": 2.0959012508392334,
      "eval_runtime": 13.0524,
      "eval_samples_per_second": 76.614,
      "eval_steps_per_second": 2.452,
      "step": 300
    },
    {
      "epoch": 0.9023525620367386,
      "grad_norm": 3.795190451013864,
      "learning_rate": 9.991608146533984e-06,
      "loss": 2.1342,
      "step": 350
    },
    {
      "epoch": 1.03126007089913,
      "grad_norm": 3.850079627270192,
      "learning_rate": 9.988579037420745e-06,
      "loss": 2.0796,
      "step": 400
    },
    {
      "epoch": 1.03126007089913,
      "eval_loss": 2.029681921005249,
      "eval_runtime": 13.0552,
      "eval_samples_per_second": 76.598,
      "eval_steps_per_second": 2.451,
      "step": 400
    },
    {
      "epoch": 1.1601675797615212,
      "grad_norm": 2.0860401247438722,
      "learning_rate": 9.985084756883026e-06,
      "loss": 2.0583,
      "step": 450
    },
    {
      "epoch": 1.2890750886239124,
      "grad_norm": 1.7587705935004414,
      "learning_rate": 9.98112566704867e-06,
      "loss": 2.0388,
      "step": 500
    },
    {
      "epoch": 1.2890750886239124,
      "eval_loss": 1.9869602918624878,
      "eval_runtime": 13.1308,
      "eval_samples_per_second": 76.157,
      "eval_steps_per_second": 2.437,
      "step": 500
    },
    {
      "epoch": 1.4179825974863036,
      "grad_norm": 2.675426697648005,
      "learning_rate": 9.97670217821578e-06,
      "loss": 2.0418,
      "step": 550
    },
    {
      "epoch": 1.5468901063486948,
      "grad_norm": 2.1347946648640694,
      "learning_rate": 9.971814748810192e-06,
      "loss": 2.0001,
      "step": 600
    },
    {
      "epoch": 1.5468901063486948,
      "eval_loss": 1.963959813117981,
      "eval_runtime": 13.0814,
      "eval_samples_per_second": 76.444,
      "eval_steps_per_second": 2.446,
      "step": 600
    },
    {
      "epoch": 1.675797615211086,
      "grad_norm": 2.8870637538075505,
      "learning_rate": 9.96646388533797e-06,
      "loss": 1.9743,
      "step": 650
    },
    {
      "epoch": 1.8047051240734773,
      "grad_norm": 4.178579445918737,
      "learning_rate": 9.960650142332914e-06,
      "loss": 1.9717,
      "step": 700
    },
    {
      "epoch": 1.8047051240734773,
      "eval_loss": 1.9353902339935303,
      "eval_runtime": 13.1146,
      "eval_samples_per_second": 76.251,
      "eval_steps_per_second": 2.44,
      "step": 700
    },
    {
      "epoch": 1.9336126329358685,
      "grad_norm": 2.199357001490578,
      "learning_rate": 9.954374122299082e-06,
      "loss": 1.9634,
      "step": 750
    },
    {
      "epoch": 2.06252014179826,
      "grad_norm": 2.166363542050421,
      "learning_rate": 9.947636475648373e-06,
      "loss": 1.9475,
      "step": 800
    },
    {
      "epoch": 2.06252014179826,
      "eval_loss": 1.9235832691192627,
      "eval_runtime": 13.0977,
      "eval_samples_per_second": 76.349,
      "eval_steps_per_second": 2.443,
      "step": 800
    },
    {
      "epoch": 2.191427650660651,
      "grad_norm": 2.467046686495788,
      "learning_rate": 9.940437900633096e-06,
      "loss": 1.9309,
      "step": 850
    },
    {
      "epoch": 2.3203351595230424,
      "grad_norm": 2.855139647937512,
      "learning_rate": 9.932779143273619e-06,
      "loss": 1.9347,
      "step": 900
    },
    {
      "epoch": 2.3203351595230424,
      "eval_loss": 1.9101444482803345,
      "eval_runtime": 13.1292,
      "eval_samples_per_second": 76.166,
      "eval_steps_per_second": 2.437,
      "step": 900
    },
    {
      "epoch": 2.4492426683854336,
      "grad_norm": 1.9014562925621068,
      "learning_rate": 9.92466099728106e-06,
      "loss": 1.9278,
      "step": 950
    },
    {
      "epoch": 2.578150177247825,
      "grad_norm": 1.3665703880188187,
      "learning_rate": 9.91608430397502e-06,
      "loss": 1.9157,
      "step": 1000
    },
    {
      "epoch": 2.578150177247825,
      "eval_loss": 1.8934762477874756,
      "eval_runtime": 13.0864,
      "eval_samples_per_second": 76.415,
      "eval_steps_per_second": 2.445,
      "step": 1000
    },
    {
      "epoch": 2.707057686110216,
      "grad_norm": 1.2319877018351602,
      "learning_rate": 9.907049952196403e-06,
      "loss": 1.9105,
      "step": 1050
    },
    {
      "epoch": 2.8359651949726072,
      "grad_norm": 1.5481172638750316,
      "learning_rate": 9.897558878215295e-06,
      "loss": 1.907,
      "step": 1100
    },
    {
      "epoch": 2.8359651949726072,
      "eval_loss": 1.8839563131332397,
      "eval_runtime": 13.153,
      "eval_samples_per_second": 76.028,
      "eval_steps_per_second": 2.433,
      "step": 1100
    },
    {
      "epoch": 2.9648727038349985,
      "grad_norm": 2.045794673210496,
      "learning_rate": 9.887612065633936e-06,
      "loss": 1.8945,
      "step": 1150
    },
    {
      "epoch": 3.0937802126973897,
      "grad_norm": 1.1757006962680892,
      "learning_rate": 9.877210545284792e-06,
      "loss": 1.888,
      "step": 1200
    },
    {
      "epoch": 3.0937802126973897,
      "eval_loss": 1.874881386756897,
      "eval_runtime": 13.1346,
      "eval_samples_per_second": 76.135,
      "eval_steps_per_second": 2.436,
      "step": 1200
    },
    {
      "epoch": 3.222687721559781,
      "grad_norm": 4.950973442850626,
      "learning_rate": 9.86635539512371e-06,
      "loss": 1.8811,
      "step": 1250
    },
    {
      "epoch": 3.351595230422172,
      "grad_norm": 1.8124376039297767,
      "learning_rate": 9.855047740118221e-06,
      "loss": 1.876,
      "step": 1300
    },
    {
      "epoch": 3.351595230422172,
      "eval_loss": 1.86701238155365,
      "eval_runtime": 13.0772,
      "eval_samples_per_second": 76.469,
      "eval_steps_per_second": 2.447,
      "step": 1300
    },
    {
      "epoch": 3.4805027392845633,
      "grad_norm": 1.0286338189260262,
      "learning_rate": 9.843288752130942e-06,
      "loss": 1.8683,
      "step": 1350
    },
    {
      "epoch": 3.6094102481469545,
      "grad_norm": 1.9894455905335977,
      "learning_rate": 9.831079649798138e-06,
      "loss": 1.8731,
      "step": 1400
    },
    {
      "epoch": 3.6094102481469545,
      "eval_loss": 1.8612475395202637,
      "eval_runtime": 13.1177,
      "eval_samples_per_second": 76.233,
      "eval_steps_per_second": 2.439,
      "step": 1400
    },
    {
      "epoch": 3.7383177570093458,
      "grad_norm": 1.7685619948548048,
      "learning_rate": 9.818421698403429e-06,
      "loss": 1.8648,
      "step": 1450
    },
    {
      "epoch": 3.867225265871737,
      "grad_norm": 1.5478355130519232,
      "learning_rate": 9.805316209746655e-06,
      "loss": 1.8665,
      "step": 1500
    },
    {
      "epoch": 3.867225265871737,
      "eval_loss": 1.8537719249725342,
      "eval_runtime": 13.1085,
      "eval_samples_per_second": 76.286,
      "eval_steps_per_second": 2.441,
      "step": 1500
    },
    {
      "epoch": 3.996132774734128,
      "grad_norm": 2.1355502302510447,
      "learning_rate": 9.791764542007945e-06,
      "loss": 1.8655,
      "step": 1550
    },
    {
      "epoch": 4.12504028359652,
      "grad_norm": 1.1480708382569662,
      "learning_rate": 9.777768099606938e-06,
      "loss": 1.8346,
      "step": 1600
    },
    {
      "epoch": 4.12504028359652,
      "eval_loss": 1.8495159149169922,
      "eval_runtime": 13.1242,
      "eval_samples_per_second": 76.195,
      "eval_steps_per_second": 2.438,
      "step": 1600
    },
    {
      "epoch": 4.253947792458911,
      "grad_norm": 1.4360213593754323,
      "learning_rate": 9.763328333057263e-06,
      "loss": 1.8265,
      "step": 1650
    },
    {
      "epoch": 4.382855301321302,
      "grad_norm": 1.8676589580299272,
      "learning_rate": 9.748446738816201e-06,
      "loss": 1.8391,
      "step": 1700
    },
    {
      "epoch": 4.382855301321302,
      "eval_loss": 1.8443361520767212,
      "eval_runtime": 13.1311,
      "eval_samples_per_second": 76.155,
      "eval_steps_per_second": 2.437,
      "step": 1700
    },
    {
      "epoch": 4.5117628101836935,
      "grad_norm": 1.2690786695530565,
      "learning_rate": 9.733124859129598e-06,
      "loss": 1.8434,
      "step": 1750
    },
    {
      "epoch": 4.640670319046085,
      "grad_norm": 1.0388627604335856,
      "learning_rate": 9.717364281872047e-06,
      "loss": 1.842,
      "step": 1800
    },
    {
      "epoch": 4.640670319046085,
      "eval_loss": 1.8362445831298828,
      "eval_runtime": 13.1151,
      "eval_samples_per_second": 76.248,
      "eval_steps_per_second": 2.44,
      "step": 1800
    },
    {
      "epoch": 4.769577827908476,
      "grad_norm": 1.8674582891348575,
      "learning_rate": 9.701166640382317e-06,
      "loss": 1.8308,
      "step": 1850
    },
    {
      "epoch": 4.898485336770867,
      "grad_norm": 1.4621440377933717,
      "learning_rate": 9.684533613294096e-06,
      "loss": 1.8382,
      "step": 1900
    },
    {
      "epoch": 4.898485336770867,
      "eval_loss": 1.831936240196228,
      "eval_runtime": 13.0768,
      "eval_samples_per_second": 76.472,
      "eval_steps_per_second": 2.447,
      "step": 1900
    },
    {
      "epoch": 5.027392845633258,
      "grad_norm": 1.0861177702948985,
      "learning_rate": 9.667466924362013e-06,
      "loss": 1.8308,
      "step": 1950
    },
    {
      "epoch": 5.15630035449565,
      "grad_norm": 1.8095526457252593,
      "learning_rate": 9.649968342283005e-06,
      "loss": 1.8161,
      "step": 2000
    },
    {
      "epoch": 5.15630035449565,
      "eval_loss": 1.829033374786377,
      "eval_runtime": 13.067,
      "eval_samples_per_second": 76.529,
      "eval_steps_per_second": 2.449,
      "step": 2000
    },
    {
      "epoch": 5.285207863358041,
      "grad_norm": 1.518641242728159,
      "learning_rate": 9.632039680513024e-06,
      "loss": 1.8007,
      "step": 2050
    },
    {
      "epoch": 5.414115372220432,
      "grad_norm": 1.0542508895398046,
      "learning_rate": 9.613682797079086e-06,
      "loss": 1.7999,
      "step": 2100
    },
    {
      "epoch": 5.414115372220432,
      "eval_loss": 1.8255321979522705,
      "eval_runtime": 13.0764,
      "eval_samples_per_second": 76.474,
      "eval_steps_per_second": 2.447,
      "step": 2100
    },
    {
      "epoch": 5.543022881082823,
      "grad_norm": 1.9568398940474616,
      "learning_rate": 9.594899594386732e-06,
      "loss": 1.8189,
      "step": 2150
    },
    {
      "epoch": 5.6719303899452145,
      "grad_norm": 0.8933581069243784,
      "learning_rate": 9.57569201902286e-06,
      "loss": 1.8066,
      "step": 2200
    },
    {
      "epoch": 5.6719303899452145,
      "eval_loss": 1.8212575912475586,
      "eval_runtime": 13.0944,
      "eval_samples_per_second": 76.369,
      "eval_steps_per_second": 2.444,
      "step": 2200
    },
    {
      "epoch": 5.800837898807606,
      "grad_norm": 0.7789642769876175,
      "learning_rate": 9.556062061553995e-06,
      "loss": 1.8068,
      "step": 2250
    },
    {
      "epoch": 5.929745407669997,
      "grad_norm": 1.0394842073428503,
      "learning_rate": 9.536011756320011e-06,
      "loss": 1.8165,
      "step": 2300
    },
    {
      "epoch": 5.929745407669997,
      "eval_loss": 1.8185018301010132,
      "eval_runtime": 13.0777,
      "eval_samples_per_second": 76.466,
      "eval_steps_per_second": 2.447,
      "step": 2300
    },
    {
      "epoch": 6.058652916532388,
      "grad_norm": 1.9360605440141716,
      "learning_rate": 9.515543181223277e-06,
      "loss": 1.7866,
      "step": 2350
    },
    {
      "epoch": 6.187560425394779,
      "grad_norm": 1.2520661239107576,
      "learning_rate": 9.494658457513341e-06,
      "loss": 1.7824,
      "step": 2400
    },
    {
      "epoch": 6.187560425394779,
      "eval_loss": 1.8156663179397583,
      "eval_runtime": 13.1655,
      "eval_samples_per_second": 75.956,
      "eval_steps_per_second": 2.431,
      "step": 2400
    }
  ],
  "logging_steps": 50,
  "max_steps": 15480,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 40,
  "save_steps": 800,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2134166405644288.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}