{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 262,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03816793893129771,
      "grad_norm": 9.226670913876331,
      "learning_rate": 7.407407407407407e-07,
      "loss": 1.0409,
      "mean_token_accuracy": 0.7926116704940795,
      "num_tokens": 386810.0,
      "step": 5
    },
    {
      "epoch": 0.07633587786259542,
      "grad_norm": 3.804973403997504,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 1.0712,
      "mean_token_accuracy": 0.7756210923194885,
      "num_tokens": 787708.0,
      "step": 10
    },
    {
      "epoch": 0.11450381679389313,
      "grad_norm": 1.7499258578222254,
      "learning_rate": 2.5925925925925925e-06,
      "loss": 0.8985,
      "mean_token_accuracy": 0.7973829984664917,
      "num_tokens": 1192940.0,
      "step": 15
    },
    {
      "epoch": 0.15267175572519084,
      "grad_norm": 6.83927543051746,
      "learning_rate": 3.5185185185185187e-06,
      "loss": 0.8262,
      "mean_token_accuracy": 0.8118989109992981,
      "num_tokens": 1584559.0,
      "step": 20
    },
    {
      "epoch": 0.19083969465648856,
      "grad_norm": 0.9445650007024579,
      "learning_rate": 4.444444444444444e-06,
      "loss": 0.5909,
      "mean_token_accuracy": 0.8610062956809997,
      "num_tokens": 1994159.0,
      "step": 25
    },
    {
      "epoch": 0.22900763358778625,
      "grad_norm": 0.7333396651548034,
      "learning_rate": 4.957446808510639e-06,
      "loss": 0.6298,
      "mean_token_accuracy": 0.8575117826461792,
      "num_tokens": 2395528.0,
      "step": 30
    },
    {
      "epoch": 0.26717557251908397,
      "grad_norm": 0.8342949171905462,
      "learning_rate": 4.851063829787234e-06,
      "loss": 0.4509,
      "mean_token_accuracy": 0.8964329123497009,
      "num_tokens": 2800227.0,
      "step": 35
    },
    {
      "epoch": 0.3053435114503817,
      "grad_norm": 0.7517210923147133,
      "learning_rate": 4.7446808510638305e-06,
      "loss": 0.6186,
      "mean_token_accuracy": 0.8617437362670899,
      "num_tokens": 3193615.0,
      "step": 40
    },
    {
      "epoch": 0.3435114503816794,
      "grad_norm": 0.7224468196652443,
      "learning_rate": 4.638297872340426e-06,
      "loss": 0.5298,
      "mean_token_accuracy": 0.8760863780975342,
      "num_tokens": 3602519.0,
      "step": 45
    },
    {
      "epoch": 0.3816793893129771,
      "grad_norm": 9.152121654360869,
      "learning_rate": 4.5319148936170215e-06,
      "loss": 0.6868,
      "mean_token_accuracy": 0.8515465021133423,
      "num_tokens": 4001905.0,
      "step": 50
    },
    {
      "epoch": 0.4198473282442748,
      "grad_norm": 3.892592763846139,
      "learning_rate": 4.425531914893617e-06,
      "loss": 0.6565,
      "mean_token_accuracy": 0.858378803730011,
      "num_tokens": 4396347.0,
      "step": 55
    },
    {
      "epoch": 0.4580152671755725,
      "grad_norm": 0.9697380164764114,
      "learning_rate": 4.319148936170213e-06,
      "loss": 0.5786,
      "mean_token_accuracy": 0.8708330988883972,
      "num_tokens": 4805947.0,
      "step": 60
    },
    {
      "epoch": 0.4961832061068702,
      "grad_norm": 0.5790341294192543,
      "learning_rate": 4.212765957446809e-06,
      "loss": 0.4529,
      "mean_token_accuracy": 0.889950966835022,
      "num_tokens": 5212394.0,
      "step": 65
    },
    {
      "epoch": 0.5343511450381679,
      "grad_norm": 0.7713566871833059,
      "learning_rate": 4.106382978723404e-06,
      "loss": 0.5353,
      "mean_token_accuracy": 0.8747799396514893,
      "num_tokens": 5613999.0,
      "step": 70
    },
    {
      "epoch": 0.5725190839694656,
      "grad_norm": 0.5654210829824995,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.5676,
      "mean_token_accuracy": 0.8743081212043762,
      "num_tokens": 6023295.0,
      "step": 75
    },
    {
      "epoch": 0.6106870229007634,
      "grad_norm": 0.5378495172846931,
      "learning_rate": 3.893617021276596e-06,
      "loss": 0.6124,
      "mean_token_accuracy": 0.8694724321365357,
      "num_tokens": 6423827.0,
      "step": 80
    },
    {
      "epoch": 0.648854961832061,
      "grad_norm": 4.6481796647437275,
      "learning_rate": 3.7872340425531917e-06,
      "loss": 0.5453,
      "mean_token_accuracy": 0.8780820608139038,
      "num_tokens": 6830002.0,
      "step": 85
    },
    {
      "epoch": 0.6870229007633588,
      "grad_norm": 0.6013868777206862,
      "learning_rate": 3.680851063829787e-06,
      "loss": 0.4699,
      "mean_token_accuracy": 0.8892535328865051,
      "num_tokens": 7239602.0,
      "step": 90
    },
    {
      "epoch": 0.7251908396946565,
      "grad_norm": 0.5982251388238545,
      "learning_rate": 3.5744680851063835e-06,
      "loss": 0.5096,
      "mean_token_accuracy": 0.8798805952072144,
      "num_tokens": 7639508.0,
      "step": 95
    },
    {
      "epoch": 0.7633587786259542,
      "grad_norm": 0.6181420027899801,
      "learning_rate": 3.468085106382979e-06,
      "loss": 0.4994,
      "mean_token_accuracy": 0.8826330661773681,
      "num_tokens": 8010548.0,
      "step": 100
    },
    {
      "epoch": 0.8015267175572519,
      "grad_norm": 0.6930619383810027,
      "learning_rate": 3.3617021276595745e-06,
      "loss": 0.4875,
      "mean_token_accuracy": 0.8840958476066589,
      "num_tokens": 8420148.0,
      "step": 105
    },
    {
      "epoch": 0.8396946564885496,
      "grad_norm": 0.584421267267461,
      "learning_rate": 3.255319148936171e-06,
      "loss": 0.6218,
      "mean_token_accuracy": 0.8632862210273743,
      "num_tokens": 8811142.0,
      "step": 110
    },
    {
      "epoch": 0.8778625954198473,
      "grad_norm": 0.6910785224299197,
      "learning_rate": 3.1489361702127664e-06,
      "loss": 0.5439,
      "mean_token_accuracy": 0.8773741364479065,
      "num_tokens": 9207371.0,
      "step": 115
    },
    {
      "epoch": 0.916030534351145,
      "grad_norm": 0.6529252723486935,
      "learning_rate": 3.042553191489362e-06,
      "loss": 0.5855,
      "mean_token_accuracy": 0.8638482332229614,
      "num_tokens": 9606796.0,
      "step": 120
    },
    {
      "epoch": 0.9541984732824428,
      "grad_norm": 0.8472446417517443,
      "learning_rate": 2.9361702127659574e-06,
      "loss": 0.5128,
      "mean_token_accuracy": 0.8809929728507996,
      "num_tokens": 10009722.0,
      "step": 125
    },
    {
      "epoch": 0.9923664122137404,
      "grad_norm": 0.551644409578081,
      "learning_rate": 2.8297872340425537e-06,
      "loss": 0.4421,
      "mean_token_accuracy": 0.8924627542495728,
      "num_tokens": 10403148.0,
      "step": 130
    },
    {
      "epoch": 1.0305343511450382,
      "grad_norm": 0.5625472220277399,
      "learning_rate": 2.7234042553191492e-06,
      "loss": 0.4711,
      "mean_token_accuracy": 0.8930499076843261,
      "num_tokens": 10754393.0,
      "step": 135
    },
    {
      "epoch": 1.0687022900763359,
      "grad_norm": 0.7509720129792777,
      "learning_rate": 2.6170212765957447e-06,
      "loss": 0.6222,
      "mean_token_accuracy": 0.8663328528404236,
      "num_tokens": 11130826.0,
      "step": 140
    },
    {
      "epoch": 1.1068702290076335,
      "grad_norm": 0.618538531749113,
      "learning_rate": 2.5106382978723402e-06,
      "loss": 0.3556,
      "mean_token_accuracy": 0.9134134173393249,
      "num_tokens": 11539730.0,
      "step": 145
    },
    {
      "epoch": 1.1450381679389312,
      "grad_norm": 0.5959373365925987,
      "learning_rate": 2.404255319148936e-06,
      "loss": 0.5278,
      "mean_token_accuracy": 0.8801125884056091,
      "num_tokens": 11935963.0,
      "step": 150
    },
    {
      "epoch": 1.183206106870229,
      "grad_norm": 0.5865155175564103,
      "learning_rate": 2.297872340425532e-06,
      "loss": 0.4265,
      "mean_token_accuracy": 0.8947603702545166,
      "num_tokens": 12343667.0,
      "step": 155
    },
    {
      "epoch": 1.2213740458015268,
      "grad_norm": 0.5659455399478734,
      "learning_rate": 2.191489361702128e-06,
      "loss": 0.383,
      "mean_token_accuracy": 0.9063637614250183,
      "num_tokens": 12738487.0,
      "step": 160
    },
    {
      "epoch": 1.2595419847328244,
      "grad_norm": 0.5226402464959303,
      "learning_rate": 2.0851063829787235e-06,
      "loss": 0.397,
      "mean_token_accuracy": 0.9050293564796448,
      "num_tokens": 13144435.0,
      "step": 165
    },
    {
      "epoch": 1.297709923664122,
      "grad_norm": 0.6417118035542916,
      "learning_rate": 1.9787234042553194e-06,
      "loss": 0.5203,
      "mean_token_accuracy": 0.8780357480049134,
      "num_tokens": 13552365.0,
      "step": 170
    },
    {
      "epoch": 1.33587786259542,
      "grad_norm": 0.5899140268106529,
      "learning_rate": 1.872340425531915e-06,
      "loss": 0.4622,
      "mean_token_accuracy": 0.894351315498352,
      "num_tokens": 13952271.0,
      "step": 175
    },
    {
      "epoch": 1.3740458015267176,
      "grad_norm": 0.6961173596879818,
      "learning_rate": 1.7659574468085109e-06,
      "loss": 0.448,
      "mean_token_accuracy": 0.8905507564544678,
      "num_tokens": 14347634.0,
      "step": 180
    },
    {
      "epoch": 1.4122137404580153,
      "grad_norm": 0.7939108466614023,
      "learning_rate": 1.6595744680851064e-06,
      "loss": 0.5382,
      "mean_token_accuracy": 0.8792445421218872,
      "num_tokens": 14749885.0,
      "step": 185
    },
    {
      "epoch": 1.450381679389313,
      "grad_norm": 0.6478170620292795,
      "learning_rate": 1.5531914893617023e-06,
      "loss": 0.398,
      "mean_token_accuracy": 0.9026367902755738,
      "num_tokens": 15159485.0,
      "step": 190
    },
    {
      "epoch": 1.4885496183206106,
      "grad_norm": 0.6144372382488293,
      "learning_rate": 1.4468085106382978e-06,
      "loss": 0.3856,
      "mean_token_accuracy": 0.9071364164352417,
      "num_tokens": 15555175.0,
      "step": 195
    },
    {
      "epoch": 1.5267175572519083,
      "grad_norm": 0.5792827372554384,
      "learning_rate": 1.3404255319148937e-06,
      "loss": 0.3427,
      "mean_token_accuracy": 0.9158633589744568,
      "num_tokens": 15961426.0,
      "step": 200
    },
    {
      "epoch": 1.5648854961832062,
      "grad_norm": 0.6055405255828548,
      "learning_rate": 1.2340425531914894e-06,
      "loss": 0.4136,
      "mean_token_accuracy": 0.9000585198402404,
      "num_tokens": 16365746.0,
      "step": 205
    },
    {
      "epoch": 1.6030534351145038,
      "grad_norm": 0.6399791944894293,
      "learning_rate": 1.1276595744680851e-06,
      "loss": 0.4336,
      "mean_token_accuracy": 0.8964980363845825,
      "num_tokens": 16756037.0,
      "step": 210
    },
    {
      "epoch": 1.6412213740458015,
      "grad_norm": 0.7671848929989695,
      "learning_rate": 1.0212765957446809e-06,
      "loss": 0.5051,
      "mean_token_accuracy": 0.8887869715690613,
      "num_tokens": 17141806.0,
      "step": 215
    },
    {
      "epoch": 1.6793893129770994,
      "grad_norm": 0.4980858703360447,
      "learning_rate": 9.148936170212766e-07,
      "loss": 0.3698,
      "mean_token_accuracy": 0.9083195209503174,
      "num_tokens": 17539462.0,
      "step": 220
    },
    {
      "epoch": 1.717557251908397,
      "grad_norm": 0.643707317701808,
      "learning_rate": 8.085106382978725e-07,
      "loss": 0.5255,
      "mean_token_accuracy": 0.8803560018539429,
      "num_tokens": 17946504.0,
      "step": 225
    },
    {
      "epoch": 1.7557251908396947,
      "grad_norm": 0.6085852220164623,
      "learning_rate": 7.021276595744682e-07,
      "loss": 0.4066,
      "mean_token_accuracy": 0.8994758605957032,
      "num_tokens": 18356104.0,
      "step": 230
    },
    {
      "epoch": 1.7938931297709924,
      "grad_norm": 0.5913104137597855,
      "learning_rate": 5.957446808510639e-07,
      "loss": 0.4863,
      "mean_token_accuracy": 0.8849827289581299,
      "num_tokens": 18757709.0,
      "step": 235
    },
    {
      "epoch": 1.83206106870229,
      "grad_norm": 0.8448497829484911,
      "learning_rate": 4.893617021276596e-07,
      "loss": 0.5258,
      "mean_token_accuracy": 0.8826202154159546,
      "num_tokens": 19165468.0,
      "step": 240
    },
    {
      "epoch": 1.8702290076335877,
      "grad_norm": 0.5878320328654364,
      "learning_rate": 3.8297872340425535e-07,
      "loss": 0.4478,
      "mean_token_accuracy": 0.8924641013145447,
      "num_tokens": 19562227.0,
      "step": 245
    },
    {
      "epoch": 1.9083969465648853,
      "grad_norm": 0.6221314800813964,
      "learning_rate": 2.7659574468085106e-07,
      "loss": 0.579,
      "mean_token_accuracy": 0.8665671706199646,
      "num_tokens": 19958088.0,
      "step": 250
    },
    {
      "epoch": 1.9465648854961832,
      "grad_norm": 0.4957957383410633,
      "learning_rate": 1.7021276595744683e-07,
      "loss": 0.3479,
      "mean_token_accuracy": 0.9116143703460693,
      "num_tokens": 20348442.0,
      "step": 255
    },
    {
      "epoch": 1.984732824427481,
      "grad_norm": 0.5216187078326494,
      "learning_rate": 6.382978723404255e-08,
      "loss": 0.3969,
      "mean_token_accuracy": 0.9038890838623047,
      "num_tokens": 20755096.0,
      "step": 260
    },
    {
      "epoch": 2.0,
      "mean_token_accuracy": 0.9072179198265076,
      "num_tokens": 20867736.0,
      "step": 262,
      "total_flos": 41921614839808.0,
      "train_loss": 0.5300418632176086,
      "train_runtime": 511.2418,
      "train_samples_per_second": 4.08,
      "train_steps_per_second": 0.512
    }
  ],
  "logging_steps": 5,
  "max_steps": 262,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 41921614839808.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}