{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9987639060568603,
  "eval_steps": 500,
  "global_step": 505,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.009888751545117428,
      "grad_norm": 25.25,
      "learning_rate": 0.0002,
      "loss": 3.4401,
      "step": 5
    },
    {
      "epoch": 0.019777503090234856,
      "grad_norm": 11.125,
      "learning_rate": 0.0002,
      "loss": 2.2116,
      "step": 10
    },
    {
      "epoch": 0.029666254635352288,
      "grad_norm": 4.65625,
      "learning_rate": 0.0002,
      "loss": 1.6815,
      "step": 15
    },
    {
      "epoch": 0.03955500618046971,
      "grad_norm": 13.0,
      "learning_rate": 0.0002,
      "loss": 1.906,
      "step": 20
    },
    {
      "epoch": 0.049443757725587144,
      "grad_norm": 6.03125,
      "learning_rate": 0.0002,
      "loss": 1.8419,
      "step": 25
    },
    {
      "epoch": 0.059332509270704575,
      "grad_norm": 7.125,
      "learning_rate": 0.0002,
      "loss": 1.7867,
      "step": 30
    },
    {
      "epoch": 0.069221260815822,
      "grad_norm": 4.125,
      "learning_rate": 0.0002,
      "loss": 1.7177,
      "step": 35
    },
    {
      "epoch": 0.07911001236093942,
      "grad_norm": 4.375,
      "learning_rate": 0.0002,
      "loss": 1.4636,
      "step": 40
    },
    {
      "epoch": 0.08899876390605686,
      "grad_norm": 42.5,
      "learning_rate": 0.0002,
      "loss": 1.6758,
      "step": 45
    },
    {
      "epoch": 0.09888751545117429,
      "grad_norm": 3.71875,
      "learning_rate": 0.0002,
      "loss": 1.6556,
      "step": 50
    },
    {
      "epoch": 0.10877626699629171,
      "grad_norm": 3.953125,
      "learning_rate": 0.0002,
      "loss": 1.5998,
      "step": 55
    },
    {
      "epoch": 0.11866501854140915,
      "grad_norm": 3.21875,
      "learning_rate": 0.0002,
      "loss": 1.5765,
      "step": 60
    },
    {
      "epoch": 0.12855377008652658,
      "grad_norm": 3.71875,
      "learning_rate": 0.0002,
      "loss": 1.5169,
      "step": 65
    },
    {
      "epoch": 0.138442521631644,
      "grad_norm": 3.359375,
      "learning_rate": 0.0002,
      "loss": 1.582,
      "step": 70
    },
    {
      "epoch": 0.14833127317676142,
      "grad_norm": 4.96875,
      "learning_rate": 0.0002,
      "loss": 1.5751,
      "step": 75
    },
    {
      "epoch": 0.15822002472187885,
      "grad_norm": 3.03125,
      "learning_rate": 0.0002,
      "loss": 1.6266,
      "step": 80
    },
    {
      "epoch": 0.1681087762669963,
      "grad_norm": 3.640625,
      "learning_rate": 0.0002,
      "loss": 1.508,
      "step": 85
    },
    {
      "epoch": 0.17799752781211373,
      "grad_norm": 2.765625,
      "learning_rate": 0.0002,
      "loss": 1.5031,
      "step": 90
    },
    {
      "epoch": 0.18788627935723115,
      "grad_norm": 2.734375,
      "learning_rate": 0.0002,
      "loss": 1.5231,
      "step": 95
    },
    {
      "epoch": 0.19777503090234858,
      "grad_norm": 8.3125,
      "learning_rate": 0.0002,
      "loss": 1.6393,
      "step": 100
    },
    {
      "epoch": 0.207663782447466,
      "grad_norm": 145.0,
      "learning_rate": 0.0002,
      "loss": 1.7596,
      "step": 105
    },
    {
      "epoch": 0.21755253399258342,
      "grad_norm": 2.578125,
      "learning_rate": 0.0002,
      "loss": 1.5453,
      "step": 110
    },
    {
      "epoch": 0.22744128553770088,
      "grad_norm": 3.375,
      "learning_rate": 0.0002,
      "loss": 1.5337,
      "step": 115
    },
    {
      "epoch": 0.2373300370828183,
      "grad_norm": 15.375,
      "learning_rate": 0.0002,
      "loss": 1.6691,
      "step": 120
    },
    {
      "epoch": 0.24721878862793573,
      "grad_norm": 6.5625,
      "learning_rate": 0.0002,
      "loss": 1.5903,
      "step": 125
    },
    {
      "epoch": 0.25710754017305315,
      "grad_norm": 2.953125,
      "learning_rate": 0.0002,
      "loss": 1.4876,
      "step": 130
    },
    {
      "epoch": 0.2669962917181706,
      "grad_norm": 24.875,
      "learning_rate": 0.0002,
      "loss": 1.6292,
      "step": 135
    },
    {
      "epoch": 0.276885043263288,
      "grad_norm": 3.03125,
      "learning_rate": 0.0002,
      "loss": 1.5652,
      "step": 140
    },
    {
      "epoch": 0.2867737948084054,
      "grad_norm": 3.03125,
      "learning_rate": 0.0002,
      "loss": 1.5782,
      "step": 145
    },
    {
      "epoch": 0.29666254635352285,
      "grad_norm": 3.828125,
      "learning_rate": 0.0002,
      "loss": 1.5251,
      "step": 150
    },
    {
      "epoch": 0.3065512978986403,
      "grad_norm": 4.71875,
      "learning_rate": 0.0002,
      "loss": 1.4326,
      "step": 155
    },
    {
      "epoch": 0.3164400494437577,
      "grad_norm": 2.375,
      "learning_rate": 0.0002,
      "loss": 1.5085,
      "step": 160
    },
    {
      "epoch": 0.3263288009888752,
      "grad_norm": 3.84375,
      "learning_rate": 0.0002,
      "loss": 1.482,
      "step": 165
    },
    {
      "epoch": 0.3362175525339926,
      "grad_norm": 2.328125,
      "learning_rate": 0.0002,
      "loss": 1.516,
      "step": 170
    },
    {
      "epoch": 0.34610630407911,
      "grad_norm": 2.453125,
      "learning_rate": 0.0002,
      "loss": 1.5091,
      "step": 175
    },
    {
      "epoch": 0.35599505562422745,
      "grad_norm": 2.625,
      "learning_rate": 0.0002,
      "loss": 1.4517,
      "step": 180
    },
    {
      "epoch": 0.3658838071693449,
      "grad_norm": 2.84375,
      "learning_rate": 0.0002,
      "loss": 1.4647,
      "step": 185
    },
    {
      "epoch": 0.3757725587144623,
      "grad_norm": 2.734375,
      "learning_rate": 0.0002,
      "loss": 1.4764,
      "step": 190
    },
    {
      "epoch": 0.3856613102595797,
      "grad_norm": 2.15625,
      "learning_rate": 0.0002,
      "loss": 1.4254,
      "step": 195
    },
    {
      "epoch": 0.39555006180469715,
      "grad_norm": 2.34375,
      "learning_rate": 0.0002,
      "loss": 1.4868,
      "step": 200
    },
    {
      "epoch": 0.4054388133498146,
      "grad_norm": 2.609375,
      "learning_rate": 0.0002,
      "loss": 1.4534,
      "step": 205
    },
    {
      "epoch": 0.415327564894932,
      "grad_norm": 2.34375,
      "learning_rate": 0.0002,
      "loss": 1.447,
      "step": 210
    },
    {
      "epoch": 0.4252163164400494,
      "grad_norm": 2.3125,
      "learning_rate": 0.0002,
      "loss": 1.4514,
      "step": 215
    },
    {
      "epoch": 0.43510506798516685,
      "grad_norm": 2.109375,
      "learning_rate": 0.0002,
      "loss": 1.4372,
      "step": 220
    },
    {
      "epoch": 0.44499381953028433,
      "grad_norm": 2.859375,
      "learning_rate": 0.0002,
      "loss": 1.3961,
      "step": 225
    },
    {
      "epoch": 0.45488257107540175,
      "grad_norm": 3.0625,
      "learning_rate": 0.0002,
      "loss": 1.4363,
      "step": 230
    },
    {
      "epoch": 0.4647713226205192,
      "grad_norm": 2.703125,
      "learning_rate": 0.0002,
      "loss": 1.4355,
      "step": 235
    },
    {
      "epoch": 0.4746600741656366,
      "grad_norm": 2.8125,
      "learning_rate": 0.0002,
      "loss": 1.525,
      "step": 240
    },
    {
      "epoch": 0.484548825710754,
      "grad_norm": 2.46875,
      "learning_rate": 0.0002,
      "loss": 1.3801,
      "step": 245
    },
    {
      "epoch": 0.49443757725587145,
      "grad_norm": 2.40625,
      "learning_rate": 0.0002,
      "loss": 1.5133,
      "step": 250
    },
    {
      "epoch": 0.5043263288009888,
      "grad_norm": 2.328125,
      "learning_rate": 0.0002,
      "loss": 1.4364,
      "step": 255
    },
    {
      "epoch": 0.5142150803461063,
      "grad_norm": 2.3125,
      "learning_rate": 0.0002,
      "loss": 1.4363,
      "step": 260
    },
    {
      "epoch": 0.5241038318912238,
      "grad_norm": 2.75,
      "learning_rate": 0.0002,
      "loss": 1.521,
      "step": 265
    },
    {
      "epoch": 0.5339925834363412,
      "grad_norm": 2.4375,
      "learning_rate": 0.0002,
      "loss": 1.4559,
      "step": 270
    },
    {
      "epoch": 0.5438813349814586,
      "grad_norm": 2.5,
      "learning_rate": 0.0002,
      "loss": 1.4516,
      "step": 275
    },
    {
      "epoch": 0.553770086526576,
      "grad_norm": 2.125,
      "learning_rate": 0.0002,
      "loss": 1.3968,
      "step": 280
    },
    {
      "epoch": 0.5636588380716935,
      "grad_norm": 2.328125,
      "learning_rate": 0.0002,
      "loss": 1.4183,
      "step": 285
    },
    {
      "epoch": 0.5735475896168108,
      "grad_norm": 1.8046875,
      "learning_rate": 0.0002,
      "loss": 1.3653,
      "step": 290
    },
    {
      "epoch": 0.5834363411619283,
      "grad_norm": 2.0,
      "learning_rate": 0.0002,
      "loss": 1.388,
      "step": 295
    },
    {
      "epoch": 0.5933250927070457,
      "grad_norm": 1.953125,
      "learning_rate": 0.0002,
      "loss": 1.3131,
      "step": 300
    },
    {
      "epoch": 0.6032138442521632,
      "grad_norm": 2.390625,
      "learning_rate": 0.0002,
      "loss": 1.4784,
      "step": 305
    },
    {
      "epoch": 0.6131025957972805,
      "grad_norm": 2.34375,
      "learning_rate": 0.0002,
      "loss": 1.4339,
      "step": 310
    },
    {
      "epoch": 0.622991347342398,
      "grad_norm": 2.140625,
      "learning_rate": 0.0002,
      "loss": 1.4425,
      "step": 315
    },
    {
      "epoch": 0.6328800988875154,
      "grad_norm": 2.078125,
      "learning_rate": 0.0002,
      "loss": 1.3847,
      "step": 320
    },
    {
      "epoch": 0.6427688504326329,
      "grad_norm": 2.359375,
      "learning_rate": 0.0002,
      "loss": 1.4252,
      "step": 325
    },
    {
      "epoch": 0.6526576019777504,
      "grad_norm": 2.640625,
      "learning_rate": 0.0002,
      "loss": 1.4253,
      "step": 330
    },
    {
      "epoch": 0.6625463535228677,
      "grad_norm": 2.015625,
      "learning_rate": 0.0002,
      "loss": 1.3336,
      "step": 335
    },
    {
      "epoch": 0.6724351050679852,
      "grad_norm": 2.171875,
      "learning_rate": 0.0002,
      "loss": 1.4209,
      "step": 340
    },
    {
      "epoch": 0.6823238566131026,
      "grad_norm": 4.0625,
      "learning_rate": 0.0002,
      "loss": 1.4485,
      "step": 345
    },
    {
      "epoch": 0.69221260815822,
      "grad_norm": 2.1875,
      "learning_rate": 0.0002,
      "loss": 1.4261,
      "step": 350
    },
    {
      "epoch": 0.7021013597033374,
      "grad_norm": 2.515625,
      "learning_rate": 0.0002,
      "loss": 1.3869,
      "step": 355
    },
    {
      "epoch": 0.7119901112484549,
      "grad_norm": 2.1875,
      "learning_rate": 0.0002,
      "loss": 1.4681,
      "step": 360
    },
    {
      "epoch": 0.7218788627935723,
      "grad_norm": 3.609375,
      "learning_rate": 0.0002,
      "loss": 1.3513,
      "step": 365
    },
    {
      "epoch": 0.7317676143386898,
      "grad_norm": 2.296875,
      "learning_rate": 0.0002,
      "loss": 1.4029,
      "step": 370
    },
    {
      "epoch": 0.7416563658838071,
      "grad_norm": 2.15625,
      "learning_rate": 0.0002,
      "loss": 1.3716,
      "step": 375
    },
    {
      "epoch": 0.7515451174289246,
      "grad_norm": 3.75,
      "learning_rate": 0.0002,
      "loss": 1.3902,
      "step": 380
    },
    {
      "epoch": 0.761433868974042,
      "grad_norm": 2.0625,
      "learning_rate": 0.0002,
      "loss": 1.3469,
      "step": 385
    },
    {
      "epoch": 0.7713226205191595,
      "grad_norm": 1.9609375,
      "learning_rate": 0.0002,
      "loss": 1.3216,
      "step": 390
    },
    {
      "epoch": 0.7812113720642769,
      "grad_norm": 1.6796875,
      "learning_rate": 0.0002,
      "loss": 1.4774,
      "step": 395
    },
    {
      "epoch": 0.7911001236093943,
      "grad_norm": 1.7265625,
      "learning_rate": 0.0002,
      "loss": 1.372,
      "step": 400
    },
    {
      "epoch": 0.8009888751545118,
      "grad_norm": 1.921875,
      "learning_rate": 0.0002,
      "loss": 1.3574,
      "step": 405
    },
    {
      "epoch": 0.8108776266996292,
      "grad_norm": 2.1875,
      "learning_rate": 0.0002,
      "loss": 1.4444,
      "step": 410
    },
    {
      "epoch": 0.8207663782447466,
      "grad_norm": 2.296875,
      "learning_rate": 0.0002,
      "loss": 1.3743,
      "step": 415
    },
    {
      "epoch": 0.830655129789864,
      "grad_norm": 2.125,
      "learning_rate": 0.0002,
      "loss": 1.3589,
      "step": 420
    },
    {
      "epoch": 0.8405438813349815,
      "grad_norm": 2.046875,
      "learning_rate": 0.0002,
      "loss": 1.3336,
      "step": 425
    },
    {
      "epoch": 0.8504326328800988,
      "grad_norm": 3.84375,
      "learning_rate": 0.0002,
      "loss": 1.3584,
      "step": 430
    },
    {
      "epoch": 0.8603213844252163,
      "grad_norm": 2.125,
      "learning_rate": 0.0002,
      "loss": 1.332,
      "step": 435
    },
    {
      "epoch": 0.8702101359703337,
      "grad_norm": 1.8828125,
      "learning_rate": 0.0002,
      "loss": 1.3678,
      "step": 440
    },
    {
      "epoch": 0.8800988875154512,
      "grad_norm": 2.0625,
      "learning_rate": 0.0002,
      "loss": 1.2967,
      "step": 445
    },
    {
      "epoch": 0.8899876390605687,
      "grad_norm": 2.3125,
      "learning_rate": 0.0002,
      "loss": 1.3276,
      "step": 450
    },
    {
      "epoch": 0.899876390605686,
      "grad_norm": 1.953125,
      "learning_rate": 0.0002,
      "loss": 1.3175,
      "step": 455
    },
    {
      "epoch": 0.9097651421508035,
      "grad_norm": 1.7734375,
      "learning_rate": 0.0002,
      "loss": 1.366,
      "step": 460
    },
    {
      "epoch": 0.9196538936959209,
      "grad_norm": 2.46875,
      "learning_rate": 0.0002,
      "loss": 1.3105,
      "step": 465
    },
    {
      "epoch": 0.9295426452410384,
      "grad_norm": 1.84375,
      "learning_rate": 0.0002,
      "loss": 1.2856,
      "step": 470
    },
    {
      "epoch": 0.9394313967861557,
      "grad_norm": 1.7890625,
      "learning_rate": 0.0002,
      "loss": 1.3662,
      "step": 475
    },
    {
      "epoch": 0.9493201483312732,
      "grad_norm": 1.921875,
      "learning_rate": 0.0002,
      "loss": 1.3289,
      "step": 480
    },
    {
      "epoch": 0.9592088998763906,
      "grad_norm": 2.078125,
      "learning_rate": 0.0002,
      "loss": 1.3812,
      "step": 485
    },
    {
      "epoch": 0.969097651421508,
      "grad_norm": 2.703125,
      "learning_rate": 0.0002,
      "loss": 1.3344,
      "step": 490
    },
    {
      "epoch": 0.9789864029666254,
      "grad_norm": 1.9609375,
      "learning_rate": 0.0002,
      "loss": 1.345,
      "step": 495
    },
    {
      "epoch": 0.9888751545117429,
      "grad_norm": 2.125,
      "learning_rate": 0.0002,
      "loss": 1.3163,
      "step": 500
    },
    {
      "epoch": 0.9987639060568603,
      "grad_norm": 2.125,
      "learning_rate": 0.0002,
      "loss": 1.354,
      "step": 505
    },
    {
      "epoch": 0.9987639060568603,
      "step": 505,
      "total_flos": 9435624701952000.0,
      "train_loss": 1.491983689412032,
      "train_runtime": 449.9567,
      "train_samples_per_second": 17.977,
      "train_steps_per_second": 1.122
    }
  ],
  "logging_steps": 5,
  "max_steps": 505,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9435624701952000.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}