| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 700, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014295925661186561, |
| "grad_norm": 16.642337799072266, |
| "learning_rate": 1.9742857142857144e-05, |
| "loss": 4.2448, |
| "mean_token_accuracy": 0.44755197104532274, |
| "num_tokens": 63714.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.028591851322373123, |
| "grad_norm": 12.869135856628418, |
| "learning_rate": 1.945714285714286e-05, |
| "loss": 1.9287, |
| "mean_token_accuracy": 0.5766903940588236, |
| "num_tokens": 128528.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04288777698355969, |
| "grad_norm": 18.376684188842773, |
| "learning_rate": 1.9171428571428573e-05, |
| "loss": 1.6956, |
| "mean_token_accuracy": 0.597195016592741, |
| "num_tokens": 191200.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.057183702644746245, |
| "grad_norm": 17.71656036376953, |
| "learning_rate": 1.888571428571429e-05, |
| "loss": 1.6076, |
| "mean_token_accuracy": 0.6067132025957107, |
| "num_tokens": 255728.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07147962830593281, |
| "grad_norm": 21.026283264160156, |
| "learning_rate": 1.86e-05, |
| "loss": 1.5728, |
| "mean_token_accuracy": 0.612850959226489, |
| "num_tokens": 319058.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.08577555396711938, |
| "grad_norm": 46.10198974609375, |
| "learning_rate": 1.8314285714285714e-05, |
| "loss": 1.5977, |
| "mean_token_accuracy": 0.6111391615122557, |
| "num_tokens": 384900.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10007147962830593, |
| "grad_norm": 14.742942810058594, |
| "learning_rate": 1.802857142857143e-05, |
| "loss": 1.5649, |
| "mean_token_accuracy": 0.6097237385809422, |
| "num_tokens": 450346.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.11436740528949249, |
| "grad_norm": 43.62748718261719, |
| "learning_rate": 1.7742857142857143e-05, |
| "loss": 1.5184, |
| "mean_token_accuracy": 0.6210372049361468, |
| "num_tokens": 515018.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.12866333095067906, |
| "grad_norm": 15.469511032104492, |
| "learning_rate": 1.745714285714286e-05, |
| "loss": 1.4736, |
| "mean_token_accuracy": 0.6270900748670101, |
| "num_tokens": 576955.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.14295925661186562, |
| "grad_norm": 19.448793411254883, |
| "learning_rate": 1.717142857142857e-05, |
| "loss": 1.4637, |
| "mean_token_accuracy": 0.6368957210332156, |
| "num_tokens": 641295.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.15725518227305219, |
| "grad_norm": 37.31778335571289, |
| "learning_rate": 1.6885714285714288e-05, |
| "loss": 1.5303, |
| "mean_token_accuracy": 0.6210926879197359, |
| "num_tokens": 706683.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.17155110793423875, |
| "grad_norm": 9.722342491149902, |
| "learning_rate": 1.66e-05, |
| "loss": 1.4596, |
| "mean_token_accuracy": 0.6298462159931659, |
| "num_tokens": 771285.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.18584703359542531, |
| "grad_norm": 9.656769752502441, |
| "learning_rate": 1.6314285714285716e-05, |
| "loss": 1.5281, |
| "mean_token_accuracy": 0.6251190695911646, |
| "num_tokens": 840678.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.20014295925661185, |
| "grad_norm": 9.608354568481445, |
| "learning_rate": 1.602857142857143e-05, |
| "loss": 1.4438, |
| "mean_token_accuracy": 0.6370445918291807, |
| "num_tokens": 905832.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.21443888491779842, |
| "grad_norm": 9.842904090881348, |
| "learning_rate": 1.5742857142857145e-05, |
| "loss": 1.5379, |
| "mean_token_accuracy": 0.6172758720815181, |
| "num_tokens": 972946.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.22873481057898498, |
| "grad_norm": 18.17994499206543, |
| "learning_rate": 1.545714285714286e-05, |
| "loss": 1.4322, |
| "mean_token_accuracy": 0.6351331725716591, |
| "num_tokens": 1034427.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.24303073624017155, |
| "grad_norm": 8.876994132995605, |
| "learning_rate": 1.5171428571428572e-05, |
| "loss": 1.4343, |
| "mean_token_accuracy": 0.6313620086759328, |
| "num_tokens": 1101359.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2573266619013581, |
| "grad_norm": 10.895979881286621, |
| "learning_rate": 1.4885714285714288e-05, |
| "loss": 1.4537, |
| "mean_token_accuracy": 0.633945481479168, |
| "num_tokens": 1166538.0, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.27162258756254465, |
| "grad_norm": 12.30453872680664, |
| "learning_rate": 1.46e-05, |
| "loss": 1.5363, |
| "mean_token_accuracy": 0.6255367647856473, |
| "num_tokens": 1235118.0, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.28591851322373124, |
| "grad_norm": 10.28065299987793, |
| "learning_rate": 1.4314285714285717e-05, |
| "loss": 1.4199, |
| "mean_token_accuracy": 0.635765865072608, |
| "num_tokens": 1300601.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3002144388849178, |
| "grad_norm": 11.893675804138184, |
| "learning_rate": 1.402857142857143e-05, |
| "loss": 1.3881, |
| "mean_token_accuracy": 0.6449477795511485, |
| "num_tokens": 1362732.0, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.31451036454610437, |
| "grad_norm": 11.485602378845215, |
| "learning_rate": 1.3742857142857144e-05, |
| "loss": 1.3647, |
| "mean_token_accuracy": 0.6486451178789139, |
| "num_tokens": 1424780.0, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.3288062902072909, |
| "grad_norm": 8.882689476013184, |
| "learning_rate": 1.3457142857142858e-05, |
| "loss": 1.3915, |
| "mean_token_accuracy": 0.6436454936861992, |
| "num_tokens": 1490808.0, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.3431022158684775, |
| "grad_norm": 18.272981643676758, |
| "learning_rate": 1.3171428571428573e-05, |
| "loss": 1.4796, |
| "mean_token_accuracy": 0.6283955980092287, |
| "num_tokens": 1556933.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.35739814152966404, |
| "grad_norm": 11.947668075561523, |
| "learning_rate": 1.2885714285714285e-05, |
| "loss": 1.4398, |
| "mean_token_accuracy": 0.638329004868865, |
| "num_tokens": 1621053.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.37169406719085063, |
| "grad_norm": 23.547773361206055, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 1.3801, |
| "mean_token_accuracy": 0.6448216594755649, |
| "num_tokens": 1686747.0, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.38598999285203717, |
| "grad_norm": 16.82425880432129, |
| "learning_rate": 1.2314285714285716e-05, |
| "loss": 1.3891, |
| "mean_token_accuracy": 0.6475608512759209, |
| "num_tokens": 1751946.0, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.4002859185132237, |
| "grad_norm": 11.931357383728027, |
| "learning_rate": 1.202857142857143e-05, |
| "loss": 1.3768, |
| "mean_token_accuracy": 0.6439446356147528, |
| "num_tokens": 1816123.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.4145818441744103, |
| "grad_norm": 14.375319480895996, |
| "learning_rate": 1.1742857142857144e-05, |
| "loss": 1.315, |
| "mean_token_accuracy": 0.6517576463520527, |
| "num_tokens": 1879227.0, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.42887776983559683, |
| "grad_norm": 10.699817657470703, |
| "learning_rate": 1.1457142857142857e-05, |
| "loss": 1.3519, |
| "mean_token_accuracy": 0.6487406313419342, |
| "num_tokens": 1944238.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4431736954967834, |
| "grad_norm": 12.067941665649414, |
| "learning_rate": 1.1171428571428573e-05, |
| "loss": 1.2784, |
| "mean_token_accuracy": 0.6627866499125957, |
| "num_tokens": 2007629.0, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.45746962115796996, |
| "grad_norm": 15.550559997558594, |
| "learning_rate": 1.0885714285714286e-05, |
| "loss": 1.3495, |
| "mean_token_accuracy": 0.6514371998608113, |
| "num_tokens": 2076666.0, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.47176554681915656, |
| "grad_norm": 30.000173568725586, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 1.3358, |
| "mean_token_accuracy": 0.6507035464048385, |
| "num_tokens": 2140860.0, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.4860614724803431, |
| "grad_norm": 7.962319850921631, |
| "learning_rate": 1.0314285714285715e-05, |
| "loss": 1.3231, |
| "mean_token_accuracy": 0.6570919144898653, |
| "num_tokens": 2204773.0, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5003573981415297, |
| "grad_norm": 24.023008346557617, |
| "learning_rate": 1.002857142857143e-05, |
| "loss": 1.3936, |
| "mean_token_accuracy": 0.6455658808350563, |
| "num_tokens": 2270923.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5146533238027162, |
| "grad_norm": 8.74783706665039, |
| "learning_rate": 9.742857142857143e-06, |
| "loss": 1.3383, |
| "mean_token_accuracy": 0.6552599217742682, |
| "num_tokens": 2337009.0, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5289492494639028, |
| "grad_norm": 17.01344108581543, |
| "learning_rate": 9.457142857142858e-06, |
| "loss": 1.3524, |
| "mean_token_accuracy": 0.6488417606800795, |
| "num_tokens": 2405973.0, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5432451751250893, |
| "grad_norm": 9.353411674499512, |
| "learning_rate": 9.171428571428572e-06, |
| "loss": 1.2638, |
| "mean_token_accuracy": 0.6636200629174709, |
| "num_tokens": 2469824.0, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.557541100786276, |
| "grad_norm": 13.265799522399902, |
| "learning_rate": 8.885714285714286e-06, |
| "loss": 1.2254, |
| "mean_token_accuracy": 0.6689145911484957, |
| "num_tokens": 2535167.0, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.5718370264474625, |
| "grad_norm": 19.46824836730957, |
| "learning_rate": 8.6e-06, |
| "loss": 1.3844, |
| "mean_token_accuracy": 0.6483918268233537, |
| "num_tokens": 2607759.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.586132952108649, |
| "grad_norm": 15.773782730102539, |
| "learning_rate": 8.314285714285715e-06, |
| "loss": 1.2708, |
| "mean_token_accuracy": 0.6655503377318382, |
| "num_tokens": 2670580.0, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6004288777698356, |
| "grad_norm": 8.917901039123535, |
| "learning_rate": 8.02857142857143e-06, |
| "loss": 1.2726, |
| "mean_token_accuracy": 0.6589578501880169, |
| "num_tokens": 2737272.0, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6147248034310222, |
| "grad_norm": 8.988587379455566, |
| "learning_rate": 7.742857142857144e-06, |
| "loss": 1.221, |
| "mean_token_accuracy": 0.664219357818365, |
| "num_tokens": 2803875.0, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6290207290922087, |
| "grad_norm": 12.661059379577637, |
| "learning_rate": 7.457142857142857e-06, |
| "loss": 1.2658, |
| "mean_token_accuracy": 0.662236025184393, |
| "num_tokens": 2869457.0, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6433166547533953, |
| "grad_norm": 8.545147895812988, |
| "learning_rate": 7.1714285714285725e-06, |
| "loss": 1.2778, |
| "mean_token_accuracy": 0.6622273363173008, |
| "num_tokens": 2931790.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6576125804145818, |
| "grad_norm": 20.769514083862305, |
| "learning_rate": 6.885714285714287e-06, |
| "loss": 1.2951, |
| "mean_token_accuracy": 0.6606701787561178, |
| "num_tokens": 2997229.0, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.6719085060757684, |
| "grad_norm": 12.466110229492188, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 1.1754, |
| "mean_token_accuracy": 0.6822692640125751, |
| "num_tokens": 3063485.0, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.686204431736955, |
| "grad_norm": 8.45051383972168, |
| "learning_rate": 6.314285714285715e-06, |
| "loss": 1.2102, |
| "mean_token_accuracy": 0.6759132348001003, |
| "num_tokens": 3127984.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7005003573981415, |
| "grad_norm": 12.029594421386719, |
| "learning_rate": 6.028571428571429e-06, |
| "loss": 1.3355, |
| "mean_token_accuracy": 0.6649406619369984, |
| "num_tokens": 3194219.0, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7147962830593281, |
| "grad_norm": 8.824553489685059, |
| "learning_rate": 5.742857142857143e-06, |
| "loss": 1.2317, |
| "mean_token_accuracy": 0.6705160938203335, |
| "num_tokens": 3259068.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7290922087205146, |
| "grad_norm": 16.150766372680664, |
| "learning_rate": 5.457142857142858e-06, |
| "loss": 1.1558, |
| "mean_token_accuracy": 0.6850677601993084, |
| "num_tokens": 3324070.0, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7433881343817013, |
| "grad_norm": 7.721499919891357, |
| "learning_rate": 5.171428571428571e-06, |
| "loss": 1.168, |
| "mean_token_accuracy": 0.6747931383550168, |
| "num_tokens": 3386885.0, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7576840600428878, |
| "grad_norm": 9.311972618103027, |
| "learning_rate": 4.885714285714286e-06, |
| "loss": 1.1645, |
| "mean_token_accuracy": 0.6775478422641754, |
| "num_tokens": 3448602.0, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.7719799857040743, |
| "grad_norm": 9.636552810668945, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 1.2542, |
| "mean_token_accuracy": 0.6680241461843253, |
| "num_tokens": 3516481.0, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.7862759113652609, |
| "grad_norm": 36.31599044799805, |
| "learning_rate": 4.314285714285714e-06, |
| "loss": 1.1866, |
| "mean_token_accuracy": 0.6768352195620537, |
| "num_tokens": 3580217.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8005718370264474, |
| "grad_norm": 7.471230506896973, |
| "learning_rate": 4.028571428571429e-06, |
| "loss": 1.1705, |
| "mean_token_accuracy": 0.6818295098841191, |
| "num_tokens": 3643021.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8148677626876341, |
| "grad_norm": 48.099830627441406, |
| "learning_rate": 3.742857142857143e-06, |
| "loss": 1.1602, |
| "mean_token_accuracy": 0.6852999441325665, |
| "num_tokens": 3710116.0, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8291636883488206, |
| "grad_norm": 13.096914291381836, |
| "learning_rate": 3.4571428571428574e-06, |
| "loss": 1.1942, |
| "mean_token_accuracy": 0.6752621583640576, |
| "num_tokens": 3775926.0, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8434596140100071, |
| "grad_norm": 11.580378532409668, |
| "learning_rate": 3.1714285714285714e-06, |
| "loss": 1.1277, |
| "mean_token_accuracy": 0.6849311918020249, |
| "num_tokens": 3840218.0, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.8577555396711937, |
| "grad_norm": 9.58252239227295, |
| "learning_rate": 2.885714285714286e-06, |
| "loss": 1.187, |
| "mean_token_accuracy": 0.6740429483354091, |
| "num_tokens": 3904300.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.8720514653323803, |
| "grad_norm": 9.778560638427734, |
| "learning_rate": 2.6e-06, |
| "loss": 1.2088, |
| "mean_token_accuracy": 0.6759266927838326, |
| "num_tokens": 3970409.0, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.8863473909935669, |
| "grad_norm": 9.931038856506348, |
| "learning_rate": 2.3142857142857145e-06, |
| "loss": 1.1766, |
| "mean_token_accuracy": 0.6766778022050858, |
| "num_tokens": 4038742.0, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9006433166547534, |
| "grad_norm": 7.126023769378662, |
| "learning_rate": 2.028571428571429e-06, |
| "loss": 1.0968, |
| "mean_token_accuracy": 0.6913008309900761, |
| "num_tokens": 4103374.0, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9149392423159399, |
| "grad_norm": 7.73612642288208, |
| "learning_rate": 1.7428571428571432e-06, |
| "loss": 1.1254, |
| "mean_token_accuracy": 0.6863209947943687, |
| "num_tokens": 4170239.0, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.9292351679771265, |
| "grad_norm": 6.532904148101807, |
| "learning_rate": 1.4571428571428573e-06, |
| "loss": 1.1586, |
| "mean_token_accuracy": 0.6804635964334012, |
| "num_tokens": 4237810.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.9435310936383131, |
| "grad_norm": 7.370081901550293, |
| "learning_rate": 1.1714285714285715e-06, |
| "loss": 1.174, |
| "mean_token_accuracy": 0.6809860028326511, |
| "num_tokens": 4302937.0, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.9578270192994996, |
| "grad_norm": 7.471885681152344, |
| "learning_rate": 8.857142857142857e-07, |
| "loss": 1.1755, |
| "mean_token_accuracy": 0.6858656518161297, |
| "num_tokens": 4368704.0, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.9721229449606862, |
| "grad_norm": 9.739863395690918, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 1.1052, |
| "mean_token_accuracy": 0.6904201626777648, |
| "num_tokens": 4431384.0, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.9864188706218727, |
| "grad_norm": 11.182050704956055, |
| "learning_rate": 3.1428571428571433e-07, |
| "loss": 1.1422, |
| "mean_token_accuracy": 0.688240597397089, |
| "num_tokens": 4500182.0, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 11.066879272460938, |
| "learning_rate": 2.8571428571428575e-08, |
| "loss": 1.131, |
| "mean_token_accuracy": 0.6868848518321389, |
| "num_tokens": 4559091.0, |
| "step": 700 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 700, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4791381278720.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|