{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5970731707317074, "eval_steps": 500, "global_step": 1330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01951219512195122, "grad_norm": 0.6663638353347778, "learning_rate": 0.00019934683213585893, "loss": 0.5255, "step": 10 }, { "epoch": 0.03902439024390244, "grad_norm": 0.7442628741264343, "learning_rate": 0.00019804049640757677, "loss": 0.3367, "step": 20 }, { "epoch": 0.05853658536585366, "grad_norm": 0.504467785358429, "learning_rate": 0.0001967341606792946, "loss": 0.3368, "step": 30 }, { "epoch": 0.07804878048780488, "grad_norm": 0.339693546295166, "learning_rate": 0.00019542782495101242, "loss": 0.3064, "step": 40 }, { "epoch": 0.0975609756097561, "grad_norm": 0.267976313829422, "learning_rate": 0.00019412148922273026, "loss": 0.3108, "step": 50 }, { "epoch": 0.11707317073170732, "grad_norm": 0.48640233278274536, "learning_rate": 0.00019281515349444807, "loss": 0.3069, "step": 60 }, { "epoch": 0.13658536585365855, "grad_norm": 0.7269986867904663, "learning_rate": 0.0001915088177661659, "loss": 0.3134, "step": 70 }, { "epoch": 0.15609756097560976, "grad_norm": 0.31376832723617554, "learning_rate": 0.00019020248203788375, "loss": 0.2915, "step": 80 }, { "epoch": 0.17560975609756097, "grad_norm": 0.6166387796401978, "learning_rate": 0.00018889614630960156, "loss": 0.2862, "step": 90 }, { "epoch": 0.1951219512195122, "grad_norm": 0.6621638536453247, "learning_rate": 0.0001875898105813194, "loss": 0.263, "step": 100 }, { "epoch": 0.2146341463414634, "grad_norm": 0.2815336287021637, "learning_rate": 0.00018628347485303724, "loss": 0.2747, "step": 110 }, { "epoch": 0.23414634146341465, "grad_norm": 0.5862469673156738, "learning_rate": 0.00018497713912475508, "loss": 0.2833, "step": 120 }, { "epoch": 0.25365853658536586, "grad_norm": 0.5362260937690735, "learning_rate": 0.00018367080339647292, "loss": 0.2613, "step": 130 }, { "epoch": 0.2731707317073171, "grad_norm": 0.7799074053764343, "learning_rate": 0.00018236446766819073, "loss": 0.2535, "step": 140 }, { "epoch": 0.2926829268292683, "grad_norm": 0.8866592645645142, "learning_rate": 0.00018105813193990857, "loss": 0.2603, "step": 150 }, { "epoch": 0.3121951219512195, "grad_norm": 0.9003716707229614, "learning_rate": 0.00017975179621162638, "loss": 0.27, "step": 160 }, { "epoch": 0.33170731707317075, "grad_norm": 0.5946381092071533, "learning_rate": 0.00017844546048334422, "loss": 0.2572, "step": 170 }, { "epoch": 0.35121951219512193, "grad_norm": 0.8860711455345154, "learning_rate": 0.00017713912475506206, "loss": 0.2839, "step": 180 }, { "epoch": 0.37073170731707317, "grad_norm": 0.8693526983261108, "learning_rate": 0.0001758327890267799, "loss": 0.2477, "step": 190 }, { "epoch": 0.3902439024390244, "grad_norm": 0.9044304490089417, "learning_rate": 0.00017452645329849774, "loss": 0.2674, "step": 200 }, { "epoch": 0.4097560975609756, "grad_norm": 0.5563161969184875, "learning_rate": 0.00017322011757021555, "loss": 0.2436, "step": 210 }, { "epoch": 0.4292682926829268, "grad_norm": 1.1451846361160278, "learning_rate": 0.0001719137818419334, "loss": 0.25, "step": 220 }, { "epoch": 0.44878048780487806, "grad_norm": 0.8895041942596436, "learning_rate": 0.00017060744611365123, "loss": 0.2542, "step": 230 }, { "epoch": 0.4682926829268293, "grad_norm": 0.8991382718086243, "learning_rate": 0.00016930111038536904, "loss": 0.2523, "step": 240 }, { "epoch": 0.4878048780487805, "grad_norm": 
1.0106490850448608, "learning_rate": 0.00016799477465708688, "loss": 0.2554, "step": 250 }, { "epoch": 0.5073170731707317, "grad_norm": 0.5570860505104065, "learning_rate": 0.0001666884389288047, "loss": 0.2431, "step": 260 }, { "epoch": 0.526829268292683, "grad_norm": 1.1715517044067383, "learning_rate": 0.00016538210320052253, "loss": 0.2383, "step": 270 }, { "epoch": 0.5463414634146342, "grad_norm": 0.9527117609977722, "learning_rate": 0.00016407576747224037, "loss": 0.2222, "step": 280 }, { "epoch": 0.5658536585365853, "grad_norm": 1.012949824333191, "learning_rate": 0.0001627694317439582, "loss": 0.2743, "step": 290 }, { "epoch": 0.5853658536585366, "grad_norm": 1.157406210899353, "learning_rate": 0.00016146309601567605, "loss": 0.261, "step": 300 }, { "epoch": 0.6048780487804878, "grad_norm": 1.044209599494934, "learning_rate": 0.00016015676028739386, "loss": 0.2346, "step": 310 }, { "epoch": 0.624390243902439, "grad_norm": 0.8777090311050415, "learning_rate": 0.0001588504245591117, "loss": 0.2734, "step": 320 }, { "epoch": 0.6439024390243903, "grad_norm": 1.159125804901123, "learning_rate": 0.00015754408883082954, "loss": 0.2655, "step": 330 }, { "epoch": 0.6634146341463415, "grad_norm": 0.8555986881256104, "learning_rate": 0.00015623775310254735, "loss": 0.2584, "step": 340 }, { "epoch": 0.6829268292682927, "grad_norm": 0.4655376374721527, "learning_rate": 0.0001549314173742652, "loss": 0.2553, "step": 350 }, { "epoch": 0.7024390243902439, "grad_norm": 0.5464235544204712, "learning_rate": 0.000153625081645983, "loss": 0.2493, "step": 360 }, { "epoch": 0.7219512195121951, "grad_norm": 0.5507020354270935, "learning_rate": 0.00015231874591770084, "loss": 0.2485, "step": 370 }, { "epoch": 0.7414634146341463, "grad_norm": 0.47853443026542664, "learning_rate": 0.00015101241018941868, "loss": 0.2348, "step": 380 }, { "epoch": 0.7609756097560976, "grad_norm": 0.22285830974578857, "learning_rate": 0.00014970607446113652, "loss": 0.2393, "step": 390 }, { "epoch": 0.7804878048780488, "grad_norm": 1.4561814069747925, "learning_rate": 0.00014839973873285436, "loss": 0.2432, "step": 400 }, { "epoch": 0.8, "grad_norm": 0.6247652769088745, "learning_rate": 0.00014709340300457217, "loss": 0.242, "step": 410 }, { "epoch": 0.8195121951219512, "grad_norm": 1.0940589904785156, "learning_rate": 0.00014578706727629, "loss": 0.2632, "step": 420 }, { "epoch": 0.8390243902439024, "grad_norm": 1.2155462503433228, "learning_rate": 0.00014448073154800785, "loss": 0.2432, "step": 430 }, { "epoch": 0.8585365853658536, "grad_norm": 0.3646700978279114, "learning_rate": 0.00014317439581972566, "loss": 0.2342, "step": 440 }, { "epoch": 0.8780487804878049, "grad_norm": 0.3318726122379303, "learning_rate": 0.0001418680600914435, "loss": 0.2454, "step": 450 }, { "epoch": 0.8975609756097561, "grad_norm": 1.2063515186309814, "learning_rate": 0.00014056172436316132, "loss": 0.2475, "step": 460 }, { "epoch": 0.9170731707317074, "grad_norm": 0.7661004066467285, "learning_rate": 0.00013925538863487918, "loss": 0.2635, "step": 470 }, { "epoch": 0.9365853658536586, "grad_norm": 0.2922651469707489, "learning_rate": 0.00013794905290659702, "loss": 0.2235, "step": 480 }, { "epoch": 0.9560975609756097, "grad_norm": 1.2844675779342651, "learning_rate": 0.00013664271717831483, "loss": 0.2294, "step": 490 }, { "epoch": 0.975609756097561, "grad_norm": 0.38282549381256104, "learning_rate": 0.00013533638145003267, "loss": 0.2265, "step": 500 }, { "epoch": 0.9951219512195122, "grad_norm": 0.286447674036026, "learning_rate": 
0.00013403004572175048, "loss": 0.2237, "step": 510 }, { "epoch": 1.015609756097561, "grad_norm": 0.2928523123264313, "learning_rate": 0.00013272370999346832, "loss": 0.2358, "step": 520 }, { "epoch": 1.0351219512195122, "grad_norm": 1.7477126121520996, "learning_rate": 0.00013141737426518616, "loss": 0.194, "step": 530 }, { "epoch": 1.0546341463414635, "grad_norm": 1.4766732454299927, "learning_rate": 0.00013011103853690398, "loss": 0.1828, "step": 540 }, { "epoch": 1.0741463414634147, "grad_norm": 1.7536602020263672, "learning_rate": 0.00012880470280862181, "loss": 0.1936, "step": 550 }, { "epoch": 1.093658536585366, "grad_norm": 1.5118074417114258, "learning_rate": 0.00012749836708033965, "loss": 0.1757, "step": 560 }, { "epoch": 1.1131707317073172, "grad_norm": 0.5326169729232788, "learning_rate": 0.0001261920313520575, "loss": 0.1982, "step": 570 }, { "epoch": 1.1326829268292684, "grad_norm": 1.1812074184417725, "learning_rate": 0.00012488569562377533, "loss": 0.2055, "step": 580 }, { "epoch": 1.1521951219512194, "grad_norm": 1.6071784496307373, "learning_rate": 0.00012357935989549314, "loss": 0.19, "step": 590 }, { "epoch": 1.1717073170731707, "grad_norm": 1.3864187002182007, "learning_rate": 0.00012227302416721098, "loss": 0.2059, "step": 600 }, { "epoch": 1.191219512195122, "grad_norm": 0.2694201171398163, "learning_rate": 0.00012096668843892882, "loss": 0.1966, "step": 610 }, { "epoch": 1.2107317073170731, "grad_norm": 0.7169620990753174, "learning_rate": 0.00011966035271064664, "loss": 0.186, "step": 620 }, { "epoch": 1.2302439024390244, "grad_norm": 1.2233684062957764, "learning_rate": 0.00011835401698236447, "loss": 0.1971, "step": 630 }, { "epoch": 1.2497560975609756, "grad_norm": 0.7872809767723083, "learning_rate": 0.0001170476812540823, "loss": 0.1874, "step": 640 }, { "epoch": 1.2692682926829268, "grad_norm": 1.3358359336853027, "learning_rate": 0.00011574134552580014, "loss": 0.2072, "step": 650 }, { "epoch": 1.288780487804878, "grad_norm": 1.397544264793396, "learning_rate": 0.00011443500979751798, "loss": 0.1797, "step": 660 }, { "epoch": 1.3082926829268293, "grad_norm": 0.4923507571220398, "learning_rate": 0.00011312867406923579, "loss": 0.1791, "step": 670 }, { "epoch": 1.3278048780487806, "grad_norm": 0.9196809530258179, "learning_rate": 0.00011182233834095363, "loss": 0.1931, "step": 680 }, { "epoch": 1.3473170731707316, "grad_norm": 1.4173572063446045, "learning_rate": 0.00011051600261267146, "loss": 0.1958, "step": 690 }, { "epoch": 1.3668292682926828, "grad_norm": 0.23955851793289185, "learning_rate": 0.0001092096668843893, "loss": 0.2053, "step": 700 }, { "epoch": 1.386341463414634, "grad_norm": 1.5932824611663818, "learning_rate": 0.00010790333115610713, "loss": 0.1861, "step": 710 }, { "epoch": 1.4058536585365853, "grad_norm": 1.292710304260254, "learning_rate": 0.00010659699542782495, "loss": 0.1949, "step": 720 }, { "epoch": 1.4253658536585365, "grad_norm": 1.5422075986862183, "learning_rate": 0.0001052906596995428, "loss": 0.1902, "step": 730 }, { "epoch": 1.4448780487804878, "grad_norm": 1.8156495094299316, "learning_rate": 0.00010398432397126061, "loss": 0.1984, "step": 740 }, { "epoch": 1.464390243902439, "grad_norm": 0.6971287727355957, "learning_rate": 0.00010267798824297845, "loss": 0.1865, "step": 750 }, { "epoch": 1.4839024390243902, "grad_norm": 1.6063241958618164, "learning_rate": 0.00010137165251469629, "loss": 0.1771, "step": 760 }, { "epoch": 1.5034146341463415, "grad_norm": 1.5314477682113647, "learning_rate": 0.00010006531678641412, "loss": 
0.2034, "step": 770 }, { "epoch": 1.5229268292682927, "grad_norm": 1.4807705879211426, "learning_rate": 9.875898105813195e-05, "loss": 0.1853, "step": 780 }, { "epoch": 1.542439024390244, "grad_norm": 1.519795536994934, "learning_rate": 9.745264532984978e-05, "loss": 0.1994, "step": 790 }, { "epoch": 1.5619512195121952, "grad_norm": 1.4948278665542603, "learning_rate": 9.61463096015676e-05, "loss": 0.1826, "step": 800 }, { "epoch": 1.5814634146341464, "grad_norm": 1.2426303625106812, "learning_rate": 9.483997387328543e-05, "loss": 0.1968, "step": 810 }, { "epoch": 1.6009756097560977, "grad_norm": 1.4434090852737427, "learning_rate": 9.353363814500327e-05, "loss": 0.2016, "step": 820 }, { "epoch": 1.6204878048780489, "grad_norm": 1.3460036516189575, "learning_rate": 9.222730241672111e-05, "loss": 0.1853, "step": 830 }, { "epoch": 1.6400000000000001, "grad_norm": 1.008080005645752, "learning_rate": 9.092096668843894e-05, "loss": 0.2051, "step": 840 }, { "epoch": 1.6595121951219514, "grad_norm": 0.591911792755127, "learning_rate": 8.961463096015676e-05, "loss": 0.2003, "step": 850 }, { "epoch": 1.6790243902439026, "grad_norm": 0.8856975436210632, "learning_rate": 8.830829523187459e-05, "loss": 0.1994, "step": 860 }, { "epoch": 1.6985365853658536, "grad_norm": 0.24821911752223969, "learning_rate": 8.700195950359243e-05, "loss": 0.1801, "step": 870 }, { "epoch": 1.7180487804878048, "grad_norm": 0.5248575806617737, "learning_rate": 8.569562377531027e-05, "loss": 0.1815, "step": 880 }, { "epoch": 1.737560975609756, "grad_norm": 0.600226104259491, "learning_rate": 8.438928804702809e-05, "loss": 0.1885, "step": 890 }, { "epoch": 1.7570731707317073, "grad_norm": 0.720282793045044, "learning_rate": 8.308295231874592e-05, "loss": 0.1956, "step": 900 }, { "epoch": 1.7765853658536586, "grad_norm": 0.780463695526123, "learning_rate": 8.177661659046376e-05, "loss": 0.1892, "step": 910 }, { "epoch": 1.7960975609756098, "grad_norm": 0.5687152147293091, "learning_rate": 8.047028086218158e-05, "loss": 0.1905, "step": 920 }, { "epoch": 1.8156097560975608, "grad_norm": 0.33384960889816284, "learning_rate": 7.916394513389942e-05, "loss": 0.1854, "step": 930 }, { "epoch": 1.835121951219512, "grad_norm": 0.6061075329780579, "learning_rate": 7.785760940561725e-05, "loss": 0.1808, "step": 940 }, { "epoch": 1.8546341463414633, "grad_norm": 0.2853333353996277, "learning_rate": 7.655127367733507e-05, "loss": 0.1873, "step": 950 }, { "epoch": 1.8741463414634145, "grad_norm": 0.5911210775375366, "learning_rate": 7.524493794905291e-05, "loss": 0.1965, "step": 960 }, { "epoch": 1.8936585365853658, "grad_norm": 0.8281512260437012, "learning_rate": 7.393860222077074e-05, "loss": 0.2078, "step": 970 }, { "epoch": 1.913170731707317, "grad_norm": 1.2774325609207153, "learning_rate": 7.263226649248858e-05, "loss": 0.1943, "step": 980 }, { "epoch": 1.9326829268292682, "grad_norm": 0.544083833694458, "learning_rate": 7.13259307642064e-05, "loss": 0.1892, "step": 990 }, { "epoch": 1.9521951219512195, "grad_norm": 0.6588655114173889, "learning_rate": 7.001959503592423e-05, "loss": 0.1873, "step": 1000 }, { "epoch": 1.9717073170731707, "grad_norm": 0.6782810688018799, "learning_rate": 6.871325930764207e-05, "loss": 0.2181, "step": 1010 }, { "epoch": 1.991219512195122, "grad_norm": 0.27836665511131287, "learning_rate": 6.740692357935991e-05, "loss": 0.1895, "step": 1020 }, { "epoch": 2.0117073170731707, "grad_norm": 1.3147952556610107, "learning_rate": 6.610058785107773e-05, "loss": 0.2023, "step": 1030 }, { "epoch": 
2.031219512195122, "grad_norm": 0.32955631613731384, "learning_rate": 6.479425212279556e-05, "loss": 0.1539, "step": 1040 }, { "epoch": 2.050731707317073, "grad_norm": 0.7191469073295593, "learning_rate": 6.34879163945134e-05, "loss": 0.1522, "step": 1050 }, { "epoch": 2.0702439024390245, "grad_norm": 0.9275949001312256, "learning_rate": 6.218158066623122e-05, "loss": 0.1481, "step": 1060 }, { "epoch": 2.0897560975609757, "grad_norm": 0.37640875577926636, "learning_rate": 6.087524493794906e-05, "loss": 0.1425, "step": 1070 }, { "epoch": 2.109268292682927, "grad_norm": 1.2650576829910278, "learning_rate": 5.956890920966689e-05, "loss": 0.149, "step": 1080 }, { "epoch": 2.128780487804878, "grad_norm": 1.5103838443756104, "learning_rate": 5.826257348138472e-05, "loss": 0.1566, "step": 1090 }, { "epoch": 2.1482926829268294, "grad_norm": 0.9958144426345825, "learning_rate": 5.6956237753102546e-05, "loss": 0.1519, "step": 1100 }, { "epoch": 2.1678048780487806, "grad_norm": 0.3257206082344055, "learning_rate": 5.564990202482038e-05, "loss": 0.1691, "step": 1110 }, { "epoch": 2.187317073170732, "grad_norm": 0.6846854090690613, "learning_rate": 5.434356629653822e-05, "loss": 0.1466, "step": 1120 }, { "epoch": 2.206829268292683, "grad_norm": 0.8696099519729614, "learning_rate": 5.3037230568256044e-05, "loss": 0.1593, "step": 1130 }, { "epoch": 2.2263414634146343, "grad_norm": 0.20721249282360077, "learning_rate": 5.1730894839973876e-05, "loss": 0.1475, "step": 1140 }, { "epoch": 2.2458536585365856, "grad_norm": 0.5807740092277527, "learning_rate": 5.04245591116917e-05, "loss": 0.1525, "step": 1150 }, { "epoch": 2.265365853658537, "grad_norm": 0.6303423643112183, "learning_rate": 4.911822338340954e-05, "loss": 0.1414, "step": 1160 }, { "epoch": 2.2848780487804876, "grad_norm": 0.6723970174789429, "learning_rate": 4.781188765512737e-05, "loss": 0.1286, "step": 1170 }, { "epoch": 2.304390243902439, "grad_norm": 0.5512611269950867, "learning_rate": 4.6505551926845206e-05, "loss": 0.137, "step": 1180 }, { "epoch": 2.32390243902439, "grad_norm": 0.32534515857696533, "learning_rate": 4.519921619856303e-05, "loss": 0.1509, "step": 1190 }, { "epoch": 2.3434146341463413, "grad_norm": 0.37095433473587036, "learning_rate": 4.3892880470280864e-05, "loss": 0.1447, "step": 1200 }, { "epoch": 2.3629268292682926, "grad_norm": 0.35774731636047363, "learning_rate": 4.25865447419987e-05, "loss": 0.1455, "step": 1210 }, { "epoch": 2.382439024390244, "grad_norm": 0.6414694786071777, "learning_rate": 4.128020901371652e-05, "loss": 0.1445, "step": 1220 }, { "epoch": 2.401951219512195, "grad_norm": 0.39603760838508606, "learning_rate": 3.997387328543436e-05, "loss": 0.142, "step": 1230 }, { "epoch": 2.4214634146341463, "grad_norm": 0.4492717981338501, "learning_rate": 3.866753755715219e-05, "loss": 0.132, "step": 1240 }, { "epoch": 2.4409756097560975, "grad_norm": 0.455432265996933, "learning_rate": 3.736120182887003e-05, "loss": 0.1483, "step": 1250 }, { "epoch": 2.4604878048780487, "grad_norm": 0.34108078479766846, "learning_rate": 3.605486610058785e-05, "loss": 0.1353, "step": 1260 }, { "epoch": 2.48, "grad_norm": 0.8474462628364563, "learning_rate": 3.4748530372305685e-05, "loss": 0.1433, "step": 1270 }, { "epoch": 2.499512195121951, "grad_norm": 0.490139365196228, "learning_rate": 3.344219464402352e-05, "loss": 0.1405, "step": 1280 }, { "epoch": 2.5190243902439025, "grad_norm": 0.2619794011116028, "learning_rate": 3.213585891574134e-05, "loss": 0.1431, "step": 1290 }, { "epoch": 2.5385365853658537, "grad_norm": 
0.35025978088378906, "learning_rate": 3.082952318745918e-05, "loss": 0.1314, "step": 1300 }, { "epoch": 2.558048780487805, "grad_norm": 0.6071056127548218, "learning_rate": 2.952318745917701e-05, "loss": 0.1335, "step": 1310 }, { "epoch": 2.577560975609756, "grad_norm": 0.410426527261734, "learning_rate": 2.821685173089484e-05, "loss": 0.1466, "step": 1320 }, { "epoch": 2.5970731707317074, "grad_norm": 0.4310607612133026, "learning_rate": 2.6910516002612673e-05, "loss": 0.1452, "step": 1330 } ], "logging_steps": 10, "max_steps": 1536, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.5120967251922534e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }
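
The JSON above is a Trainer state snapshot; the sketch below is not part of that file. It is a minimal, hypothetical Python example of how one might inspect the `log_history` it contains, assuming the state is saved under the usual name `trainer_state.json` in the checkpoint directory, that matplotlib may or may not be installed, and that `loss_curve.png` is just an illustrative output name.

# Minimal sketch: read trainer_state.json and summarize/plot the logged loss curve.
import json

with open("trainer_state.json") as f:  # adjust the path to your checkpoint dir
    state = json.load(f)

# Each log_history entry carries step, epoch, loss, learning_rate, and grad_norm.
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

print(f"logged {len(steps)} points, last step {steps[-1]} of {state['max_steps']}")
print(f"final loss {losses[-1]:.4f}, final learning rate {lrs[-1]:.2e}")

# Optionally plot the curve if matplotlib is available.
try:
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.plot(steps, losses, label="train loss")
    ax.set_xlabel("step")
    ax.set_ylabel("loss")
    ax.legend()
    fig.savefig("loss_curve.png")
except ImportError:
    pass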