{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.7336585365853656,
  "eval_steps": 500,
  "global_step": 1400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01951219512195122,
      "grad_norm": 0.6663638353347778,
      "learning_rate": 0.00019934683213585893,
      "loss": 0.5255,
      "step": 10
    },
    {
      "epoch": 0.03902439024390244,
      "grad_norm": 0.7442628741264343,
      "learning_rate": 0.00019804049640757677,
      "loss": 0.3367,
      "step": 20
    },
    {
      "epoch": 0.05853658536585366,
      "grad_norm": 0.504467785358429,
      "learning_rate": 0.0001967341606792946,
      "loss": 0.3368,
      "step": 30
    },
    {
      "epoch": 0.07804878048780488,
      "grad_norm": 0.339693546295166,
      "learning_rate": 0.00019542782495101242,
      "loss": 0.3064,
      "step": 40
    },
    {
      "epoch": 0.0975609756097561,
      "grad_norm": 0.267976313829422,
      "learning_rate": 0.00019412148922273026,
      "loss": 0.3108,
      "step": 50
    },
    {
      "epoch": 0.11707317073170732,
      "grad_norm": 0.48640233278274536,
      "learning_rate": 0.00019281515349444807,
      "loss": 0.3069,
      "step": 60
    },
    {
      "epoch": 0.13658536585365855,
      "grad_norm": 0.7269986867904663,
      "learning_rate": 0.0001915088177661659,
      "loss": 0.3134,
      "step": 70
    },
    {
      "epoch": 0.15609756097560976,
      "grad_norm": 0.31376832723617554,
      "learning_rate": 0.00019020248203788375,
      "loss": 0.2915,
      "step": 80
    },
    {
      "epoch": 0.17560975609756097,
      "grad_norm": 0.6166387796401978,
      "learning_rate": 0.00018889614630960156,
      "loss": 0.2862,
      "step": 90
    },
    {
      "epoch": 0.1951219512195122,
      "grad_norm": 0.6621638536453247,
      "learning_rate": 0.0001875898105813194,
      "loss": 0.263,
      "step": 100
    },
    {
      "epoch": 0.2146341463414634,
      "grad_norm": 0.2815336287021637,
      "learning_rate": 0.00018628347485303724,
      "loss": 0.2747,
      "step": 110
    },
    {
      "epoch": 0.23414634146341465,
      "grad_norm": 0.5862469673156738,
      "learning_rate": 0.00018497713912475508,
      "loss": 0.2833,
      "step": 120
    },
    {
      "epoch": 0.25365853658536586,
      "grad_norm": 0.5362260937690735,
      "learning_rate": 0.00018367080339647292,
      "loss": 0.2613,
      "step": 130
    },
    {
      "epoch": 0.2731707317073171,
      "grad_norm": 0.7799074053764343,
      "learning_rate": 0.00018236446766819073,
      "loss": 0.2535,
      "step": 140
    },
    {
      "epoch": 0.2926829268292683,
      "grad_norm": 0.8866592645645142,
      "learning_rate": 0.00018105813193990857,
      "loss": 0.2603,
      "step": 150
    },
    {
      "epoch": 0.3121951219512195,
      "grad_norm": 0.9003716707229614,
      "learning_rate": 0.00017975179621162638,
      "loss": 0.27,
      "step": 160
    },
    {
      "epoch": 0.33170731707317075,
      "grad_norm": 0.5946381092071533,
      "learning_rate": 0.00017844546048334422,
      "loss": 0.2572,
      "step": 170
    },
    {
      "epoch": 0.35121951219512193,
      "grad_norm": 0.8860711455345154,
      "learning_rate": 0.00017713912475506206,
      "loss": 0.2839,
      "step": 180
    },
    {
      "epoch": 0.37073170731707317,
      "grad_norm": 0.8693526983261108,
      "learning_rate": 0.0001758327890267799,
      "loss": 0.2477,
      "step": 190
    },
    {
      "epoch": 0.3902439024390244,
      "grad_norm": 0.9044304490089417,
      "learning_rate": 0.00017452645329849774,
      "loss": 0.2674,
      "step": 200
    },
    {
      "epoch": 0.4097560975609756,
      "grad_norm": 0.5563161969184875,
      "learning_rate": 0.00017322011757021555,
      "loss": 0.2436,
      "step": 210
    },
    {
      "epoch": 0.4292682926829268,
      "grad_norm": 1.1451846361160278,
      "learning_rate": 0.0001719137818419334,
      "loss": 0.25,
      "step": 220
    },
    {
      "epoch": 0.44878048780487806,
      "grad_norm": 0.8895041942596436,
      "learning_rate": 0.00017060744611365123,
      "loss": 0.2542,
      "step": 230
    },
    {
      "epoch": 0.4682926829268293,
      "grad_norm": 0.8991382718086243,
      "learning_rate": 0.00016930111038536904,
      "loss": 0.2523,
      "step": 240
    },
    {
      "epoch": 0.4878048780487805,
      "grad_norm": 1.0106490850448608,
      "learning_rate": 0.00016799477465708688,
      "loss": 0.2554,
      "step": 250
    },
    {
      "epoch": 0.5073170731707317,
      "grad_norm": 0.5570860505104065,
      "learning_rate": 0.0001666884389288047,
      "loss": 0.2431,
      "step": 260
    },
    {
      "epoch": 0.526829268292683,
      "grad_norm": 1.1715517044067383,
      "learning_rate": 0.00016538210320052253,
      "loss": 0.2383,
      "step": 270
    },
    {
      "epoch": 0.5463414634146342,
      "grad_norm": 0.9527117609977722,
      "learning_rate": 0.00016407576747224037,
      "loss": 0.2222,
      "step": 280
    },
    {
      "epoch": 0.5658536585365853,
      "grad_norm": 1.012949824333191,
      "learning_rate": 0.0001627694317439582,
      "loss": 0.2743,
      "step": 290
    },
    {
      "epoch": 0.5853658536585366,
      "grad_norm": 1.157406210899353,
      "learning_rate": 0.00016146309601567605,
      "loss": 0.261,
      "step": 300
    },
    {
      "epoch": 0.6048780487804878,
      "grad_norm": 1.044209599494934,
      "learning_rate": 0.00016015676028739386,
      "loss": 0.2346,
      "step": 310
    },
    {
      "epoch": 0.624390243902439,
      "grad_norm": 0.8777090311050415,
      "learning_rate": 0.0001588504245591117,
      "loss": 0.2734,
      "step": 320
    },
    {
      "epoch": 0.6439024390243903,
      "grad_norm": 1.159125804901123,
      "learning_rate": 0.00015754408883082954,
      "loss": 0.2655,
      "step": 330
    },
    {
      "epoch": 0.6634146341463415,
      "grad_norm": 0.8555986881256104,
      "learning_rate": 0.00015623775310254735,
      "loss": 0.2584,
      "step": 340
    },
    {
      "epoch": 0.6829268292682927,
      "grad_norm": 0.4655376374721527,
      "learning_rate": 0.0001549314173742652,
      "loss": 0.2553,
      "step": 350
    },
    {
      "epoch": 0.7024390243902439,
      "grad_norm": 0.5464235544204712,
      "learning_rate": 0.000153625081645983,
      "loss": 0.2493,
      "step": 360
    },
    {
      "epoch": 0.7219512195121951,
      "grad_norm": 0.5507020354270935,
      "learning_rate": 0.00015231874591770084,
      "loss": 0.2485,
      "step": 370
    },
    {
      "epoch": 0.7414634146341463,
      "grad_norm": 0.47853443026542664,
      "learning_rate": 0.00015101241018941868,
      "loss": 0.2348,
      "step": 380
    },
    {
      "epoch": 0.7609756097560976,
      "grad_norm": 0.22285830974578857,
      "learning_rate": 0.00014970607446113652,
      "loss": 0.2393,
      "step": 390
    },
    {
      "epoch": 0.7804878048780488,
      "grad_norm": 1.4561814069747925,
      "learning_rate": 0.00014839973873285436,
      "loss": 0.2432,
      "step": 400
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.6247652769088745,
      "learning_rate": 0.00014709340300457217,
      "loss": 0.242,
      "step": 410
    },
    {
      "epoch": 0.8195121951219512,
      "grad_norm": 1.0940589904785156,
      "learning_rate": 0.00014578706727629,
      "loss": 0.2632,
      "step": 420
    },
    {
      "epoch": 0.8390243902439024,
      "grad_norm": 1.2155462503433228,
      "learning_rate": 0.00014448073154800785,
      "loss": 0.2432,
      "step": 430
    },
    {
      "epoch": 0.8585365853658536,
      "grad_norm": 0.3646700978279114,
      "learning_rate": 0.00014317439581972566,
      "loss": 0.2342,
      "step": 440
    },
    {
      "epoch": 0.8780487804878049,
      "grad_norm": 0.3318726122379303,
      "learning_rate": 0.0001418680600914435,
      "loss": 0.2454,
      "step": 450
    },
    {
      "epoch": 0.8975609756097561,
      "grad_norm": 1.2063515186309814,
      "learning_rate": 0.00014056172436316132,
      "loss": 0.2475,
      "step": 460
    },
    {
      "epoch": 0.9170731707317074,
      "grad_norm": 0.7661004066467285,
      "learning_rate": 0.00013925538863487918,
      "loss": 0.2635,
      "step": 470
    },
    {
      "epoch": 0.9365853658536586,
      "grad_norm": 0.2922651469707489,
      "learning_rate": 0.00013794905290659702,
      "loss": 0.2235,
      "step": 480
    },
    {
      "epoch": 0.9560975609756097,
      "grad_norm": 1.2844675779342651,
      "learning_rate": 0.00013664271717831483,
      "loss": 0.2294,
      "step": 490
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.38282549381256104,
      "learning_rate": 0.00013533638145003267,
      "loss": 0.2265,
      "step": 500
    },
    {
      "epoch": 0.9951219512195122,
      "grad_norm": 0.286447674036026,
      "learning_rate": 0.00013403004572175048,
      "loss": 0.2237,
      "step": 510
    },
    {
      "epoch": 1.015609756097561,
      "grad_norm": 0.2928523123264313,
      "learning_rate": 0.00013272370999346832,
      "loss": 0.2358,
      "step": 520
    },
    {
      "epoch": 1.0351219512195122,
      "grad_norm": 1.7477126121520996,
      "learning_rate": 0.00013141737426518616,
      "loss": 0.194,
      "step": 530
    },
    {
      "epoch": 1.0546341463414635,
      "grad_norm": 1.4766732454299927,
      "learning_rate": 0.00013011103853690398,
      "loss": 0.1828,
      "step": 540
    },
    {
      "epoch": 1.0741463414634147,
      "grad_norm": 1.7536602020263672,
      "learning_rate": 0.00012880470280862181,
      "loss": 0.1936,
      "step": 550
    },
    {
      "epoch": 1.093658536585366,
      "grad_norm": 1.5118074417114258,
      "learning_rate": 0.00012749836708033965,
      "loss": 0.1757,
      "step": 560
    },
    {
      "epoch": 1.1131707317073172,
      "grad_norm": 0.5326169729232788,
      "learning_rate": 0.0001261920313520575,
      "loss": 0.1982,
      "step": 570
    },
    {
      "epoch": 1.1326829268292684,
      "grad_norm": 1.1812074184417725,
      "learning_rate": 0.00012488569562377533,
      "loss": 0.2055,
      "step": 580
    },
    {
      "epoch": 1.1521951219512194,
      "grad_norm": 1.6071784496307373,
      "learning_rate": 0.00012357935989549314,
      "loss": 0.19,
      "step": 590
    },
    {
      "epoch": 1.1717073170731707,
      "grad_norm": 1.3864187002182007,
      "learning_rate": 0.00012227302416721098,
      "loss": 0.2059,
      "step": 600
    },
    {
      "epoch": 1.191219512195122,
      "grad_norm": 0.2694201171398163,
      "learning_rate": 0.00012096668843892882,
      "loss": 0.1966,
      "step": 610
    },
    {
      "epoch": 1.2107317073170731,
      "grad_norm": 0.7169620990753174,
      "learning_rate": 0.00011966035271064664,
      "loss": 0.186,
      "step": 620
    },
    {
      "epoch": 1.2302439024390244,
      "grad_norm": 1.2233684062957764,
      "learning_rate": 0.00011835401698236447,
      "loss": 0.1971,
      "step": 630
    },
    {
      "epoch": 1.2497560975609756,
      "grad_norm": 0.7872809767723083,
      "learning_rate": 0.0001170476812540823,
      "loss": 0.1874,
      "step": 640
    },
    {
      "epoch": 1.2692682926829268,
      "grad_norm": 1.3358359336853027,
      "learning_rate": 0.00011574134552580014,
      "loss": 0.2072,
      "step": 650
    },
    {
      "epoch": 1.288780487804878,
      "grad_norm": 1.397544264793396,
      "learning_rate": 0.00011443500979751798,
      "loss": 0.1797,
      "step": 660
    },
    {
      "epoch": 1.3082926829268293,
      "grad_norm": 0.4923507571220398,
      "learning_rate": 0.00011312867406923579,
      "loss": 0.1791,
      "step": 670
    },
    {
      "epoch": 1.3278048780487806,
      "grad_norm": 0.9196809530258179,
      "learning_rate": 0.00011182233834095363,
      "loss": 0.1931,
      "step": 680
    },
    {
      "epoch": 1.3473170731707316,
      "grad_norm": 1.4173572063446045,
      "learning_rate": 0.00011051600261267146,
      "loss": 0.1958,
      "step": 690
    },
    {
      "epoch": 1.3668292682926828,
      "grad_norm": 0.23955851793289185,
      "learning_rate": 0.0001092096668843893,
      "loss": 0.2053,
      "step": 700
    },
    {
      "epoch": 1.386341463414634,
      "grad_norm": 1.5932824611663818,
      "learning_rate": 0.00010790333115610713,
      "loss": 0.1861,
      "step": 710
    },
    {
      "epoch": 1.4058536585365853,
      "grad_norm": 1.292710304260254,
      "learning_rate": 0.00010659699542782495,
      "loss": 0.1949,
      "step": 720
    },
    {
      "epoch": 1.4253658536585365,
      "grad_norm": 1.5422075986862183,
      "learning_rate": 0.0001052906596995428,
      "loss": 0.1902,
      "step": 730
    },
    {
      "epoch": 1.4448780487804878,
      "grad_norm": 1.8156495094299316,
      "learning_rate": 0.00010398432397126061,
      "loss": 0.1984,
      "step": 740
    },
    {
      "epoch": 1.464390243902439,
      "grad_norm": 0.6971287727355957,
      "learning_rate": 0.00010267798824297845,
      "loss": 0.1865,
      "step": 750
    },
    {
      "epoch": 1.4839024390243902,
      "grad_norm": 1.6063241958618164,
      "learning_rate": 0.00010137165251469629,
      "loss": 0.1771,
      "step": 760
    },
    {
      "epoch": 1.5034146341463415,
      "grad_norm": 1.5314477682113647,
      "learning_rate": 0.00010006531678641412,
      "loss": 0.2034,
      "step": 770
    },
    {
      "epoch": 1.5229268292682927,
      "grad_norm": 1.4807705879211426,
      "learning_rate": 9.875898105813195e-05,
      "loss": 0.1853,
      "step": 780
    },
    {
      "epoch": 1.542439024390244,
      "grad_norm": 1.519795536994934,
      "learning_rate": 9.745264532984978e-05,
      "loss": 0.1994,
      "step": 790
    },
    {
      "epoch": 1.5619512195121952,
      "grad_norm": 1.4948278665542603,
      "learning_rate": 9.61463096015676e-05,
      "loss": 0.1826,
      "step": 800
    },
    {
      "epoch": 1.5814634146341464,
      "grad_norm": 1.2426303625106812,
      "learning_rate": 9.483997387328543e-05,
      "loss": 0.1968,
      "step": 810
    },
    {
      "epoch": 1.6009756097560977,
      "grad_norm": 1.4434090852737427,
      "learning_rate": 9.353363814500327e-05,
      "loss": 0.2016,
      "step": 820
    },
    {
      "epoch": 1.6204878048780489,
      "grad_norm": 1.3460036516189575,
      "learning_rate": 9.222730241672111e-05,
      "loss": 0.1853,
      "step": 830
    },
    {
      "epoch": 1.6400000000000001,
      "grad_norm": 1.008080005645752,
      "learning_rate": 9.092096668843894e-05,
      "loss": 0.2051,
      "step": 840
    },
    {
      "epoch": 1.6595121951219514,
      "grad_norm": 0.591911792755127,
      "learning_rate": 8.961463096015676e-05,
      "loss": 0.2003,
      "step": 850
    },
    {
      "epoch": 1.6790243902439026,
      "grad_norm": 0.8856975436210632,
      "learning_rate": 8.830829523187459e-05,
      "loss": 0.1994,
      "step": 860
    },
    {
      "epoch": 1.6985365853658536,
      "grad_norm": 0.24821911752223969,
      "learning_rate": 8.700195950359243e-05,
      "loss": 0.1801,
      "step": 870
    },
    {
      "epoch": 1.7180487804878048,
      "grad_norm": 0.5248575806617737,
      "learning_rate": 8.569562377531027e-05,
      "loss": 0.1815,
      "step": 880
    },
    {
      "epoch": 1.737560975609756,
      "grad_norm": 0.600226104259491,
      "learning_rate": 8.438928804702809e-05,
      "loss": 0.1885,
      "step": 890
    },
    {
      "epoch": 1.7570731707317073,
      "grad_norm": 0.720282793045044,
      "learning_rate": 8.308295231874592e-05,
      "loss": 0.1956,
      "step": 900
    },
    {
      "epoch": 1.7765853658536586,
      "grad_norm": 0.780463695526123,
      "learning_rate": 8.177661659046376e-05,
      "loss": 0.1892,
      "step": 910
    },
    {
      "epoch": 1.7960975609756098,
      "grad_norm": 0.5687152147293091,
      "learning_rate": 8.047028086218158e-05,
      "loss": 0.1905,
      "step": 920
    },
    {
      "epoch": 1.8156097560975608,
      "grad_norm": 0.33384960889816284,
      "learning_rate": 7.916394513389942e-05,
      "loss": 0.1854,
      "step": 930
    },
    {
      "epoch": 1.835121951219512,
      "grad_norm": 0.6061075329780579,
      "learning_rate": 7.785760940561725e-05,
      "loss": 0.1808,
      "step": 940
    },
    {
      "epoch": 1.8546341463414633,
      "grad_norm": 0.2853333353996277,
      "learning_rate": 7.655127367733507e-05,
      "loss": 0.1873,
      "step": 950
    },
    {
      "epoch": 1.8741463414634145,
      "grad_norm": 0.5911210775375366,
      "learning_rate": 7.524493794905291e-05,
      "loss": 0.1965,
      "step": 960
    },
    {
      "epoch": 1.8936585365853658,
      "grad_norm": 0.8281512260437012,
      "learning_rate": 7.393860222077074e-05,
      "loss": 0.2078,
      "step": 970
    },
    {
      "epoch": 1.913170731707317,
      "grad_norm": 1.2774325609207153,
      "learning_rate": 7.263226649248858e-05,
      "loss": 0.1943,
      "step": 980
    },
    {
      "epoch": 1.9326829268292682,
      "grad_norm": 0.544083833694458,
      "learning_rate": 7.13259307642064e-05,
      "loss": 0.1892,
      "step": 990
    },
    {
      "epoch": 1.9521951219512195,
      "grad_norm": 0.6588655114173889,
      "learning_rate": 7.001959503592423e-05,
      "loss": 0.1873,
      "step": 1000
    },
    {
      "epoch": 1.9717073170731707,
      "grad_norm": 0.6782810688018799,
      "learning_rate": 6.871325930764207e-05,
      "loss": 0.2181,
      "step": 1010
    },
    {
      "epoch": 1.991219512195122,
      "grad_norm": 0.27836665511131287,
      "learning_rate": 6.740692357935991e-05,
      "loss": 0.1895,
      "step": 1020
    },
    {
      "epoch": 2.0117073170731707,
      "grad_norm": 1.3147952556610107,
      "learning_rate": 6.610058785107773e-05,
      "loss": 0.2023,
      "step": 1030
    },
    {
      "epoch": 2.031219512195122,
      "grad_norm": 0.32955631613731384,
      "learning_rate": 6.479425212279556e-05,
      "loss": 0.1539,
      "step": 1040
    },
    {
      "epoch": 2.050731707317073,
      "grad_norm": 0.7191469073295593,
      "learning_rate": 6.34879163945134e-05,
      "loss": 0.1522,
      "step": 1050
    },
    {
      "epoch": 2.0702439024390245,
      "grad_norm": 0.9275949001312256,
      "learning_rate": 6.218158066623122e-05,
      "loss": 0.1481,
      "step": 1060
    },
    {
      "epoch": 2.0897560975609757,
      "grad_norm": 0.37640875577926636,
      "learning_rate": 6.087524493794906e-05,
      "loss": 0.1425,
      "step": 1070
    },
    {
      "epoch": 2.109268292682927,
      "grad_norm": 1.2650576829910278,
      "learning_rate": 5.956890920966689e-05,
      "loss": 0.149,
      "step": 1080
    },
    {
      "epoch": 2.128780487804878,
      "grad_norm": 1.5103838443756104,
      "learning_rate": 5.826257348138472e-05,
      "loss": 0.1566,
      "step": 1090
    },
    {
      "epoch": 2.1482926829268294,
      "grad_norm": 0.9958144426345825,
      "learning_rate": 5.6956237753102546e-05,
      "loss": 0.1519,
      "step": 1100
    },
    {
      "epoch": 2.1678048780487806,
      "grad_norm": 0.3257206082344055,
      "learning_rate": 5.564990202482038e-05,
      "loss": 0.1691,
      "step": 1110
    },
    {
      "epoch": 2.187317073170732,
      "grad_norm": 0.6846854090690613,
      "learning_rate": 5.434356629653822e-05,
      "loss": 0.1466,
      "step": 1120
    },
    {
      "epoch": 2.206829268292683,
      "grad_norm": 0.8696099519729614,
      "learning_rate": 5.3037230568256044e-05,
      "loss": 0.1593,
      "step": 1130
    },
    {
      "epoch": 2.2263414634146343,
      "grad_norm": 0.20721249282360077,
      "learning_rate": 5.1730894839973876e-05,
      "loss": 0.1475,
      "step": 1140
    },
    {
      "epoch": 2.2458536585365856,
      "grad_norm": 0.5807740092277527,
      "learning_rate": 5.04245591116917e-05,
      "loss": 0.1525,
      "step": 1150
    },
    {
      "epoch": 2.265365853658537,
      "grad_norm": 0.6303423643112183,
      "learning_rate": 4.911822338340954e-05,
      "loss": 0.1414,
      "step": 1160
    },
    {
      "epoch": 2.2848780487804876,
      "grad_norm": 0.6723970174789429,
      "learning_rate": 4.781188765512737e-05,
      "loss": 0.1286,
      "step": 1170
    },
    {
      "epoch": 2.304390243902439,
      "grad_norm": 0.5512611269950867,
      "learning_rate": 4.6505551926845206e-05,
      "loss": 0.137,
      "step": 1180
    },
    {
      "epoch": 2.32390243902439,
      "grad_norm": 0.32534515857696533,
      "learning_rate": 4.519921619856303e-05,
      "loss": 0.1509,
      "step": 1190
    },
    {
      "epoch": 2.3434146341463413,
      "grad_norm": 0.37095433473587036,
      "learning_rate": 4.3892880470280864e-05,
      "loss": 0.1447,
      "step": 1200
    },
    {
      "epoch": 2.3629268292682926,
      "grad_norm": 0.35774731636047363,
      "learning_rate": 4.25865447419987e-05,
      "loss": 0.1455,
      "step": 1210
    },
    {
      "epoch": 2.382439024390244,
      "grad_norm": 0.6414694786071777,
      "learning_rate": 4.128020901371652e-05,
      "loss": 0.1445,
      "step": 1220
    },
    {
      "epoch": 2.401951219512195,
      "grad_norm": 0.39603760838508606,
      "learning_rate": 3.997387328543436e-05,
      "loss": 0.142,
      "step": 1230
    },
    {
      "epoch": 2.4214634146341463,
      "grad_norm": 0.4492717981338501,
      "learning_rate": 3.866753755715219e-05,
      "loss": 0.132,
      "step": 1240
    },
    {
      "epoch": 2.4409756097560975,
      "grad_norm": 0.455432265996933,
      "learning_rate": 3.736120182887003e-05,
      "loss": 0.1483,
      "step": 1250
    },
    {
      "epoch": 2.4604878048780487,
      "grad_norm": 0.34108078479766846,
      "learning_rate": 3.605486610058785e-05,
      "loss": 0.1353,
      "step": 1260
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.8474462628364563,
      "learning_rate": 3.4748530372305685e-05,
      "loss": 0.1433,
      "step": 1270
    },
    {
      "epoch": 2.499512195121951,
      "grad_norm": 0.490139365196228,
      "learning_rate": 3.344219464402352e-05,
      "loss": 0.1405,
      "step": 1280
    },
    {
      "epoch": 2.5190243902439025,
      "grad_norm": 0.2619794011116028,
      "learning_rate": 3.213585891574134e-05,
      "loss": 0.1431,
      "step": 1290
    },
    {
      "epoch": 2.5385365853658537,
      "grad_norm": 0.35025978088378906,
      "learning_rate": 3.082952318745918e-05,
      "loss": 0.1314,
      "step": 1300
    },
    {
      "epoch": 2.558048780487805,
      "grad_norm": 0.6071056127548218,
      "learning_rate": 2.952318745917701e-05,
      "loss": 0.1335,
      "step": 1310
    },
    {
      "epoch": 2.577560975609756,
      "grad_norm": 0.410426527261734,
      "learning_rate": 2.821685173089484e-05,
      "loss": 0.1466,
      "step": 1320
    },
    {
      "epoch": 2.5970731707317074,
      "grad_norm": 0.4310607612133026,
      "learning_rate": 2.6910516002612673e-05,
      "loss": 0.1452,
      "step": 1330
    },
    {
      "epoch": 2.6165853658536586,
      "grad_norm": 0.48974138498306274,
      "learning_rate": 2.5604180274330502e-05,
      "loss": 0.1488,
      "step": 1340
    },
    {
      "epoch": 2.63609756097561,
      "grad_norm": 0.4195750653743744,
      "learning_rate": 2.4297844546048334e-05,
      "loss": 0.146,
      "step": 1350
    },
    {
      "epoch": 2.655609756097561,
      "grad_norm": 0.27998435497283936,
      "learning_rate": 2.2991508817766167e-05,
      "loss": 0.1335,
      "step": 1360
    },
    {
      "epoch": 2.6751219512195124,
      "grad_norm": 0.5254719853401184,
      "learning_rate": 2.1685173089484e-05,
      "loss": 0.1317,
      "step": 1370
    },
    {
      "epoch": 2.694634146341463,
      "grad_norm": 0.3470640480518341,
      "learning_rate": 2.0378837361201832e-05,
      "loss": 0.1584,
      "step": 1380
    },
    {
      "epoch": 2.7141463414634144,
      "grad_norm": 0.29102063179016113,
      "learning_rate": 1.907250163291966e-05,
      "loss": 0.1354,
      "step": 1390
    },
    {
      "epoch": 2.7336585365853656,
      "grad_norm": 0.2697664201259613,
      "learning_rate": 1.7766165904637493e-05,
      "loss": 0.1379,
      "step": 1400
    }
  ],
  "logging_steps": 10,
  "max_steps": 1536,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 10,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.6443635349501133e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}