{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2916,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006858710562414266,
"grad_norm": 1.162638545036316,
"learning_rate": 6.849315068493151e-07,
"loss": 0.0267,
"step": 10
},
{
"epoch": 0.013717421124828532,
"grad_norm": 0.5459424257278442,
"learning_rate": 1.3698630136986302e-06,
"loss": 0.0209,
"step": 20
},
{
"epoch": 0.0205761316872428,
"grad_norm": 0.484558641910553,
"learning_rate": 2.0547945205479454e-06,
"loss": 0.0117,
"step": 30
},
{
"epoch": 0.027434842249657063,
"grad_norm": 0.23133069276809692,
"learning_rate": 2.7397260273972604e-06,
"loss": 0.0065,
"step": 40
},
{
"epoch": 0.03429355281207133,
"grad_norm": 0.20168930292129517,
"learning_rate": 3.4246575342465754e-06,
"loss": 0.0097,
"step": 50
},
{
"epoch": 0.0411522633744856,
"grad_norm": 0.20999373495578766,
"learning_rate": 4.109589041095891e-06,
"loss": 0.0152,
"step": 60
},
{
"epoch": 0.04801097393689986,
"grad_norm": 0.42117851972579956,
"learning_rate": 4.7945205479452054e-06,
"loss": 0.0194,
"step": 70
},
{
"epoch": 0.05486968449931413,
"grad_norm": 0.1798817217350006,
"learning_rate": 5.479452054794521e-06,
"loss": 0.0028,
"step": 80
},
{
"epoch": 0.06172839506172839,
"grad_norm": 0.058727577328681946,
"learning_rate": 6.164383561643836e-06,
"loss": 0.0015,
"step": 90
},
{
"epoch": 0.06858710562414266,
"grad_norm": 0.22002284228801727,
"learning_rate": 6.849315068493151e-06,
"loss": 0.0055,
"step": 100
},
{
"epoch": 0.07544581618655692,
"grad_norm": 0.2096785008907318,
"learning_rate": 7.534246575342466e-06,
"loss": 0.011,
"step": 110
},
{
"epoch": 0.0823045267489712,
"grad_norm": 0.06468941271305084,
"learning_rate": 8.219178082191782e-06,
"loss": 0.0092,
"step": 120
},
{
"epoch": 0.08916323731138547,
"grad_norm": 0.03226824477314949,
"learning_rate": 8.904109589041097e-06,
"loss": 0.0019,
"step": 130
},
{
"epoch": 0.09602194787379972,
"grad_norm": 0.10582853108644485,
"learning_rate": 9.589041095890411e-06,
"loss": 0.0027,
"step": 140
},
{
"epoch": 0.102880658436214,
"grad_norm": 0.14337775111198425,
"learning_rate": 1.0273972602739728e-05,
"loss": 0.0034,
"step": 150
},
{
"epoch": 0.10973936899862825,
"grad_norm": 0.09626258909702301,
"learning_rate": 1.0958904109589042e-05,
"loss": 0.0024,
"step": 160
},
{
"epoch": 0.11659807956104253,
"grad_norm": 0.29003870487213135,
"learning_rate": 1.1643835616438357e-05,
"loss": 0.0238,
"step": 170
},
{
"epoch": 0.12345679012345678,
"grad_norm": 0.12768058478832245,
"learning_rate": 1.2328767123287673e-05,
"loss": 0.0039,
"step": 180
},
{
"epoch": 0.13031550068587106,
"grad_norm": 0.04557815566658974,
"learning_rate": 1.3013698630136988e-05,
"loss": 0.0023,
"step": 190
},
{
"epoch": 0.13717421124828533,
"grad_norm": 0.15683676302433014,
"learning_rate": 1.3698630136986302e-05,
"loss": 0.0048,
"step": 200
},
{
"epoch": 0.1440329218106996,
"grad_norm": 0.042118772864341736,
"learning_rate": 1.4383561643835617e-05,
"loss": 0.003,
"step": 210
},
{
"epoch": 0.15089163237311384,
"grad_norm": 0.08400426059961319,
"learning_rate": 1.5068493150684933e-05,
"loss": 0.0043,
"step": 220
},
{
"epoch": 0.15775034293552812,
"grad_norm": 0.17980587482452393,
"learning_rate": 1.5753424657534248e-05,
"loss": 0.0078,
"step": 230
},
{
"epoch": 0.1646090534979424,
"grad_norm": 0.14988136291503906,
"learning_rate": 1.6438356164383563e-05,
"loss": 0.0061,
"step": 240
},
{
"epoch": 0.17146776406035666,
"grad_norm": 0.15037085115909576,
"learning_rate": 1.712328767123288e-05,
"loss": 0.0028,
"step": 250
},
{
"epoch": 0.17832647462277093,
"grad_norm": 0.235006645321846,
"learning_rate": 1.7808219178082194e-05,
"loss": 0.0045,
"step": 260
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.22422538697719574,
"learning_rate": 1.849315068493151e-05,
"loss": 0.0023,
"step": 270
},
{
"epoch": 0.19204389574759945,
"grad_norm": 0.2827920615673065,
"learning_rate": 1.9178082191780822e-05,
"loss": 0.0124,
"step": 280
},
{
"epoch": 0.19890260631001372,
"grad_norm": 0.27119961380958557,
"learning_rate": 1.9863013698630137e-05,
"loss": 0.002,
"step": 290
},
{
"epoch": 0.205761316872428,
"grad_norm": 0.1681121289730072,
"learning_rate": 1.9999541310559686e-05,
"loss": 0.0074,
"step": 300
},
{
"epoch": 0.21262002743484226,
"grad_norm": 0.13818895816802979,
"learning_rate": 1.9997677956826334e-05,
"loss": 0.003,
"step": 310
},
{
"epoch": 0.2194787379972565,
"grad_norm": 0.06377672404050827,
"learning_rate": 1.9994381537597277e-05,
"loss": 0.018,
"step": 320
},
{
"epoch": 0.22633744855967078,
"grad_norm": 0.12524163722991943,
"learning_rate": 1.9989652525380695e-05,
"loss": 0.0209,
"step": 330
},
{
"epoch": 0.23319615912208505,
"grad_norm": 0.04168206453323364,
"learning_rate": 1.998349159803241e-05,
"loss": 0.0027,
"step": 340
},
{
"epoch": 0.24005486968449932,
"grad_norm": 0.27079302072525024,
"learning_rate": 1.9975899638658733e-05,
"loss": 0.0116,
"step": 350
},
{
"epoch": 0.24691358024691357,
"grad_norm": 0.13237667083740234,
"learning_rate": 1.9966877735489846e-05,
"loss": 0.0019,
"step": 360
},
{
"epoch": 0.25377229080932784,
"grad_norm": 0.06160791590809822,
"learning_rate": 1.995642718172386e-05,
"loss": 0.0014,
"step": 370
},
{
"epoch": 0.2606310013717421,
"grad_norm": 0.07572882622480392,
"learning_rate": 1.9944549475341404e-05,
"loss": 0.006,
"step": 380
},
{
"epoch": 0.2674897119341564,
"grad_norm": 0.10234888643026352,
"learning_rate": 1.9931246318890943e-05,
"loss": 0.0039,
"step": 390
},
{
"epoch": 0.27434842249657065,
"grad_norm": 0.12086405605077744,
"learning_rate": 1.9916519619244707e-05,
"loss": 0.0046,
"step": 400
},
{
"epoch": 0.2812071330589849,
"grad_norm": 0.046436768025159836,
"learning_rate": 1.990037148732537e-05,
"loss": 0.0097,
"step": 410
},
{
"epoch": 0.2880658436213992,
"grad_norm": 0.043175164610147476,
"learning_rate": 1.9882804237803487e-05,
"loss": 0.0015,
"step": 420
},
{
"epoch": 0.29492455418381347,
"grad_norm": 0.3755040168762207,
"learning_rate": 1.9863820388765672e-05,
"loss": 0.0077,
"step": 430
},
{
"epoch": 0.3017832647462277,
"grad_norm": 0.10730766505002975,
"learning_rate": 1.9843422661353697e-05,
"loss": 0.001,
"step": 440
},
{
"epoch": 0.30864197530864196,
"grad_norm": 0.09198994934558868,
"learning_rate": 1.9821613979374414e-05,
"loss": 0.0052,
"step": 450
},
{
"epoch": 0.31550068587105623,
"grad_norm": 1.359625220298767,
"learning_rate": 1.979839746888067e-05,
"loss": 0.0146,
"step": 460
},
{
"epoch": 0.3223593964334705,
"grad_norm": 0.2990811765193939,
"learning_rate": 1.9773776457723216e-05,
"loss": 0.0083,
"step": 470
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.09418093413114548,
"learning_rate": 1.9747754475073707e-05,
"loss": 0.0057,
"step": 480
},
{
"epoch": 0.33607681755829905,
"grad_norm": 0.1610361486673355,
"learning_rate": 1.9720335250918797e-05,
"loss": 0.0066,
"step": 490
},
{
"epoch": 0.3429355281207133,
"grad_norm": 0.2202681005001068,
"learning_rate": 1.969152271552552e-05,
"loss": 0.0096,
"step": 500
},
{
"epoch": 0.3497942386831276,
"grad_norm": 0.2531183362007141,
"learning_rate": 1.966132099887791e-05,
"loss": 0.0078,
"step": 510
},
{
"epoch": 0.35665294924554186,
"grad_norm": 0.07069271057844162,
"learning_rate": 1.9629734430085007e-05,
"loss": 0.0045,
"step": 520
},
{
"epoch": 0.3635116598079561,
"grad_norm": 0.08322709053754807,
"learning_rate": 1.9596767536760328e-05,
"loss": 0.0044,
"step": 530
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.2892574667930603,
"learning_rate": 1.9562425044372884e-05,
"loss": 0.0034,
"step": 540
},
{
"epoch": 0.3772290809327846,
"grad_norm": 0.21209566295146942,
"learning_rate": 1.9526711875569817e-05,
"loss": 0.0105,
"step": 550
},
{
"epoch": 0.3840877914951989,
"grad_norm": 0.0845257118344307,
"learning_rate": 1.948963314947081e-05,
"loss": 0.006,
"step": 560
},
{
"epoch": 0.39094650205761317,
"grad_norm": 0.3820701241493225,
"learning_rate": 1.945119418093429e-05,
"loss": 0.0104,
"step": 570
},
{
"epoch": 0.39780521262002744,
"grad_norm": 0.2731214463710785,
"learning_rate": 1.9411400479795618e-05,
"loss": 0.0121,
"step": 580
},
{
"epoch": 0.4046639231824417,
"grad_norm": 0.01668688841164112,
"learning_rate": 1.9370257750077296e-05,
"loss": 0.0023,
"step": 590
},
{
"epoch": 0.411522633744856,
"grad_norm": 0.28591388463974,
"learning_rate": 1.932777188917136e-05,
"loss": 0.0363,
"step": 600
},
{
"epoch": 0.41838134430727025,
"grad_norm": 0.22390630841255188,
"learning_rate": 1.9283948986994047e-05,
"loss": 0.0055,
"step": 610
},
{
"epoch": 0.4252400548696845,
"grad_norm": 0.22514230012893677,
"learning_rate": 1.9238795325112867e-05,
"loss": 0.0109,
"step": 620
},
{
"epoch": 0.43209876543209874,
"grad_norm": 0.11907146126031876,
"learning_rate": 1.919231737584621e-05,
"loss": 0.0009,
"step": 630
},
{
"epoch": 0.438957475994513,
"grad_norm": 0.030443059280514717,
"learning_rate": 1.9144521801335588e-05,
"loss": 0.0031,
"step": 640
},
{
"epoch": 0.4458161865569273,
"grad_norm": 0.22063292562961578,
"learning_rate": 1.90954154525907e-05,
"loss": 0.0164,
"step": 650
},
{
"epoch": 0.45267489711934156,
"grad_norm": 0.3322547972202301,
"learning_rate": 1.9045005368507418e-05,
"loss": 0.0024,
"step": 660
},
{
"epoch": 0.45953360768175583,
"grad_norm": 0.16343438625335693,
"learning_rate": 1.899329877485881e-05,
"loss": 0.0033,
"step": 670
},
{
"epoch": 0.4663923182441701,
"grad_norm": 0.4316878020763397,
"learning_rate": 1.89403030832594e-05,
"loss": 0.0025,
"step": 680
},
{
"epoch": 0.4732510288065844,
"grad_norm": 0.16055700182914734,
"learning_rate": 1.888602589010282e-05,
"loss": 0.0024,
"step": 690
},
{
"epoch": 0.48010973936899864,
"grad_norm": 0.18311993777751923,
"learning_rate": 1.8830474975472904e-05,
"loss": 0.0056,
"step": 700
},
{
"epoch": 0.4869684499314129,
"grad_norm": 0.06576403230428696,
"learning_rate": 1.8773658302028525e-05,
"loss": 0.0094,
"step": 710
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.13461001217365265,
"learning_rate": 1.87155840138622e-05,
"loss": 0.0123,
"step": 720
},
{
"epoch": 0.5006858710562414,
"grad_norm": 0.10820876806974411,
"learning_rate": 1.8656260435332732e-05,
"loss": 0.0031,
"step": 730
},
{
"epoch": 0.5075445816186557,
"grad_norm": 0.05024191737174988,
"learning_rate": 1.8595696069872013e-05,
"loss": 0.0047,
"step": 740
},
{
"epoch": 0.51440329218107,
"grad_norm": 0.06663210690021515,
"learning_rate": 1.8533899598766106e-05,
"loss": 0.0023,
"step": 750
},
{
"epoch": 0.5212620027434842,
"grad_norm": 0.1120394617319107,
"learning_rate": 1.8470879879910916e-05,
"loss": 0.0016,
"step": 760
},
{
"epoch": 0.5281207133058985,
"grad_norm": 0.5442692637443542,
"learning_rate": 1.8406645946542446e-05,
"loss": 0.0388,
"step": 770
},
{
"epoch": 0.5349794238683128,
"grad_norm": 0.3559621274471283,
"learning_rate": 1.8341207005942033e-05,
"loss": 0.0042,
"step": 780
},
{
"epoch": 0.541838134430727,
"grad_norm": 0.06798390299081802,
"learning_rate": 1.827457243811654e-05,
"loss": 0.0007,
"step": 790
},
{
"epoch": 0.5486968449931413,
"grad_norm": 0.09693529456853867,
"learning_rate": 1.8206751794453837e-05,
"loss": 0.0028,
"step": 800
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.014329448342323303,
"learning_rate": 1.8137754796353708e-05,
"loss": 0.0024,
"step": 810
},
{
"epoch": 0.5624142661179699,
"grad_norm": 0.05700727179646492,
"learning_rate": 1.8067591333834382e-05,
"loss": 0.0098,
"step": 820
},
{
"epoch": 0.5692729766803841,
"grad_norm": 0.16214902698993683,
"learning_rate": 1.7996271464114915e-05,
"loss": 0.0024,
"step": 830
},
{
"epoch": 0.5761316872427984,
"grad_norm": 0.017287936061620712,
"learning_rate": 1.792380541017357e-05,
"loss": 0.0023,
"step": 840
},
{
"epoch": 0.5829903978052127,
"grad_norm": 0.03860418125987053,
"learning_rate": 1.7850203559282464e-05,
"loss": 0.0054,
"step": 850
},
{
"epoch": 0.5898491083676269,
"grad_norm": 0.057672981172800064,
"learning_rate": 1.7775476461518668e-05,
"loss": 0.0039,
"step": 860
},
{
"epoch": 0.5967078189300411,
"grad_norm": 0.1254579871892929,
"learning_rate": 1.7699634828251945e-05,
"loss": 0.0016,
"step": 870
},
{
"epoch": 0.6035665294924554,
"grad_norm": 0.12202060967683792,
"learning_rate": 1.7622689530609397e-05,
"loss": 0.0055,
"step": 880
},
{
"epoch": 0.6104252400548696,
"grad_norm": 0.5825458765029907,
"learning_rate": 1.7544651597917194e-05,
"loss": 0.0015,
"step": 890
},
{
"epoch": 0.6172839506172839,
"grad_norm": 0.06615274399518967,
"learning_rate": 1.7465532216119628e-05,
"loss": 0.0029,
"step": 900
},
{
"epoch": 0.6241426611796982,
"grad_norm": 0.016936153173446655,
"learning_rate": 1.7385342726175728e-05,
"loss": 0.0033,
"step": 910
},
{
"epoch": 0.6310013717421125,
"grad_norm": 0.09208139032125473,
"learning_rate": 1.7304094622433646e-05,
"loss": 0.0072,
"step": 920
},
{
"epoch": 0.6378600823045267,
"grad_norm": 0.0077827684581279755,
"learning_rate": 1.7221799550983062e-05,
"loss": 0.001,
"step": 930
},
{
"epoch": 0.644718792866941,
"grad_norm": 0.15854914486408234,
"learning_rate": 1.7138469307985832e-05,
"loss": 0.0033,
"step": 940
},
{
"epoch": 0.6515775034293553,
"grad_norm": 0.062444571405649185,
"learning_rate": 1.705411583798513e-05,
"loss": 0.0327,
"step": 950
},
{
"epoch": 0.6584362139917695,
"grad_norm": 0.018232915550470352,
"learning_rate": 1.6968751232193315e-05,
"loss": 0.008,
"step": 960
},
{
"epoch": 0.6652949245541838,
"grad_norm": 0.16335558891296387,
"learning_rate": 1.6882387726758793e-05,
"loss": 0.0359,
"step": 970
},
{
"epoch": 0.6721536351165981,
"grad_norm": 0.4718017876148224,
"learning_rate": 1.679503770101206e-05,
"loss": 0.0096,
"step": 980
},
{
"epoch": 0.6790123456790124,
"grad_norm": 0.16451981663703918,
"learning_rate": 1.6706713675691283e-05,
"loss": 0.0055,
"step": 990
},
{
"epoch": 0.6858710562414266,
"grad_norm": 0.08348975330591202,
"learning_rate": 1.661742831114757e-05,
"loss": 0.0051,
"step": 1000
},
{
"epoch": 0.6927297668038409,
"grad_norm": 0.22571489214897156,
"learning_rate": 1.6527194405530217e-05,
"loss": 0.0029,
"step": 1010
},
{
"epoch": 0.6995884773662552,
"grad_norm": 0.11072508990764618,
"learning_rate": 1.6436024892952256e-05,
"loss": 0.0143,
"step": 1020
},
{
"epoch": 0.7064471879286695,
"grad_norm": 0.01999078132212162,
"learning_rate": 1.6343932841636455e-05,
"loss": 0.0012,
"step": 1030
},
{
"epoch": 0.7133058984910837,
"grad_norm": 0.04143204912543297,
"learning_rate": 1.6250931452042136e-05,
"loss": 0.011,
"step": 1040
},
{
"epoch": 0.720164609053498,
"grad_norm": 0.017960038036108017,
"learning_rate": 1.615703405497302e-05,
"loss": 0.0106,
"step": 1050
},
{
"epoch": 0.7270233196159122,
"grad_norm": 0.13416878879070282,
"learning_rate": 1.6062254109666383e-05,
"loss": 0.0281,
"step": 1060
},
{
"epoch": 0.7338820301783264,
"grad_norm": 0.0555805005133152,
"learning_rate": 1.5966605201863822e-05,
"loss": 0.0051,
"step": 1070
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.11911418288946152,
"learning_rate": 1.587010104186388e-05,
"loss": 0.0097,
"step": 1080
},
{
"epoch": 0.747599451303155,
"grad_norm": 0.10376396775245667,
"learning_rate": 1.57727554625568e-05,
"loss": 0.0023,
"step": 1090
},
{
"epoch": 0.7544581618655692,
"grad_norm": 0.025814570486545563,
"learning_rate": 1.5674582417441734e-05,
"loss": 0.0009,
"step": 1100
},
{
"epoch": 0.7613168724279835,
"grad_norm": 0.05079122632741928,
"learning_rate": 1.5575595978626634e-05,
"loss": 0.0012,
"step": 1110
},
{
"epoch": 0.7681755829903978,
"grad_norm": 0.01824193075299263,
"learning_rate": 1.547581033481119e-05,
"loss": 0.0033,
"step": 1120
},
{
"epoch": 0.7750342935528121,
"grad_norm": 1.8023141622543335,
"learning_rate": 1.5375239789252986e-05,
"loss": 0.0092,
"step": 1130
},
{
"epoch": 0.7818930041152263,
"grad_norm": 0.10559989511966705,
"learning_rate": 1.5273898757717295e-05,
"loss": 0.0049,
"step": 1140
},
{
"epoch": 0.7887517146776406,
"grad_norm": 0.07854919880628586,
"learning_rate": 1.5171801766410727e-05,
"loss": 0.0041,
"step": 1150
},
{
"epoch": 0.7956104252400549,
"grad_norm": 0.039341386407613754,
"learning_rate": 1.5068963449899039e-05,
"loss": 0.0025,
"step": 1160
},
{
"epoch": 0.8024691358024691,
"grad_norm": 0.10025200247764587,
"learning_rate": 1.4965398549009416e-05,
"loss": 0.0013,
"step": 1170
},
{
"epoch": 0.8093278463648834,
"grad_norm": 0.021738484501838684,
"learning_rate": 1.4861121908717529e-05,
"loss": 0.0085,
"step": 1180
},
{
"epoch": 0.8161865569272977,
"grad_norm": 0.06829023361206055,
"learning_rate": 1.4756148476019654e-05,
"loss": 0.0116,
"step": 1190
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.2639261782169342,
"learning_rate": 1.4650493297790178e-05,
"loss": 0.0123,
"step": 1200
},
{
"epoch": 0.8299039780521262,
"grad_norm": 0.35069772601127625,
"learning_rate": 1.4544171518624778e-05,
"loss": 0.0006,
"step": 1210
},
{
"epoch": 0.8367626886145405,
"grad_norm": 0.07271619141101837,
"learning_rate": 1.4437198378669598e-05,
"loss": 0.0093,
"step": 1220
},
{
"epoch": 0.8436213991769548,
"grad_norm": 0.00380721571855247,
"learning_rate": 1.4329589211436733e-05,
"loss": 0.0012,
"step": 1230
},
{
"epoch": 0.850480109739369,
"grad_norm": 0.02267816662788391,
"learning_rate": 1.4221359441606311e-05,
"loss": 0.0027,
"step": 1240
},
{
"epoch": 0.8573388203017832,
"grad_norm": 0.11175378412008286,
"learning_rate": 1.4112524582815546e-05,
"loss": 0.0128,
"step": 1250
},
{
"epoch": 0.8641975308641975,
"grad_norm": 0.053114600479602814,
"learning_rate": 1.4003100235434998e-05,
"loss": 0.0022,
"step": 1260
},
{
"epoch": 0.8710562414266118,
"grad_norm": 0.011361058801412582,
"learning_rate": 1.389310208433242e-05,
"loss": 0.0045,
"step": 1270
},
{
"epoch": 0.877914951989026,
"grad_norm": 0.0025912427809089422,
"learning_rate": 1.3782545896624502e-05,
"loss": 0.0094,
"step": 1280
},
{
"epoch": 0.8847736625514403,
"grad_norm": 0.12230653315782547,
"learning_rate": 1.3671447519416803e-05,
"loss": 0.0206,
"step": 1290
},
{
"epoch": 0.8916323731138546,
"grad_norm": 0.01137256808578968,
"learning_rate": 1.3559822877532234e-05,
"loss": 0.0022,
"step": 1300
},
{
"epoch": 0.8984910836762688,
"grad_norm": 0.07587670534849167,
"learning_rate": 1.3447687971228402e-05,
"loss": 0.0013,
"step": 1310
},
{
"epoch": 0.9053497942386831,
"grad_norm": 0.07251809537410736,
"learning_rate": 1.3335058873904128e-05,
"loss": 0.0054,
"step": 1320
},
{
"epoch": 0.9122085048010974,
"grad_norm": 0.23888733983039856,
"learning_rate": 1.3221951729795492e-05,
"loss": 0.0073,
"step": 1330
},
{
"epoch": 0.9190672153635117,
"grad_norm": 0.3885558843612671,
"learning_rate": 1.3108382751661722e-05,
"loss": 0.0022,
"step": 1340
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.12456855922937393,
"learning_rate": 1.2994368218461255e-05,
"loss": 0.0037,
"step": 1350
},
{
"epoch": 0.9327846364883402,
"grad_norm": 0.011711199767887592,
"learning_rate": 1.287992447301832e-05,
"loss": 0.0061,
"step": 1360
},
{
"epoch": 0.9396433470507545,
"grad_norm": 0.3066045045852661,
"learning_rate": 1.2765067919680357e-05,
"loss": 0.0041,
"step": 1370
},
{
"epoch": 0.9465020576131687,
"grad_norm": 0.016845189034938812,
"learning_rate": 1.264981502196662e-05,
"loss": 0.0046,
"step": 1380
},
{
"epoch": 0.953360768175583,
"grad_norm": 0.09259962290525436,
"learning_rate": 1.2534182300208299e-05,
"loss": 0.0081,
"step": 1390
},
{
"epoch": 0.9602194787379973,
"grad_norm": 0.4014507830142975,
"learning_rate": 1.2418186329180506e-05,
"loss": 0.0069,
"step": 1400
},
{
"epoch": 0.9670781893004116,
"grad_norm": 0.24332621693611145,
"learning_rate": 1.230184373572643e-05,
"loss": 0.0017,
"step": 1410
},
{
"epoch": 0.9739368998628258,
"grad_norm": 0.07767323404550552,
"learning_rate": 1.218517119637408e-05,
"loss": 0.0034,
"step": 1420
},
{
"epoch": 0.9807956104252401,
"grad_norm": 0.023259377107024193,
"learning_rate": 1.2068185434945834e-05,
"loss": 0.0025,
"step": 1430
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.01660298928618431,
"learning_rate": 1.1950903220161286e-05,
"loss": 0.0031,
"step": 1440
},
{
"epoch": 0.9945130315500685,
"grad_norm": 0.06700021773576736,
"learning_rate": 1.1833341363233594e-05,
"loss": 0.0125,
"step": 1450
},
{
"epoch": 1.0013717421124828,
"grad_norm": 0.09212377667427063,
"learning_rate": 1.1715516715459784e-05,
"loss": 0.0021,
"step": 1460
},
{
"epoch": 1.008230452674897,
"grad_norm": 0.2013186812400818,
"learning_rate": 1.1597446165805272e-05,
"loss": 0.0055,
"step": 1470
},
{
"epoch": 1.0150891632373114,
"grad_norm": 0.033271919935941696,
"learning_rate": 1.147914663848301e-05,
"loss": 0.0006,
"step": 1480
},
{
"epoch": 1.0219478737997256,
"grad_norm": 0.012748132459819317,
"learning_rate": 1.1360635090527571e-05,
"loss": 0.0061,
"step": 1490
},
{
"epoch": 1.02880658436214,
"grad_norm": 0.18844588100910187,
"learning_rate": 1.1241928509364533e-05,
"loss": 0.0164,
"step": 1500
},
{
"epoch": 1.0356652949245542,
"grad_norm": 0.02880697138607502,
"learning_rate": 1.1123043910375495e-05,
"loss": 0.0013,
"step": 1510
},
{
"epoch": 1.0425240054869684,
"grad_norm": 0.040209993720054626,
"learning_rate": 1.1003998334459107e-05,
"loss": 0.0076,
"step": 1520
},
{
"epoch": 1.0493827160493827,
"grad_norm": 0.03542792797088623,
"learning_rate": 1.0884808845588424e-05,
"loss": 0.0121,
"step": 1530
},
{
"epoch": 1.056241426611797,
"grad_norm": 0.029188377782702446,
"learning_rate": 1.076549252836496e-05,
"loss": 0.0006,
"step": 1540
},
{
"epoch": 1.0631001371742113,
"grad_norm": 0.1534910947084427,
"learning_rate": 1.0646066485569779e-05,
"loss": 0.0011,
"step": 1550
},
{
"epoch": 1.0699588477366255,
"grad_norm": 0.0010759709402918816,
"learning_rate": 1.0526547835712e-05,
"loss": 0.0002,
"step": 1560
},
{
"epoch": 1.0768175582990398,
"grad_norm": 0.0017988062463700771,
"learning_rate": 1.0406953710575015e-05,
"loss": 0.0003,
"step": 1570
},
{
"epoch": 1.083676268861454,
"grad_norm": 0.2528439462184906,
"learning_rate": 1.0287301252760833e-05,
"loss": 0.0039,
"step": 1580
},
{
"epoch": 1.0905349794238683,
"grad_norm": 0.000582815904635936,
"learning_rate": 1.0167607613232856e-05,
"loss": 0.0005,
"step": 1590
},
{
"epoch": 1.0973936899862826,
"grad_norm": 0.08733749389648438,
"learning_rate": 1.0047889948857477e-05,
"loss": 0.0003,
"step": 1600
},
{
"epoch": 1.1042524005486969,
"grad_norm": 0.05657390132546425,
"learning_rate": 9.928165419944788e-06,
"loss": 0.002,
"step": 1610
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.036703433841466904,
"learning_rate": 9.80845118778886e-06,
"loss": 0.0004,
"step": 1620
},
{
"epoch": 1.1179698216735254,
"grad_norm": 0.004433758556842804,
"learning_rate": 9.68876441220782e-06,
"loss": 0.0051,
"step": 1630
},
{
"epoch": 1.1248285322359397,
"grad_norm": 0.06380768865346909,
"learning_rate": 9.569122249084177e-06,
"loss": 0.0035,
"step": 1640
},
{
"epoch": 1.131687242798354,
"grad_norm": 0.012877637520432472,
"learning_rate": 9.449541847905688e-06,
"loss": 0.0009,
"step": 1650
},
{
"epoch": 1.1385459533607682,
"grad_norm": 0.009387146681547165,
"learning_rate": 9.330040349307185e-06,
"loss": 0.0111,
"step": 1660
},
{
"epoch": 1.1454046639231825,
"grad_norm": 0.0012669875286519527,
"learning_rate": 9.210634882613595e-06,
"loss": 0.0006,
"step": 1670
},
{
"epoch": 1.1522633744855968,
"grad_norm": 0.029193460941314697,
"learning_rate": 9.091342563384661e-06,
"loss": 0.0009,
"step": 1680
},
{
"epoch": 1.159122085048011,
"grad_norm": 0.0319526381790638,
"learning_rate": 8.972180490961581e-06,
"loss": 0.0038,
"step": 1690
},
{
"epoch": 1.1659807956104253,
"grad_norm": 0.01600039191544056,
"learning_rate": 8.853165746015997e-06,
"loss": 0.0034,
"step": 1700
},
{
"epoch": 1.1728395061728394,
"grad_norm": 0.00153280608355999,
"learning_rate": 8.73431538810166e-06,
"loss": 0.0007,
"step": 1710
},
{
"epoch": 1.1796982167352539,
"grad_norm": 0.012536576949059963,
"learning_rate": 8.61564645320911e-06,
"loss": 0.0011,
"step": 1720
},
{
"epoch": 1.186556927297668,
"grad_norm": 0.013463485054671764,
"learning_rate": 8.497175951323737e-06,
"loss": 0.003,
"step": 1730
},
{
"epoch": 1.1934156378600824,
"grad_norm": 0.0013876461889594793,
"learning_rate": 8.378920863987576e-06,
"loss": 0.0005,
"step": 1740
},
{
"epoch": 1.2002743484224965,
"grad_norm": 0.008089344017207623,
"learning_rate": 8.260898141865188e-06,
"loss": 0.0117,
"step": 1750
},
{
"epoch": 1.2071330589849107,
"grad_norm": 0.0536433607339859,
"learning_rate": 8.143124702313932e-06,
"loss": 0.0057,
"step": 1760
},
{
"epoch": 1.213991769547325,
"grad_norm": 0.16750673949718475,
"learning_rate": 8.025617426959046e-06,
"loss": 0.0011,
"step": 1770
},
{
"epoch": 1.2208504801097393,
"grad_norm": 0.15625609457492828,
"learning_rate": 7.908393159273835e-06,
"loss": 0.0031,
"step": 1780
},
{
"epoch": 1.2277091906721536,
"grad_norm": 0.01993492804467678,
"learning_rate": 7.791468702165337e-06,
"loss": 0.0034,
"step": 1790
},
{
"epoch": 1.2345679012345678,
"grad_norm": 0.03969631716609001,
"learning_rate": 7.674860815565792e-06,
"loss": 0.0008,
"step": 1800
},
{
"epoch": 1.241426611796982,
"grad_norm": 0.05006815120577812,
"learning_rate": 7.558586214030272e-06,
"loss": 0.0034,
"step": 1810
},
{
"epoch": 1.2482853223593964,
"grad_norm": 0.004975775256752968,
"learning_rate": 7.442661564340823e-06,
"loss": 0.0001,
"step": 1820
},
{
"epoch": 1.2551440329218106,
"grad_norm": 0.010278506204485893,
"learning_rate": 7.327103483117453e-06,
"loss": 0.003,
"step": 1830
},
{
"epoch": 1.262002743484225,
"grad_norm": 0.03384735807776451,
"learning_rate": 7.211928534436307e-06,
"loss": 0.0025,
"step": 1840
},
{
"epoch": 1.2688614540466392,
"grad_norm": 0.11490129679441452,
"learning_rate": 7.097153227455379e-06,
"loss": 0.0033,
"step": 1850
},
{
"epoch": 1.2757201646090535,
"grad_norm": 0.008139098063111305,
"learning_rate": 6.9827940140480776e-06,
"loss": 0.0021,
"step": 1860
},
{
"epoch": 1.2825788751714677,
"grad_norm": 0.07241293787956238,
"learning_rate": 6.868867286445041e-06,
"loss": 0.0003,
"step": 1870
},
{
"epoch": 1.289437585733882,
"grad_norm": 0.2514030933380127,
"learning_rate": 6.7553893748844535e-06,
"loss": 0.0036,
"step": 1880
},
{
"epoch": 1.2962962962962963,
"grad_norm": 0.3750598430633545,
"learning_rate": 6.6423765452712895e-06,
"loss": 0.0185,
"step": 1890
},
{
"epoch": 1.3031550068587106,
"grad_norm": 0.011171502061188221,
"learning_rate": 6.529844996845751e-06,
"loss": 0.0012,
"step": 1900
},
{
"epoch": 1.3100137174211248,
"grad_norm": 0.010206708684563637,
"learning_rate": 6.417810859861275e-06,
"loss": 0.0081,
"step": 1910
},
{
"epoch": 1.316872427983539,
"grad_norm": 0.20837537944316864,
"learning_rate": 6.306290193272422e-06,
"loss": 0.0004,
"step": 1920
},
{
"epoch": 1.3237311385459534,
"grad_norm": 0.002073473297059536,
"learning_rate": 6.195298982433e-06,
"loss": 0.0004,
"step": 1930
},
{
"epoch": 1.3305898491083676,
"grad_norm": 0.13253618776798248,
"learning_rate": 6.084853136804711e-06,
"loss": 0.0006,
"step": 1940
},
{
"epoch": 1.337448559670782,
"grad_norm": 0.0015091156819835305,
"learning_rate": 5.9749684876767015e-06,
"loss": 0.0001,
"step": 1950
},
{
"epoch": 1.3443072702331962,
"grad_norm": 0.04806216433644295,
"learning_rate": 5.8656607858963014e-06,
"loss": 0.0258,
"step": 1960
},
{
"epoch": 1.3511659807956105,
"grad_norm": 0.0028714430518448353,
"learning_rate": 5.756945699611302e-06,
"loss": 0.0027,
"step": 1970
},
{
"epoch": 1.3580246913580247,
"grad_norm": 0.0008589240605942905,
"learning_rate": 5.6488388120241e-06,
"loss": 0.0036,
"step": 1980
},
{
"epoch": 1.364883401920439,
"grad_norm": 0.018008651211857796,
"learning_rate": 5.541355619157981e-06,
"loss": 0.004,
"step": 1990
},
{
"epoch": 1.3717421124828533,
"grad_norm": 0.1377667784690857,
"learning_rate": 5.434511527635935e-06,
"loss": 0.032,
"step": 2000
},
{
"epoch": 1.3786008230452675,
"grad_norm": 0.0013193864142522216,
"learning_rate": 5.328321852472269e-06,
"loss": 0.0044,
"step": 2010
},
{
"epoch": 1.3854595336076818,
"grad_norm": 0.0013291804352775216,
"learning_rate": 5.22280181487737e-06,
"loss": 0.0006,
"step": 2020
},
{
"epoch": 1.392318244170096,
"grad_norm": 0.0038565269205719233,
"learning_rate": 5.117966540075874e-06,
"loss": 0.0008,
"step": 2030
},
{
"epoch": 1.3991769547325104,
"grad_norm": 0.08756982535123825,
"learning_rate": 5.013831055138636e-06,
"loss": 0.0056,
"step": 2040
},
{
"epoch": 1.4060356652949246,
"grad_norm": 0.16994261741638184,
"learning_rate": 4.91041028682875e-06,
"loss": 0.0055,
"step": 2050
},
{
"epoch": 1.412894375857339,
"grad_norm": 0.0005570650682784617,
"learning_rate": 4.8077190594619425e-06,
"loss": 0.0012,
"step": 2060
},
{
"epoch": 1.4197530864197532,
"grad_norm": 0.04777060076594353,
"learning_rate": 4.705772092781675e-06,
"loss": 0.001,
"step": 2070
},
{
"epoch": 1.4266117969821672,
"grad_norm": 0.017951903864741325,
"learning_rate": 4.604583999849193e-06,
"loss": 0.0014,
"step": 2080
},
{
"epoch": 1.4334705075445817,
"grad_norm": 0.0037813596427440643,
"learning_rate": 4.504169284948909e-06,
"loss": 0.0004,
"step": 2090
},
{
"epoch": 1.4403292181069958,
"grad_norm": 0.09480316936969757,
"learning_rate": 4.40454234150936e-06,
"loss": 0.001,
"step": 2100
},
{
"epoch": 1.4471879286694103,
"grad_norm": 0.012794774025678635,
"learning_rate": 4.30571745004005e-06,
"loss": 0.0005,
"step": 2110
},
{
"epoch": 1.4540466392318243,
"grad_norm": 0.06668855994939804,
"learning_rate": 4.207708776084486e-06,
"loss": 0.0035,
"step": 2120
},
{
"epoch": 1.4609053497942388,
"grad_norm": 0.0008814105531200767,
"learning_rate": 4.110530368189695e-06,
"loss": 0.0036,
"step": 2130
},
{
"epoch": 1.4677640603566529,
"grad_norm": 0.003472542390227318,
"learning_rate": 4.014196155892503e-06,
"loss": 0.0001,
"step": 2140
},
{
"epoch": 1.4746227709190673,
"grad_norm": 0.010452075861394405,
"learning_rate": 3.9187199477228764e-06,
"loss": 0.0002,
"step": 2150
},
{
"epoch": 1.4814814814814814,
"grad_norm": 0.0006203448283486068,
"learning_rate": 3.824115429224625e-06,
"loss": 0.0019,
"step": 2160
},
{
"epoch": 1.4883401920438957,
"grad_norm": 0.0003119745524600148,
"learning_rate": 3.7303961609936933e-06,
"loss": 0.0026,
"step": 2170
},
{
"epoch": 1.49519890260631,
"grad_norm": 0.0362611822783947,
"learning_rate": 3.6375755767344047e-06,
"loss": 0.0003,
"step": 2180
},
{
"epoch": 1.5020576131687244,
"grad_norm": 0.2301040142774582,
"learning_rate": 3.5456669813338684e-06,
"loss": 0.0021,
"step": 2190
},
{
"epoch": 1.5089163237311385,
"grad_norm": 0.004565828945487738,
"learning_rate": 3.4546835489548647e-06,
"loss": 0.0111,
"step": 2200
},
{
"epoch": 1.5157750342935528,
"grad_norm": 0.03340575471520424,
"learning_rate": 3.3646383211474633e-06,
"loss": 0.0007,
"step": 2210
},
{
"epoch": 1.522633744855967,
"grad_norm": 0.0012551895342767239,
"learning_rate": 3.275544204979643e-06,
"loss": 0.0019,
"step": 2220
},
{
"epoch": 1.5294924554183813,
"grad_norm": 0.0014971940545365214,
"learning_rate": 3.187413971187198e-06,
"loss": 0.0029,
"step": 2230
},
{
"epoch": 1.5363511659807956,
"grad_norm": 0.009951584972441196,
"learning_rate": 3.1002602523431792e-06,
"loss": 0.0003,
"step": 2240
},
{
"epoch": 1.5432098765432098,
"grad_norm": 0.019237512722611427,
"learning_rate": 3.0140955410471606e-06,
"loss": 0.0054,
"step": 2250
},
{
"epoch": 1.5500685871056241,
"grad_norm": 0.5323840975761414,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.0081,
"step": 2260
},
{
"epoch": 1.5569272976680384,
"grad_norm": 0.045489732176065445,
"learning_rate": 2.8447824009061185e-06,
"loss": 0.0007,
"step": 2270
},
{
"epoch": 1.5637860082304527,
"grad_norm": 0.0036737327463924885,
"learning_rate": 2.7616582413784465e-06,
"loss": 0.0021,
"step": 2280
},
{
"epoch": 1.570644718792867,
"grad_norm": 0.157160222530365,
"learning_rate": 2.679571624554709e-06,
"loss": 0.0033,
"step": 2290
},
{
"epoch": 1.5775034293552812,
"grad_norm": 0.037767913192510605,
"learning_rate": 2.5985343167169174e-06,
"loss": 0.0053,
"step": 2300
},
{
"epoch": 1.5843621399176955,
"grad_norm": 0.0038649821653962135,
"learning_rate": 2.5185579337392964e-06,
"loss": 0.0007,
"step": 2310
},
{
"epoch": 1.5912208504801097,
"grad_norm": 0.004294196609407663,
"learning_rate": 2.439653939423283e-06,
"loss": 0.0012,
"step": 2320
},
{
"epoch": 1.598079561042524,
"grad_norm": 0.0028122446965426207,
"learning_rate": 2.3618336438542977e-06,
"loss": 0.0,
"step": 2330
},
{
"epoch": 1.6049382716049383,
"grad_norm": 0.0009102143230848014,
"learning_rate": 2.2851082017805704e-06,
"loss": 0.0001,
"step": 2340
},
{
"epoch": 1.6117969821673526,
"grad_norm": 0.031087348237633705,
"learning_rate": 2.2094886110142065e-06,
"loss": 0.0007,
"step": 2350
},
{
"epoch": 1.6186556927297668,
"grad_norm": 0.009288856759667397,
"learning_rate": 2.13498571085477e-06,
"loss": 0.0012,
"step": 2360
},
{
"epoch": 1.625514403292181,
"grad_norm": 0.02940617874264717,
"learning_rate": 2.0616101805355814e-06,
"loss": 0.0172,
"step": 2370
},
{
"epoch": 1.6323731138545954,
"grad_norm": 0.29882147908210754,
"learning_rate": 1.9893725376929506e-06,
"loss": 0.0042,
"step": 2380
},
{
"epoch": 1.6392318244170097,
"grad_norm": 0.08803040534257889,
"learning_rate": 1.918283136858595e-06,
"loss": 0.005,
"step": 2390
},
{
"epoch": 1.646090534979424,
"grad_norm": 0.0005768566625192761,
"learning_rate": 1.8483521679754046e-06,
"loss": 0.001,
"step": 2400
},
{
"epoch": 1.652949245541838,
"grad_norm": 0.13397887349128723,
"learning_rate": 1.7795896549368308e-06,
"loss": 0.0008,
"step": 2410
},
{
"epoch": 1.6598079561042525,
"grad_norm": 0.08956614136695862,
"learning_rate": 1.7120054541500552e-06,
"loss": 0.0012,
"step": 2420
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.0006750011234544218,
"learning_rate": 1.6456092531231816e-06,
"loss": 0.0008,
"step": 2430
},
{
"epoch": 1.673525377229081,
"grad_norm": 0.0019244247814640403,
"learning_rate": 1.5804105690766224e-06,
"loss": 0.0021,
"step": 2440
},
{
"epoch": 1.680384087791495,
"grad_norm": 0.013517620973289013,
"learning_rate": 1.516418747578906e-06,
"loss": 0.0018,
"step": 2450
},
{
"epoch": 1.6872427983539096,
"grad_norm": 0.01616906374692917,
"learning_rate": 1.4536429612070846e-06,
"loss": 0.0012,
"step": 2460
},
{
"epoch": 1.6941015089163236,
"grad_norm": 0.001243108999915421,
"learning_rate": 1.3920922082319355e-06,
"loss": 0.002,
"step": 2470
},
{
"epoch": 1.700960219478738,
"grad_norm": 0.17654798924922943,
"learning_rate": 1.3317753113281562e-06,
"loss": 0.0019,
"step": 2480
},
{
"epoch": 1.7078189300411522,
"grad_norm": 0.000989454216323793,
"learning_rate": 1.272700916309718e-06,
"loss": 0.0021,
"step": 2490
},
{
"epoch": 1.7146776406035666,
"grad_norm": 0.002051191870123148,
"learning_rate": 1.2148774908905782e-06,
"loss": 0.0016,
"step": 2500
},
{
"epoch": 1.7215363511659807,
"grad_norm": 0.0013189888559281826,
"learning_rate": 1.1583133234709198e-06,
"loss": 0.0012,
"step": 2510
},
{
"epoch": 1.7283950617283952,
"grad_norm": 0.08241615444421768,
"learning_rate": 1.103016521949093e-06,
"loss": 0.0006,
"step": 2520
},
{
"epoch": 1.7352537722908092,
"grad_norm": 0.0015023777959868312,
"learning_rate": 1.0489950125594351e-06,
"loss": 0.0003,
"step": 2530
},
{
"epoch": 1.7421124828532237,
"grad_norm": 0.025439105927944183,
"learning_rate": 9.962565387361167e-07,
"loss": 0.0013,
"step": 2540
},
{
"epoch": 1.7489711934156378,
"grad_norm": 0.0018972799880430102,
"learning_rate": 9.448086600032047e-07,
"loss": 0.0087,
"step": 2550
},
{
"epoch": 1.7558299039780523,
"grad_norm": 0.003276234259828925,
"learning_rate": 8.946587508910798e-07,
"loss": 0.0133,
"step": 2560
},
{
"epoch": 1.7626886145404663,
"grad_norm": 0.04291309043765068,
"learning_rate": 8.458139998793779e-07,
"loss": 0.0001,
"step": 2570
},
{
"epoch": 1.7695473251028808,
"grad_norm": 0.01314778346568346,
"learning_rate": 7.982814083665825e-07,
"loss": 0.0014,
"step": 2580
},
{
"epoch": 1.7764060356652949,
"grad_norm": 0.020097751170396805,
"learning_rate": 7.520677896664586e-07,
"loss": 0.002,
"step": 2590
},
{
"epoch": 1.7832647462277091,
"grad_norm": 0.002437903080135584,
"learning_rate": 7.07179768031424e-07,
"loss": 0.0008,
"step": 2600
},
{
"epoch": 1.7901234567901234,
"grad_norm": 0.003759504295885563,
"learning_rate": 6.636237777030341e-07,
"loss": 0.0032,
"step": 2610
},
{
"epoch": 1.7969821673525377,
"grad_norm": 0.0019247238524258137,
"learning_rate": 6.214060619897011e-07,
"loss": 0.0006,
"step": 2620
},
{
"epoch": 1.803840877914952,
"grad_norm": 0.48312854766845703,
"learning_rate": 5.805326723717741e-07,
"loss": 0.0058,
"step": 2630
},
{
"epoch": 1.8106995884773662,
"grad_norm": 0.0015425217570737004,
"learning_rate": 5.410094676341237e-07,
"loss": 0.0005,
"step": 2640
},
{
"epoch": 1.8175582990397805,
"grad_norm": 0.39914342761039734,
"learning_rate": 5.028421130263416e-07,
"loss": 0.0036,
"step": 2650
},
{
"epoch": 1.8244170096021948,
"grad_norm": 0.042819537222385406,
"learning_rate": 4.660360794506946e-07,
"loss": 0.004,
"step": 2660
},
{
"epoch": 1.831275720164609,
"grad_norm": 0.16416482627391815,
"learning_rate": 4.305966426779118e-07,
"loss": 0.0061,
"step": 2670
},
{
"epoch": 1.8381344307270233,
"grad_norm": 0.04438329488039017,
"learning_rate": 3.9652888259096635e-07,
"loss": 0.0024,
"step": 2680
},
{
"epoch": 1.8449931412894376,
"grad_norm": 0.16208341717720032,
"learning_rate": 3.6383768245692453e-07,
"loss": 0.001,
"step": 2690
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.07959479838609695,
"learning_rate": 3.3252772822697565e-07,
"loss": 0.0011,
"step": 2700
},
{
"epoch": 1.8587105624142661,
"grad_norm": 0.04148540273308754,
"learning_rate": 3.026035078647549e-07,
"loss": 0.0052,
"step": 2710
},
{
"epoch": 1.8655692729766804,
"grad_norm": 0.0011746763484552503,
"learning_rate": 2.740693107030301e-07,
"loss": 0.0011,
"step": 2720
},
{
"epoch": 1.8724279835390947,
"grad_norm": 0.003397882217541337,
"learning_rate": 2.4692922682887923e-07,
"loss": 0.0013,
"step": 2730
},
{
"epoch": 1.879286694101509,
"grad_norm": 0.0028587563429027796,
"learning_rate": 2.2118714649740912e-07,
"loss": 0.0023,
"step": 2740
},
{
"epoch": 1.8861454046639232,
"grad_norm": 0.45575016736984253,
"learning_rate": 1.9684675957413414e-07,
"loss": 0.012,
"step": 2750
},
{
"epoch": 1.8930041152263375,
"grad_norm": 0.00040283441194333136,
"learning_rate": 1.739115550060688e-07,
"loss": 0.0009,
"step": 2760
},
{
"epoch": 1.8998628257887518,
"grad_norm": 0.0022473863791674376,
"learning_rate": 1.5238482032162162e-07,
"loss": 0.0019,
"step": 2770
},
{
"epoch": 1.906721536351166,
"grad_norm": 0.0015027286717668176,
"learning_rate": 1.3226964115936046e-07,
"loss": 0.0026,
"step": 2780
},
{
"epoch": 1.9135802469135803,
"grad_norm": 0.0023126809392124414,
"learning_rate": 1.1356890082572459e-07,
"loss": 0.0018,
"step": 2790
},
{
"epoch": 1.9204389574759944,
"grad_norm": 0.050206076353788376,
"learning_rate": 9.628527988172154e-08,
"loss": 0.0123,
"step": 2800
},
{
"epoch": 1.9272976680384089,
"grad_norm": 0.000970209832303226,
"learning_rate": 8.042125575870362e-08,
"loss": 0.0013,
"step": 2810
},
{
"epoch": 1.934156378600823,
"grad_norm": 0.002282192464917898,
"learning_rate": 6.597910240324967e-08,
"loss": 0.0021,
"step": 2820
},
{
"epoch": 1.9410150891632374,
"grad_norm": 0.00975970458239317,
"learning_rate": 5.296088995122017e-08,
"loss": 0.0039,
"step": 2830
},
{
"epoch": 1.9478737997256514,
"grad_norm": 0.010269064456224442,
"learning_rate": 4.1368484431023593e-08,
"loss": 0.0022,
"step": 2840
},
{
"epoch": 1.954732510288066,
"grad_norm": 0.0009142697090283036,
"learning_rate": 3.1203547496140295e-08,
"loss": 0.0028,
"step": 2850
},
{
"epoch": 1.96159122085048,
"grad_norm": 0.002814473118633032,
"learning_rate": 2.2467536186937532e-08,
"loss": 0.0041,
"step": 2860
},
{
"epoch": 1.9684499314128945,
"grad_norm": 0.0822644829750061,
"learning_rate": 1.516170272182538e-08,
"loss": 0.0163,
"step": 2870
},
{
"epoch": 1.9753086419753085,
"grad_norm": 0.08335670083761215,
"learning_rate": 9.287094317756985e-09,
"loss": 0.0005,
"step": 2880
},
{
"epoch": 1.982167352537723,
"grad_norm": 0.00891471654176712,
"learning_rate": 4.844553040125322e-09,
"loss": 0.0037,
"step": 2890
},
{
"epoch": 1.989026063100137,
"grad_norm": 0.014899800531566143,
"learning_rate": 1.8347156820563983e-09,
"loss": 0.0006,
"step": 2900
},
{
"epoch": 1.9958847736625516,
"grad_norm": 0.002669480862095952,
"learning_rate": 2.5801367313782464e-10,
"loss": 0.0022,
"step": 2910
},
{
"epoch": 2.0,
"step": 2916,
"total_flos": 6.452437106693243e+17,
"train_loss": 0.005305082097742036,
"train_runtime": 10141.6254,
"train_samples_per_second": 4.6,
"train_steps_per_second": 0.288
}
],
"logging_steps": 10,
"max_steps": 2916,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.452437106693243e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}