diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21701 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 100.0, + "eval_steps": 20000, + "global_step": 309400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03232062055591468, + "grad_norm": 9.332721710205078, + "learning_rate": 9.900000000000002e-06, + "loss": 4.2426, + "step": 100 + }, + { + "epoch": 0.06464124111182935, + "grad_norm": 8.882152557373047, + "learning_rate": 1.9900000000000003e-05, + "loss": 4.0765, + "step": 200 + }, + { + "epoch": 0.09696186166774402, + "grad_norm": 15.724532127380371, + "learning_rate": 2.9900000000000002e-05, + "loss": 4.0268, + "step": 300 + }, + { + "epoch": 0.1292824822236587, + "grad_norm": 7.888890266418457, + "learning_rate": 3.99e-05, + "loss": 3.9902, + "step": 400 + }, + { + "epoch": 0.16160310277957338, + "grad_norm": 10.950591087341309, + "learning_rate": 4.99e-05, + "loss": 3.9292, + "step": 500 + }, + { + "epoch": 0.19392372333548805, + "grad_norm": 16.338972091674805, + "learning_rate": 5.9900000000000006e-05, + "loss": 3.8515, + "step": 600 + }, + { + "epoch": 0.22624434389140272, + "grad_norm": 5.990420818328857, + "learning_rate": 6.99e-05, + "loss": 3.8284, + "step": 700 + }, + { + "epoch": 0.2585649644473174, + "grad_norm": 9.774703025817871, + "learning_rate": 7.99e-05, + "loss": 3.7876, + "step": 800 + }, + { + "epoch": 0.2908855850032321, + "grad_norm": 7.742414951324463, + "learning_rate": 8.989999999999999e-05, + "loss": 3.7472, + "step": 900 + }, + { + "epoch": 0.32320620555914675, + "grad_norm": 4.4023284912109375, + "learning_rate": 9.99e-05, + "loss": 3.7163, + "step": 1000 + }, + { + "epoch": 0.3555268261150614, + "grad_norm": 4.680882453918457, + "learning_rate": 0.0001099, + "loss": 3.6804, + "step": 1100 + }, + { + "epoch": 0.3878474466709761, + "grad_norm": 4.001336574554443, + "learning_rate": 0.00011990000000000001, + "loss": 3.6462, + "step": 1200 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 5.962468147277832, + "learning_rate": 0.00012989999999999999, + "loss": 3.6611, + "step": 1300 + }, + { + "epoch": 0.45248868778280543, + "grad_norm": 3.200063467025757, + "learning_rate": 0.0001399, + "loss": 3.6355, + "step": 1400 + }, + { + "epoch": 0.4848093083387201, + "grad_norm": 4.732622146606445, + "learning_rate": 0.0001499, + "loss": 3.6127, + "step": 1500 + }, + { + "epoch": 0.5171299288946348, + "grad_norm": 3.1570827960968018, + "learning_rate": 0.00015989999999999998, + "loss": 3.6142, + "step": 1600 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 4.7870073318481445, + "learning_rate": 0.0001699, + "loss": 3.5934, + "step": 1700 + }, + { + "epoch": 0.5817711700064642, + "grad_norm": 3.6958024501800537, + "learning_rate": 0.0001799, + "loss": 3.5961, + "step": 1800 + }, + { + "epoch": 0.6140917905623788, + "grad_norm": 3.7356350421905518, + "learning_rate": 0.0001899, + "loss": 3.6097, + "step": 1900 + }, + { + "epoch": 0.6464124111182935, + "grad_norm": 3.8976237773895264, + "learning_rate": 0.0001999, + "loss": 3.5555, + "step": 2000 + }, + { + "epoch": 0.6787330316742082, + "grad_norm": 3.5063016414642334, + "learning_rate": 0.0002099, + "loss": 3.5528, + "step": 2100 + }, + { + "epoch": 0.7110536522301228, + "grad_norm": 3.2186264991760254, + "learning_rate": 0.0002199, + "loss": 3.5596, + "step": 2200 + }, + { + "epoch": 0.7433742727860375, + "grad_norm": 3.2118473052978516, + "learning_rate": 0.0002299, + "loss": 3.5283, + "step": 2300 + }, + { + "epoch": 0.7756948933419522, + "grad_norm": 3.3671634197235107, + "learning_rate": 0.0002399, + "loss": 3.5257, + "step": 2400 + }, + { + "epoch": 0.8080155138978669, + "grad_norm": 4.391268730163574, + "learning_rate": 0.0002499, + "loss": 3.5231, + "step": 2500 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 2.7105960845947266, + "learning_rate": 0.00025990000000000003, + "loss": 3.522, + "step": 2600 + }, + { + "epoch": 0.8726567550096962, + "grad_norm": 3.373960494995117, + "learning_rate": 0.0002699, + "loss": 3.5354, + "step": 2700 + }, + { + "epoch": 0.9049773755656109, + "grad_norm": 2.757404088973999, + "learning_rate": 0.0002799, + "loss": 3.5267, + "step": 2800 + }, + { + "epoch": 0.9372979961215255, + "grad_norm": 4.519193649291992, + "learning_rate": 0.0002899, + "loss": 3.501, + "step": 2900 + }, + { + "epoch": 0.9696186166774402, + "grad_norm": 4.307316780090332, + "learning_rate": 0.0002999, + "loss": 3.4861, + "step": 3000 + }, + { + "epoch": 1.0019392372333549, + "grad_norm": 3.6178064346313477, + "learning_rate": 0.0003099, + "loss": 3.4992, + "step": 3100 + }, + { + "epoch": 1.0342598577892697, + "grad_norm": 1.7168558835983276, + "learning_rate": 0.0003199, + "loss": 3.4419, + "step": 3200 + }, + { + "epoch": 1.0665804783451842, + "grad_norm": 1.5993854999542236, + "learning_rate": 0.00032990000000000005, + "loss": 3.4434, + "step": 3300 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 1.2065600156784058, + "learning_rate": 0.00033989999999999997, + "loss": 3.4292, + "step": 3400 + }, + { + "epoch": 1.1312217194570136, + "grad_norm": 1.1300657987594604, + "learning_rate": 0.0003499, + "loss": 3.4418, + "step": 3500 + }, + { + "epoch": 1.1635423400129283, + "grad_norm": 2.4604320526123047, + "learning_rate": 0.0003599, + "loss": 3.4454, + "step": 3600 + }, + { + "epoch": 1.195862960568843, + "grad_norm": 5.035538196563721, + "learning_rate": 0.0003699, + "loss": 3.4292, + "step": 3700 + }, + { + "epoch": 1.2281835811247577, + "grad_norm": 1.4227688312530518, + "learning_rate": 0.0003799, + "loss": 3.4257, + "step": 3800 + }, + { + "epoch": 1.2605042016806722, + "grad_norm": 1.5160913467407227, + "learning_rate": 0.00038990000000000004, + "loss": 3.4242, + "step": 3900 + }, + { + "epoch": 1.292824822236587, + "grad_norm": 1.1818920373916626, + "learning_rate": 0.00039989999999999996, + "loss": 3.4004, + "step": 4000 + }, + { + "epoch": 1.3251454427925016, + "grad_norm": 1.411624550819397, + "learning_rate": 0.0004099, + "loss": 3.4166, + "step": 4100 + }, + { + "epoch": 1.3574660633484164, + "grad_norm": 1.5991268157958984, + "learning_rate": 0.0004199, + "loss": 3.4149, + "step": 4200 + }, + { + "epoch": 1.389786683904331, + "grad_norm": 1.228127360343933, + "learning_rate": 0.0004299, + "loss": 3.4229, + "step": 4300 + }, + { + "epoch": 1.4221073044602457, + "grad_norm": 7.947666645050049, + "learning_rate": 0.0004399, + "loss": 3.4058, + "step": 4400 + }, + { + "epoch": 1.4544279250161603, + "grad_norm": 1.4560375213623047, + "learning_rate": 0.00044990000000000004, + "loss": 3.3876, + "step": 4500 + }, + { + "epoch": 1.486748545572075, + "grad_norm": 1.2084722518920898, + "learning_rate": 0.0004599, + "loss": 3.3963, + "step": 4600 + }, + { + "epoch": 1.5190691661279896, + "grad_norm": 1.5284830331802368, + "learning_rate": 0.0004699, + "loss": 3.4088, + "step": 4700 + }, + { + "epoch": 1.5513897866839044, + "grad_norm": 1.1983979940414429, + "learning_rate": 0.0004799, + "loss": 3.4061, + "step": 4800 + }, + { + "epoch": 1.5837104072398192, + "grad_norm": 1.314408540725708, + "learning_rate": 0.0004899, + "loss": 3.3987, + "step": 4900 + }, + { + "epoch": 1.6160310277957337, + "grad_norm": 1.5176293849945068, + "learning_rate": 0.0004999000000000001, + "loss": 3.366, + "step": 5000 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 1.500085711479187, + "learning_rate": 0.0005099, + "loss": 3.3984, + "step": 5100 + }, + { + "epoch": 1.680672268907563, + "grad_norm": 1.016550898551941, + "learning_rate": 0.0005199, + "loss": 3.3626, + "step": 5200 + }, + { + "epoch": 1.7129928894634778, + "grad_norm": 13.165894508361816, + "learning_rate": 0.0005299, + "loss": 3.3609, + "step": 5300 + }, + { + "epoch": 1.7453135100193924, + "grad_norm": 1.186579942703247, + "learning_rate": 0.0005399000000000001, + "loss": 3.3672, + "step": 5400 + }, + { + "epoch": 1.777634130575307, + "grad_norm": 1.2896537780761719, + "learning_rate": 0.0005499000000000001, + "loss": 3.3825, + "step": 5500 + }, + { + "epoch": 1.8099547511312217, + "grad_norm": 1.2675527334213257, + "learning_rate": 0.0005599, + "loss": 3.3698, + "step": 5600 + }, + { + "epoch": 1.8422753716871365, + "grad_norm": 1.1798584461212158, + "learning_rate": 0.0005698999999999999, + "loss": 3.3624, + "step": 5700 + }, + { + "epoch": 1.874595992243051, + "grad_norm": 0.8817252516746521, + "learning_rate": 0.0005799, + "loss": 3.3503, + "step": 5800 + }, + { + "epoch": 1.9069166127989656, + "grad_norm": 1.2770187854766846, + "learning_rate": 0.0005899, + "loss": 3.3655, + "step": 5900 + }, + { + "epoch": 1.9392372333548804, + "grad_norm": 1.062826156616211, + "learning_rate": 0.0005999, + "loss": 3.3595, + "step": 6000 + }, + { + "epoch": 1.9715578539107952, + "grad_norm": 0.97618567943573, + "learning_rate": 0.0006099, + "loss": 3.365, + "step": 6100 + }, + { + "epoch": 2.0038784744667097, + "grad_norm": 0.8138112425804138, + "learning_rate": 0.0006199, + "loss": 3.3674, + "step": 6200 + }, + { + "epoch": 2.0361990950226243, + "grad_norm": 0.8098726272583008, + "learning_rate": 0.0006299000000000001, + "loss": 3.255, + "step": 6300 + }, + { + "epoch": 2.0685197155785393, + "grad_norm": 1.1858711242675781, + "learning_rate": 0.0006399, + "loss": 3.2813, + "step": 6400 + }, + { + "epoch": 2.100840336134454, + "grad_norm": 0.9740011692047119, + "learning_rate": 0.0006499, + "loss": 3.2799, + "step": 6500 + }, + { + "epoch": 2.1331609566903684, + "grad_norm": 1.2355788946151733, + "learning_rate": 0.0006599, + "loss": 3.2678, + "step": 6600 + }, + { + "epoch": 2.165481577246283, + "grad_norm": 5.133415222167969, + "learning_rate": 0.0006699000000000001, + "loss": 3.2945, + "step": 6700 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 1.0777193307876587, + "learning_rate": 0.0006799, + "loss": 3.3022, + "step": 6800 + }, + { + "epoch": 2.2301228183581125, + "grad_norm": 0.9968545436859131, + "learning_rate": 0.0006899, + "loss": 3.2852, + "step": 6900 + }, + { + "epoch": 2.262443438914027, + "grad_norm": 1.0664645433425903, + "learning_rate": 0.0006998999999999999, + "loss": 3.2945, + "step": 7000 + }, + { + "epoch": 2.2947640594699417, + "grad_norm": 0.9292928576469421, + "learning_rate": 0.0007099, + "loss": 3.2809, + "step": 7100 + }, + { + "epoch": 2.3270846800258567, + "grad_norm": 0.9592123627662659, + "learning_rate": 0.0007199, + "loss": 3.2944, + "step": 7200 + }, + { + "epoch": 2.3594053005817712, + "grad_norm": 1.028623342514038, + "learning_rate": 0.0007299, + "loss": 3.2946, + "step": 7300 + }, + { + "epoch": 2.391725921137686, + "grad_norm": 4.865314483642578, + "learning_rate": 0.0007399, + "loss": 3.296, + "step": 7400 + }, + { + "epoch": 2.4240465416936003, + "grad_norm": 1.256349802017212, + "learning_rate": 0.0007499000000000001, + "loss": 3.2976, + "step": 7500 + }, + { + "epoch": 2.4563671622495153, + "grad_norm": 1.0199131965637207, + "learning_rate": 0.0007599, + "loss": 3.2802, + "step": 7600 + }, + { + "epoch": 2.48868778280543, + "grad_norm": 1.946007251739502, + "learning_rate": 0.0007699, + "loss": 3.2834, + "step": 7700 + }, + { + "epoch": 2.5210084033613445, + "grad_norm": 0.9734399914741516, + "learning_rate": 0.0007799, + "loss": 3.2793, + "step": 7800 + }, + { + "epoch": 2.553329023917259, + "grad_norm": 0.9436636567115784, + "learning_rate": 0.0007899000000000001, + "loss": 3.29, + "step": 7900 + }, + { + "epoch": 2.585649644473174, + "grad_norm": 0.9262025952339172, + "learning_rate": 0.0007999000000000001, + "loss": 3.2756, + "step": 8000 + }, + { + "epoch": 2.6179702650290886, + "grad_norm": 1.195101022720337, + "learning_rate": 0.0008099, + "loss": 3.2799, + "step": 8100 + }, + { + "epoch": 2.650290885585003, + "grad_norm": 0.9717804193496704, + "learning_rate": 0.0008198999999999999, + "loss": 3.2714, + "step": 8200 + }, + { + "epoch": 2.682611506140918, + "grad_norm": 1.1211719512939453, + "learning_rate": 0.0008299, + "loss": 3.287, + "step": 8300 + }, + { + "epoch": 2.7149321266968327, + "grad_norm": 1.057012915611267, + "learning_rate": 0.0008399, + "loss": 3.2752, + "step": 8400 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 1.0968471765518188, + "learning_rate": 0.0008499, + "loss": 3.2719, + "step": 8500 + }, + { + "epoch": 2.779573367808662, + "grad_norm": 1.0198901891708374, + "learning_rate": 0.0008599, + "loss": 3.2522, + "step": 8600 + }, + { + "epoch": 2.8118939883645764, + "grad_norm": 1.330259919166565, + "learning_rate": 0.0008699000000000001, + "loss": 3.2589, + "step": 8700 + }, + { + "epoch": 2.8442146089204914, + "grad_norm": 0.8509685397148132, + "learning_rate": 0.0008799000000000001, + "loss": 3.2736, + "step": 8800 + }, + { + "epoch": 2.876535229476406, + "grad_norm": 1.174782633781433, + "learning_rate": 0.0008899, + "loss": 3.2597, + "step": 8900 + }, + { + "epoch": 2.9088558500323205, + "grad_norm": 1.1556833982467651, + "learning_rate": 0.0008999, + "loss": 3.2822, + "step": 9000 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.1285648345947266, + "learning_rate": 0.0009099, + "loss": 3.2897, + "step": 9100 + }, + { + "epoch": 2.97349709114415, + "grad_norm": 0.9292157292366028, + "learning_rate": 0.0009199000000000001, + "loss": 3.2389, + "step": 9200 + }, + { + "epoch": 3.0058177117000646, + "grad_norm": 0.9960983991622925, + "learning_rate": 0.0009299, + "loss": 3.2608, + "step": 9300 + }, + { + "epoch": 3.038138332255979, + "grad_norm": 1.0418298244476318, + "learning_rate": 0.0009399, + "loss": 3.1552, + "step": 9400 + }, + { + "epoch": 3.070458952811894, + "grad_norm": 0.8377931714057922, + "learning_rate": 0.0009498999999999999, + "loss": 3.1847, + "step": 9500 + }, + { + "epoch": 3.1027795733678087, + "grad_norm": 1.1886683702468872, + "learning_rate": 0.0009599, + "loss": 3.1789, + "step": 9600 + }, + { + "epoch": 3.1351001939237233, + "grad_norm": 0.9381577968597412, + "learning_rate": 0.0009699, + "loss": 3.166, + "step": 9700 + }, + { + "epoch": 3.167420814479638, + "grad_norm": 0.9787984490394592, + "learning_rate": 0.0009799, + "loss": 3.1831, + "step": 9800 + }, + { + "epoch": 3.199741435035553, + "grad_norm": 2.247471332550049, + "learning_rate": 0.0009899, + "loss": 3.2012, + "step": 9900 + }, + { + "epoch": 3.2320620555914674, + "grad_norm": 0.855204701423645, + "learning_rate": 0.0009999, + "loss": 3.1842, + "step": 10000 + }, + { + "epoch": 3.264382676147382, + "grad_norm": 1.1390490531921387, + "learning_rate": 0.001, + "loss": 3.1923, + "step": 10100 + }, + { + "epoch": 3.2967032967032965, + "grad_norm": 1.1109646558761597, + "learning_rate": 0.001, + "loss": 3.192, + "step": 10200 + }, + { + "epoch": 3.3290239172592115, + "grad_norm": 1.2080135345458984, + "learning_rate": 0.001, + "loss": 3.1771, + "step": 10300 + }, + { + "epoch": 3.361344537815126, + "grad_norm": 1.106696367263794, + "learning_rate": 0.001, + "loss": 3.1865, + "step": 10400 + }, + { + "epoch": 3.3936651583710407, + "grad_norm": 1.0035741329193115, + "learning_rate": 0.001, + "loss": 3.1756, + "step": 10500 + }, + { + "epoch": 3.425985778926955, + "grad_norm": 1.0501046180725098, + "learning_rate": 0.001, + "loss": 3.1716, + "step": 10600 + }, + { + "epoch": 3.45830639948287, + "grad_norm": 0.8912081122398376, + "learning_rate": 0.001, + "loss": 3.1657, + "step": 10700 + }, + { + "epoch": 3.490627020038785, + "grad_norm": 1.302748680114746, + "learning_rate": 0.001, + "loss": 3.1769, + "step": 10800 + }, + { + "epoch": 3.5229476405946993, + "grad_norm": 0.816489577293396, + "learning_rate": 0.001, + "loss": 3.201, + "step": 10900 + }, + { + "epoch": 3.555268261150614, + "grad_norm": 1.2402598857879639, + "learning_rate": 0.001, + "loss": 3.1804, + "step": 11000 + }, + { + "epoch": 3.587588881706529, + "grad_norm": 1.3531599044799805, + "learning_rate": 0.001, + "loss": 3.1921, + "step": 11100 + }, + { + "epoch": 3.6199095022624435, + "grad_norm": 2.2036519050598145, + "learning_rate": 0.001, + "loss": 3.1774, + "step": 11200 + }, + { + "epoch": 3.652230122818358, + "grad_norm": 1.7961952686309814, + "learning_rate": 0.001, + "loss": 3.1717, + "step": 11300 + }, + { + "epoch": 3.684550743374273, + "grad_norm": 1.176538348197937, + "learning_rate": 0.001, + "loss": 3.171, + "step": 11400 + }, + { + "epoch": 3.7168713639301876, + "grad_norm": 1.1532666683197021, + "learning_rate": 0.001, + "loss": 3.1913, + "step": 11500 + }, + { + "epoch": 3.749191984486102, + "grad_norm": 0.9709277749061584, + "learning_rate": 0.001, + "loss": 3.1926, + "step": 11600 + }, + { + "epoch": 3.7815126050420167, + "grad_norm": 1.0044294595718384, + "learning_rate": 0.001, + "loss": 3.19, + "step": 11700 + }, + { + "epoch": 3.8138332255979313, + "grad_norm": 0.8759526610374451, + "learning_rate": 0.001, + "loss": 3.1719, + "step": 11800 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 0.9571463465690613, + "learning_rate": 0.001, + "loss": 3.1798, + "step": 11900 + }, + { + "epoch": 3.878474466709761, + "grad_norm": 1.0519996881484985, + "learning_rate": 0.001, + "loss": 3.1891, + "step": 12000 + }, + { + "epoch": 3.9107950872656754, + "grad_norm": 1.0874314308166504, + "learning_rate": 0.001, + "loss": 3.177, + "step": 12100 + }, + { + "epoch": 3.9431157078215904, + "grad_norm": 1.134121298789978, + "learning_rate": 0.001, + "loss": 3.1724, + "step": 12200 + }, + { + "epoch": 3.975436328377505, + "grad_norm": 1.093509554862976, + "learning_rate": 0.001, + "loss": 3.1751, + "step": 12300 + }, + { + "epoch": 4.0077569489334195, + "grad_norm": 0.838979184627533, + "learning_rate": 0.001, + "loss": 3.1506, + "step": 12400 + }, + { + "epoch": 4.040077569489334, + "grad_norm": 0.9417329430580139, + "learning_rate": 0.001, + "loss": 3.0295, + "step": 12500 + }, + { + "epoch": 4.072398190045249, + "grad_norm": 0.941433310508728, + "learning_rate": 0.001, + "loss": 3.0514, + "step": 12600 + }, + { + "epoch": 4.104718810601163, + "grad_norm": 1.1772059202194214, + "learning_rate": 0.001, + "loss": 3.0639, + "step": 12700 + }, + { + "epoch": 4.137039431157079, + "grad_norm": 0.9115270376205444, + "learning_rate": 0.001, + "loss": 3.0563, + "step": 12800 + }, + { + "epoch": 4.169360051712993, + "grad_norm": 1.0796817541122437, + "learning_rate": 0.001, + "loss": 3.07, + "step": 12900 + }, + { + "epoch": 4.201680672268908, + "grad_norm": 0.9457636475563049, + "learning_rate": 0.001, + "loss": 3.0671, + "step": 13000 + }, + { + "epoch": 4.234001292824822, + "grad_norm": 1.6689468622207642, + "learning_rate": 0.001, + "loss": 3.0806, + "step": 13100 + }, + { + "epoch": 4.266321913380737, + "grad_norm": 0.8102026581764221, + "learning_rate": 0.001, + "loss": 3.0564, + "step": 13200 + }, + { + "epoch": 4.298642533936651, + "grad_norm": 0.8251219391822815, + "learning_rate": 0.001, + "loss": 3.0567, + "step": 13300 + }, + { + "epoch": 4.330963154492566, + "grad_norm": 0.921691358089447, + "learning_rate": 0.001, + "loss": 3.0732, + "step": 13400 + }, + { + "epoch": 4.3632837750484805, + "grad_norm": 1.216403841972351, + "learning_rate": 0.001, + "loss": 3.0796, + "step": 13500 + }, + { + "epoch": 4.395604395604396, + "grad_norm": 1.2740323543548584, + "learning_rate": 0.001, + "loss": 3.0667, + "step": 13600 + }, + { + "epoch": 4.4279250161603105, + "grad_norm": 1.0430225133895874, + "learning_rate": 0.001, + "loss": 3.0521, + "step": 13700 + }, + { + "epoch": 4.460245636716225, + "grad_norm": 0.8932771682739258, + "learning_rate": 0.001, + "loss": 3.0704, + "step": 13800 + }, + { + "epoch": 4.49256625727214, + "grad_norm": 1.011426568031311, + "learning_rate": 0.001, + "loss": 3.0857, + "step": 13900 + }, + { + "epoch": 4.524886877828054, + "grad_norm": 1.2008609771728516, + "learning_rate": 0.001, + "loss": 3.0533, + "step": 14000 + }, + { + "epoch": 4.557207498383969, + "grad_norm": 0.847837507724762, + "learning_rate": 0.001, + "loss": 3.0694, + "step": 14100 + }, + { + "epoch": 4.589528118939883, + "grad_norm": 0.951865017414093, + "learning_rate": 0.001, + "loss": 3.084, + "step": 14200 + }, + { + "epoch": 4.621848739495798, + "grad_norm": 0.972174882888794, + "learning_rate": 0.001, + "loss": 3.0808, + "step": 14300 + }, + { + "epoch": 4.654169360051713, + "grad_norm": 1.0798579454421997, + "learning_rate": 0.001, + "loss": 3.0889, + "step": 14400 + }, + { + "epoch": 4.686489980607628, + "grad_norm": 0.7689244747161865, + "learning_rate": 0.001, + "loss": 3.0891, + "step": 14500 + }, + { + "epoch": 4.7188106011635425, + "grad_norm": 1.4271835088729858, + "learning_rate": 0.001, + "loss": 3.0655, + "step": 14600 + }, + { + "epoch": 4.751131221719457, + "grad_norm": 1.0069650411605835, + "learning_rate": 0.001, + "loss": 3.0706, + "step": 14700 + }, + { + "epoch": 4.783451842275372, + "grad_norm": 0.9084206223487854, + "learning_rate": 0.001, + "loss": 3.057, + "step": 14800 + }, + { + "epoch": 4.815772462831286, + "grad_norm": 1.3670860528945923, + "learning_rate": 0.001, + "loss": 3.0592, + "step": 14900 + }, + { + "epoch": 4.848093083387201, + "grad_norm": 0.9387325048446655, + "learning_rate": 0.001, + "loss": 3.0789, + "step": 15000 + }, + { + "epoch": 4.880413703943116, + "grad_norm": 0.8084505200386047, + "learning_rate": 0.001, + "loss": 3.0631, + "step": 15100 + }, + { + "epoch": 4.912734324499031, + "grad_norm": 1.0331807136535645, + "learning_rate": 0.001, + "loss": 3.0723, + "step": 15200 + }, + { + "epoch": 4.945054945054945, + "grad_norm": 0.9408292770385742, + "learning_rate": 0.001, + "loss": 3.0616, + "step": 15300 + }, + { + "epoch": 4.97737556561086, + "grad_norm": 0.9665517807006836, + "learning_rate": 0.001, + "loss": 3.0801, + "step": 15400 + }, + { + "epoch": 5.009696186166774, + "grad_norm": 1.1656768321990967, + "learning_rate": 0.001, + "loss": 3.0375, + "step": 15500 + }, + { + "epoch": 5.042016806722689, + "grad_norm": 0.9300348162651062, + "learning_rate": 0.001, + "loss": 2.9091, + "step": 15600 + }, + { + "epoch": 5.0743374272786035, + "grad_norm": 1.0597182512283325, + "learning_rate": 0.001, + "loss": 2.9504, + "step": 15700 + }, + { + "epoch": 5.106658047834518, + "grad_norm": 1.2280610799789429, + "learning_rate": 0.001, + "loss": 2.9492, + "step": 15800 + }, + { + "epoch": 5.1389786683904335, + "grad_norm": 1.0233289003372192, + "learning_rate": 0.001, + "loss": 2.9551, + "step": 15900 + }, + { + "epoch": 5.171299288946348, + "grad_norm": 0.941676676273346, + "learning_rate": 0.001, + "loss": 2.9642, + "step": 16000 + }, + { + "epoch": 5.203619909502263, + "grad_norm": 0.7993482351303101, + "learning_rate": 0.001, + "loss": 2.9547, + "step": 16100 + }, + { + "epoch": 5.235940530058177, + "grad_norm": 0.9896591305732727, + "learning_rate": 0.001, + "loss": 2.9548, + "step": 16200 + }, + { + "epoch": 5.268261150614092, + "grad_norm": 0.9227080345153809, + "learning_rate": 0.001, + "loss": 2.9812, + "step": 16300 + }, + { + "epoch": 5.300581771170006, + "grad_norm": 1.2044575214385986, + "learning_rate": 0.001, + "loss": 2.9787, + "step": 16400 + }, + { + "epoch": 5.332902391725921, + "grad_norm": 1.003462791442871, + "learning_rate": 0.001, + "loss": 2.9652, + "step": 16500 + }, + { + "epoch": 5.365223012281835, + "grad_norm": 1.0406891107559204, + "learning_rate": 0.001, + "loss": 2.963, + "step": 16600 + }, + { + "epoch": 5.397543632837751, + "grad_norm": 1.0937559604644775, + "learning_rate": 0.001, + "loss": 2.9639, + "step": 16700 + }, + { + "epoch": 5.429864253393665, + "grad_norm": 0.8680944442749023, + "learning_rate": 0.001, + "loss": 2.9819, + "step": 16800 + }, + { + "epoch": 5.46218487394958, + "grad_norm": 0.917489230632782, + "learning_rate": 0.001, + "loss": 2.9663, + "step": 16900 + }, + { + "epoch": 5.4945054945054945, + "grad_norm": 0.8815052509307861, + "learning_rate": 0.001, + "loss": 2.9881, + "step": 17000 + }, + { + "epoch": 5.526826115061409, + "grad_norm": 0.9785053133964539, + "learning_rate": 0.001, + "loss": 2.9701, + "step": 17100 + }, + { + "epoch": 5.559146735617324, + "grad_norm": 1.2235257625579834, + "learning_rate": 0.001, + "loss": 2.9775, + "step": 17200 + }, + { + "epoch": 5.591467356173238, + "grad_norm": 0.8558531403541565, + "learning_rate": 0.001, + "loss": 2.9492, + "step": 17300 + }, + { + "epoch": 5.623787976729153, + "grad_norm": 1.730175495147705, + "learning_rate": 0.001, + "loss": 2.9725, + "step": 17400 + }, + { + "epoch": 5.656108597285068, + "grad_norm": 0.9976469278335571, + "learning_rate": 0.001, + "loss": 2.9795, + "step": 17500 + }, + { + "epoch": 5.688429217840983, + "grad_norm": 1.102630853652954, + "learning_rate": 0.001, + "loss": 3.0011, + "step": 17600 + }, + { + "epoch": 5.720749838396897, + "grad_norm": 0.8655111789703369, + "learning_rate": 0.001, + "loss": 2.982, + "step": 17700 + }, + { + "epoch": 5.753070458952812, + "grad_norm": 0.9005181193351746, + "learning_rate": 0.001, + "loss": 2.9795, + "step": 17800 + }, + { + "epoch": 5.785391079508726, + "grad_norm": 0.91997891664505, + "learning_rate": 0.001, + "loss": 2.9668, + "step": 17900 + }, + { + "epoch": 5.817711700064641, + "grad_norm": 0.9092044234275818, + "learning_rate": 0.001, + "loss": 2.9854, + "step": 18000 + }, + { + "epoch": 5.850032320620556, + "grad_norm": 1.1681147813796997, + "learning_rate": 0.001, + "loss": 2.9888, + "step": 18100 + }, + { + "epoch": 5.882352941176471, + "grad_norm": 0.88965904712677, + "learning_rate": 0.001, + "loss": 2.99, + "step": 18200 + }, + { + "epoch": 5.914673561732386, + "grad_norm": 0.915117084980011, + "learning_rate": 0.001, + "loss": 3.0072, + "step": 18300 + }, + { + "epoch": 5.9469941822883, + "grad_norm": 0.9954575300216675, + "learning_rate": 0.001, + "loss": 2.9764, + "step": 18400 + }, + { + "epoch": 5.979314802844215, + "grad_norm": 1.087384581565857, + "learning_rate": 0.001, + "loss": 2.9776, + "step": 18500 + }, + { + "epoch": 6.011635423400129, + "grad_norm": 0.9920886158943176, + "learning_rate": 0.001, + "loss": 2.9278, + "step": 18600 + }, + { + "epoch": 6.043956043956044, + "grad_norm": 1.1691513061523438, + "learning_rate": 0.001, + "loss": 2.842, + "step": 18700 + }, + { + "epoch": 6.076276664511958, + "grad_norm": 1.0943280458450317, + "learning_rate": 0.001, + "loss": 2.8453, + "step": 18800 + }, + { + "epoch": 6.108597285067873, + "grad_norm": 1.4848939180374146, + "learning_rate": 0.001, + "loss": 2.8522, + "step": 18900 + }, + { + "epoch": 6.140917905623788, + "grad_norm": 0.8807019591331482, + "learning_rate": 0.001, + "loss": 2.8703, + "step": 19000 + }, + { + "epoch": 6.173238526179703, + "grad_norm": 1.2009692192077637, + "learning_rate": 0.001, + "loss": 2.8789, + "step": 19100 + }, + { + "epoch": 6.2055591467356175, + "grad_norm": 1.0970031023025513, + "learning_rate": 0.001, + "loss": 2.8751, + "step": 19200 + }, + { + "epoch": 6.237879767291532, + "grad_norm": 1.1768124103546143, + "learning_rate": 0.001, + "loss": 2.886, + "step": 19300 + }, + { + "epoch": 6.270200387847447, + "grad_norm": 0.8989688754081726, + "learning_rate": 0.001, + "loss": 2.8944, + "step": 19400 + }, + { + "epoch": 6.302521008403361, + "grad_norm": 1.2694783210754395, + "learning_rate": 0.001, + "loss": 2.8668, + "step": 19500 + }, + { + "epoch": 6.334841628959276, + "grad_norm": 0.9729022979736328, + "learning_rate": 0.001, + "loss": 2.8705, + "step": 19600 + }, + { + "epoch": 6.36716224951519, + "grad_norm": 1.0138781070709229, + "learning_rate": 0.001, + "loss": 2.8859, + "step": 19700 + }, + { + "epoch": 6.399482870071106, + "grad_norm": 0.9176075458526611, + "learning_rate": 0.001, + "loss": 2.9032, + "step": 19800 + }, + { + "epoch": 6.43180349062702, + "grad_norm": 1.1158503293991089, + "learning_rate": 0.001, + "loss": 2.8774, + "step": 19900 + }, + { + "epoch": 6.464124111182935, + "grad_norm": 0.9626113176345825, + "learning_rate": 0.001, + "loss": 2.8744, + "step": 20000 + }, + { + "epoch": 6.496444731738849, + "grad_norm": 0.9146256446838379, + "learning_rate": 0.001, + "loss": 2.8945, + "step": 20100 + }, + { + "epoch": 6.528765352294764, + "grad_norm": 0.9654421806335449, + "learning_rate": 0.001, + "loss": 2.8934, + "step": 20200 + }, + { + "epoch": 6.5610859728506785, + "grad_norm": 0.9559252262115479, + "learning_rate": 0.001, + "loss": 2.8954, + "step": 20300 + }, + { + "epoch": 6.593406593406593, + "grad_norm": 0.9600493907928467, + "learning_rate": 0.001, + "loss": 2.8964, + "step": 20400 + }, + { + "epoch": 6.625727213962508, + "grad_norm": 1.152198314666748, + "learning_rate": 0.001, + "loss": 2.8953, + "step": 20500 + }, + { + "epoch": 6.658047834518423, + "grad_norm": 0.7821874618530273, + "learning_rate": 0.001, + "loss": 2.8899, + "step": 20600 + }, + { + "epoch": 6.690368455074338, + "grad_norm": 1.2071696519851685, + "learning_rate": 0.001, + "loss": 2.8832, + "step": 20700 + }, + { + "epoch": 6.722689075630252, + "grad_norm": 0.9243321418762207, + "learning_rate": 0.001, + "loss": 2.9011, + "step": 20800 + }, + { + "epoch": 6.755009696186167, + "grad_norm": 0.925390899181366, + "learning_rate": 0.001, + "loss": 2.9143, + "step": 20900 + }, + { + "epoch": 6.787330316742081, + "grad_norm": 0.9191309213638306, + "learning_rate": 0.001, + "loss": 2.9058, + "step": 21000 + }, + { + "epoch": 6.819650937297996, + "grad_norm": 0.8833218812942505, + "learning_rate": 0.001, + "loss": 2.8933, + "step": 21100 + }, + { + "epoch": 6.85197155785391, + "grad_norm": 0.9937705397605896, + "learning_rate": 0.001, + "loss": 2.9203, + "step": 21200 + }, + { + "epoch": 6.884292178409826, + "grad_norm": 1.2760004997253418, + "learning_rate": 0.001, + "loss": 2.9042, + "step": 21300 + }, + { + "epoch": 6.91661279896574, + "grad_norm": 1.0547213554382324, + "learning_rate": 0.001, + "loss": 2.9187, + "step": 21400 + }, + { + "epoch": 6.948933419521655, + "grad_norm": 0.957062840461731, + "learning_rate": 0.001, + "loss": 2.903, + "step": 21500 + }, + { + "epoch": 6.98125404007757, + "grad_norm": 1.1016453504562378, + "learning_rate": 0.001, + "loss": 2.8896, + "step": 21600 + }, + { + "epoch": 7.013574660633484, + "grad_norm": 1.3905339241027832, + "learning_rate": 0.001, + "loss": 2.8379, + "step": 21700 + }, + { + "epoch": 7.045895281189399, + "grad_norm": 1.1392508745193481, + "learning_rate": 0.001, + "loss": 2.7692, + "step": 21800 + }, + { + "epoch": 7.078215901745313, + "grad_norm": 0.9623212814331055, + "learning_rate": 0.001, + "loss": 2.76, + "step": 21900 + }, + { + "epoch": 7.110536522301228, + "grad_norm": 1.0762792825698853, + "learning_rate": 0.001, + "loss": 2.7784, + "step": 22000 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 1.1317200660705566, + "learning_rate": 0.001, + "loss": 2.7783, + "step": 22100 + }, + { + "epoch": 7.175177763413058, + "grad_norm": 1.379044532775879, + "learning_rate": 0.001, + "loss": 2.7868, + "step": 22200 + }, + { + "epoch": 7.207498383968972, + "grad_norm": 1.036180019378662, + "learning_rate": 0.001, + "loss": 2.8014, + "step": 22300 + }, + { + "epoch": 7.239819004524887, + "grad_norm": 1.326994776725769, + "learning_rate": 0.001, + "loss": 2.8077, + "step": 22400 + }, + { + "epoch": 7.2721396250808015, + "grad_norm": 1.378857135772705, + "learning_rate": 0.001, + "loss": 2.8186, + "step": 22500 + }, + { + "epoch": 7.304460245636716, + "grad_norm": 1.1402287483215332, + "learning_rate": 0.001, + "loss": 2.8114, + "step": 22600 + }, + { + "epoch": 7.336780866192631, + "grad_norm": 1.236741304397583, + "learning_rate": 0.001, + "loss": 2.798, + "step": 22700 + }, + { + "epoch": 7.369101486748546, + "grad_norm": 0.9529298543930054, + "learning_rate": 0.001, + "loss": 2.8223, + "step": 22800 + }, + { + "epoch": 7.401422107304461, + "grad_norm": 1.272033929824829, + "learning_rate": 0.001, + "loss": 2.8241, + "step": 22900 + }, + { + "epoch": 7.433742727860375, + "grad_norm": 0.9919891953468323, + "learning_rate": 0.001, + "loss": 2.8106, + "step": 23000 + }, + { + "epoch": 7.46606334841629, + "grad_norm": 0.9951006770133972, + "learning_rate": 0.001, + "loss": 2.8164, + "step": 23100 + }, + { + "epoch": 7.498383968972204, + "grad_norm": 1.260886549949646, + "learning_rate": 0.001, + "loss": 2.8238, + "step": 23200 + }, + { + "epoch": 7.530704589528119, + "grad_norm": 1.0894906520843506, + "learning_rate": 0.001, + "loss": 2.818, + "step": 23300 + }, + { + "epoch": 7.563025210084033, + "grad_norm": 1.1154838800430298, + "learning_rate": 0.001, + "loss": 2.8312, + "step": 23400 + }, + { + "epoch": 7.595345830639948, + "grad_norm": 1.0110588073730469, + "learning_rate": 0.001, + "loss": 2.812, + "step": 23500 + }, + { + "epoch": 7.6276664511958625, + "grad_norm": 0.9715908765792847, + "learning_rate": 0.001, + "loss": 2.819, + "step": 23600 + }, + { + "epoch": 7.659987071751778, + "grad_norm": 1.0196453332901, + "learning_rate": 0.001, + "loss": 2.8195, + "step": 23700 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 1.3575221300125122, + "learning_rate": 0.001, + "loss": 2.8115, + "step": 23800 + }, + { + "epoch": 7.724628312863607, + "grad_norm": 1.3838183879852295, + "learning_rate": 0.001, + "loss": 2.8206, + "step": 23900 + }, + { + "epoch": 7.756948933419522, + "grad_norm": 1.2254596948623657, + "learning_rate": 0.001, + "loss": 2.8281, + "step": 24000 + }, + { + "epoch": 7.789269553975436, + "grad_norm": 1.2324926853179932, + "learning_rate": 0.001, + "loss": 2.8426, + "step": 24100 + }, + { + "epoch": 7.821590174531351, + "grad_norm": 1.2173677682876587, + "learning_rate": 0.001, + "loss": 2.8355, + "step": 24200 + }, + { + "epoch": 7.853910795087265, + "grad_norm": 0.9907102584838867, + "learning_rate": 0.001, + "loss": 2.8081, + "step": 24300 + }, + { + "epoch": 7.886231415643181, + "grad_norm": 1.141242265701294, + "learning_rate": 0.001, + "loss": 2.8234, + "step": 24400 + }, + { + "epoch": 7.918552036199095, + "grad_norm": 1.286460518836975, + "learning_rate": 0.001, + "loss": 2.8324, + "step": 24500 + }, + { + "epoch": 7.95087265675501, + "grad_norm": 0.9198762774467468, + "learning_rate": 0.001, + "loss": 2.8671, + "step": 24600 + }, + { + "epoch": 7.983193277310924, + "grad_norm": 0.9404067397117615, + "learning_rate": 0.001, + "loss": 2.8488, + "step": 24700 + }, + { + "epoch": 8.015513897866839, + "grad_norm": 1.2536792755126953, + "learning_rate": 0.001, + "loss": 2.7556, + "step": 24800 + }, + { + "epoch": 8.047834518422754, + "grad_norm": 1.4987441301345825, + "learning_rate": 0.001, + "loss": 2.7025, + "step": 24900 + }, + { + "epoch": 8.080155138978668, + "grad_norm": 1.1885977983474731, + "learning_rate": 0.001, + "loss": 2.7104, + "step": 25000 + }, + { + "epoch": 8.112475759534583, + "grad_norm": 1.5676301717758179, + "learning_rate": 0.001, + "loss": 2.7184, + "step": 25100 + }, + { + "epoch": 8.144796380090497, + "grad_norm": 1.3227053880691528, + "learning_rate": 0.001, + "loss": 2.7353, + "step": 25200 + }, + { + "epoch": 8.177117000646412, + "grad_norm": 1.1693710088729858, + "learning_rate": 0.001, + "loss": 2.7109, + "step": 25300 + }, + { + "epoch": 8.209437621202326, + "grad_norm": 1.5500166416168213, + "learning_rate": 0.001, + "loss": 2.7279, + "step": 25400 + }, + { + "epoch": 8.241758241758241, + "grad_norm": 1.6105555295944214, + "learning_rate": 0.001, + "loss": 2.7238, + "step": 25500 + }, + { + "epoch": 8.274078862314157, + "grad_norm": 1.3008909225463867, + "learning_rate": 0.001, + "loss": 2.7138, + "step": 25600 + }, + { + "epoch": 8.306399482870072, + "grad_norm": 1.1506481170654297, + "learning_rate": 0.001, + "loss": 2.7202, + "step": 25700 + }, + { + "epoch": 8.338720103425986, + "grad_norm": 1.583932876586914, + "learning_rate": 0.001, + "loss": 2.7328, + "step": 25800 + }, + { + "epoch": 8.371040723981901, + "grad_norm": 1.3606271743774414, + "learning_rate": 0.001, + "loss": 2.7364, + "step": 25900 + }, + { + "epoch": 8.403361344537815, + "grad_norm": 1.6803429126739502, + "learning_rate": 0.001, + "loss": 2.7109, + "step": 26000 + }, + { + "epoch": 8.43568196509373, + "grad_norm": 1.6417889595031738, + "learning_rate": 0.001, + "loss": 2.7318, + "step": 26100 + }, + { + "epoch": 8.468002585649645, + "grad_norm": 1.2529147863388062, + "learning_rate": 0.001, + "loss": 2.7559, + "step": 26200 + }, + { + "epoch": 8.50032320620556, + "grad_norm": 1.352419137954712, + "learning_rate": 0.001, + "loss": 2.7578, + "step": 26300 + }, + { + "epoch": 8.532643826761474, + "grad_norm": 1.5327883958816528, + "learning_rate": 0.001, + "loss": 2.7497, + "step": 26400 + }, + { + "epoch": 8.564964447317388, + "grad_norm": 1.2606614828109741, + "learning_rate": 0.001, + "loss": 2.7531, + "step": 26500 + }, + { + "epoch": 8.597285067873303, + "grad_norm": 1.340108871459961, + "learning_rate": 0.001, + "loss": 2.76, + "step": 26600 + }, + { + "epoch": 8.629605688429217, + "grad_norm": 1.622501015663147, + "learning_rate": 0.001, + "loss": 2.7671, + "step": 26700 + }, + { + "epoch": 8.661926308985132, + "grad_norm": 2.9881439208984375, + "learning_rate": 0.001, + "loss": 2.7507, + "step": 26800 + }, + { + "epoch": 8.694246929541046, + "grad_norm": 1.4684876203536987, + "learning_rate": 0.001, + "loss": 2.7754, + "step": 26900 + }, + { + "epoch": 8.726567550096961, + "grad_norm": 1.279051423072815, + "learning_rate": 0.001, + "loss": 2.7737, + "step": 27000 + }, + { + "epoch": 8.758888170652877, + "grad_norm": 1.2881108522415161, + "learning_rate": 0.001, + "loss": 2.7566, + "step": 27100 + }, + { + "epoch": 8.791208791208792, + "grad_norm": 1.4721473455429077, + "learning_rate": 0.001, + "loss": 2.753, + "step": 27200 + }, + { + "epoch": 8.823529411764707, + "grad_norm": 1.335915207862854, + "learning_rate": 0.001, + "loss": 2.7699, + "step": 27300 + }, + { + "epoch": 8.855850032320621, + "grad_norm": 1.3305962085723877, + "learning_rate": 0.001, + "loss": 2.7792, + "step": 27400 + }, + { + "epoch": 8.888170652876536, + "grad_norm": 1.196067452430725, + "learning_rate": 0.001, + "loss": 2.7727, + "step": 27500 + }, + { + "epoch": 8.92049127343245, + "grad_norm": 1.2162010669708252, + "learning_rate": 0.001, + "loss": 2.751, + "step": 27600 + }, + { + "epoch": 8.952811893988365, + "grad_norm": 1.5516213178634644, + "learning_rate": 0.001, + "loss": 2.7937, + "step": 27700 + }, + { + "epoch": 8.98513251454428, + "grad_norm": 1.2385696172714233, + "learning_rate": 0.001, + "loss": 2.7732, + "step": 27800 + }, + { + "epoch": 9.017453135100194, + "grad_norm": 1.991585373878479, + "learning_rate": 0.001, + "loss": 2.6816, + "step": 27900 + }, + { + "epoch": 9.049773755656108, + "grad_norm": 1.8597898483276367, + "learning_rate": 0.001, + "loss": 2.6321, + "step": 28000 + }, + { + "epoch": 9.082094376212023, + "grad_norm": 1.6625946760177612, + "learning_rate": 0.001, + "loss": 2.6467, + "step": 28100 + }, + { + "epoch": 9.114414996767938, + "grad_norm": 1.5089329481124878, + "learning_rate": 0.001, + "loss": 2.6456, + "step": 28200 + }, + { + "epoch": 9.146735617323852, + "grad_norm": 1.6687277555465698, + "learning_rate": 0.001, + "loss": 2.652, + "step": 28300 + }, + { + "epoch": 9.179056237879767, + "grad_norm": 1.1334947347640991, + "learning_rate": 0.001, + "loss": 2.6543, + "step": 28400 + }, + { + "epoch": 9.211376858435681, + "grad_norm": 1.7903298139572144, + "learning_rate": 0.001, + "loss": 2.6396, + "step": 28500 + }, + { + "epoch": 9.243697478991596, + "grad_norm": 1.8491761684417725, + "learning_rate": 0.001, + "loss": 2.6443, + "step": 28600 + }, + { + "epoch": 9.276018099547512, + "grad_norm": 1.4492714405059814, + "learning_rate": 0.001, + "loss": 2.6509, + "step": 28700 + }, + { + "epoch": 9.308338720103427, + "grad_norm": 1.8237452507019043, + "learning_rate": 0.001, + "loss": 2.6867, + "step": 28800 + }, + { + "epoch": 9.340659340659341, + "grad_norm": 1.7002811431884766, + "learning_rate": 0.001, + "loss": 2.6911, + "step": 28900 + }, + { + "epoch": 9.372979961215256, + "grad_norm": 1.24556303024292, + "learning_rate": 0.001, + "loss": 2.6842, + "step": 29000 + }, + { + "epoch": 9.40530058177117, + "grad_norm": 1.3534449338912964, + "learning_rate": 0.001, + "loss": 2.6882, + "step": 29100 + }, + { + "epoch": 9.437621202327085, + "grad_norm": 1.465098261833191, + "learning_rate": 0.001, + "loss": 2.6845, + "step": 29200 + }, + { + "epoch": 9.469941822883, + "grad_norm": 1.1282223463058472, + "learning_rate": 0.001, + "loss": 2.6926, + "step": 29300 + }, + { + "epoch": 9.502262443438914, + "grad_norm": 1.5469937324523926, + "learning_rate": 0.001, + "loss": 2.6861, + "step": 29400 + }, + { + "epoch": 9.534583063994829, + "grad_norm": 1.313873052597046, + "learning_rate": 0.001, + "loss": 2.6821, + "step": 29500 + }, + { + "epoch": 9.566903684550743, + "grad_norm": 1.0639135837554932, + "learning_rate": 0.001, + "loss": 2.6661, + "step": 29600 + }, + { + "epoch": 9.599224305106658, + "grad_norm": 1.1810053586959839, + "learning_rate": 0.001, + "loss": 2.6854, + "step": 29700 + }, + { + "epoch": 9.631544925662572, + "grad_norm": 1.4090721607208252, + "learning_rate": 0.001, + "loss": 2.7232, + "step": 29800 + }, + { + "epoch": 9.663865546218487, + "grad_norm": 1.278445839881897, + "learning_rate": 0.001, + "loss": 2.6675, + "step": 29900 + }, + { + "epoch": 9.696186166774401, + "grad_norm": 1.2034200429916382, + "learning_rate": 0.001, + "loss": 2.6747, + "step": 30000 + }, + { + "epoch": 9.728506787330316, + "grad_norm": 1.2123016119003296, + "learning_rate": 0.001, + "loss": 2.7185, + "step": 30100 + }, + { + "epoch": 9.760827407886232, + "grad_norm": 1.6041324138641357, + "learning_rate": 0.001, + "loss": 2.7031, + "step": 30200 + }, + { + "epoch": 9.793148028442147, + "grad_norm": 1.3464832305908203, + "learning_rate": 0.001, + "loss": 2.7295, + "step": 30300 + }, + { + "epoch": 9.825468648998061, + "grad_norm": 1.8239651918411255, + "learning_rate": 0.001, + "loss": 2.6939, + "step": 30400 + }, + { + "epoch": 9.857789269553976, + "grad_norm": 1.3928236961364746, + "learning_rate": 0.001, + "loss": 2.702, + "step": 30500 + }, + { + "epoch": 9.89010989010989, + "grad_norm": 1.341913104057312, + "learning_rate": 0.001, + "loss": 2.699, + "step": 30600 + }, + { + "epoch": 9.922430510665805, + "grad_norm": 1.3967502117156982, + "learning_rate": 0.001, + "loss": 2.7218, + "step": 30700 + }, + { + "epoch": 9.95475113122172, + "grad_norm": 1.2057521343231201, + "learning_rate": 0.001, + "loss": 2.7233, + "step": 30800 + }, + { + "epoch": 9.987071751777634, + "grad_norm": 1.4760347604751587, + "learning_rate": 0.001, + "loss": 2.7322, + "step": 30900 + }, + { + "epoch": 10.019392372333549, + "grad_norm": 0.9195663332939148, + "learning_rate": 0.001, + "loss": 2.642, + "step": 31000 + }, + { + "epoch": 10.051712992889463, + "grad_norm": 1.7018245458602905, + "learning_rate": 0.001, + "loss": 2.5902, + "step": 31100 + }, + { + "epoch": 10.084033613445378, + "grad_norm": 1.2946157455444336, + "learning_rate": 0.001, + "loss": 2.5836, + "step": 31200 + }, + { + "epoch": 10.116354234001292, + "grad_norm": 1.2677333354949951, + "learning_rate": 0.001, + "loss": 2.5861, + "step": 31300 + }, + { + "epoch": 10.148674854557207, + "grad_norm": 0.9341103434562683, + "learning_rate": 0.001, + "loss": 2.5815, + "step": 31400 + }, + { + "epoch": 10.180995475113122, + "grad_norm": 1.197549819946289, + "learning_rate": 0.001, + "loss": 2.5988, + "step": 31500 + }, + { + "epoch": 10.213316095669036, + "grad_norm": 0.9701215028762817, + "learning_rate": 0.001, + "loss": 2.6033, + "step": 31600 + }, + { + "epoch": 10.24563671622495, + "grad_norm": 1.440954327583313, + "learning_rate": 0.001, + "loss": 2.6095, + "step": 31700 + }, + { + "epoch": 10.277957336780867, + "grad_norm": 1.2938240766525269, + "learning_rate": 0.001, + "loss": 2.6024, + "step": 31800 + }, + { + "epoch": 10.310277957336782, + "grad_norm": 1.4884780645370483, + "learning_rate": 0.001, + "loss": 2.6229, + "step": 31900 + }, + { + "epoch": 10.342598577892696, + "grad_norm": 0.9927781820297241, + "learning_rate": 0.001, + "loss": 2.5953, + "step": 32000 + }, + { + "epoch": 10.37491919844861, + "grad_norm": 0.957020103931427, + "learning_rate": 0.001, + "loss": 2.6005, + "step": 32100 + }, + { + "epoch": 10.407239819004525, + "grad_norm": 1.0292260646820068, + "learning_rate": 0.001, + "loss": 2.6422, + "step": 32200 + }, + { + "epoch": 10.43956043956044, + "grad_norm": 1.205029010772705, + "learning_rate": 0.001, + "loss": 2.6276, + "step": 32300 + }, + { + "epoch": 10.471881060116354, + "grad_norm": 1.0172486305236816, + "learning_rate": 0.001, + "loss": 2.6254, + "step": 32400 + }, + { + "epoch": 10.504201680672269, + "grad_norm": 0.9256879687309265, + "learning_rate": 0.001, + "loss": 2.6103, + "step": 32500 + }, + { + "epoch": 10.536522301228183, + "grad_norm": 1.0289719104766846, + "learning_rate": 0.001, + "loss": 2.6329, + "step": 32600 + }, + { + "epoch": 10.568842921784098, + "grad_norm": 1.1163206100463867, + "learning_rate": 0.001, + "loss": 2.6488, + "step": 32700 + }, + { + "epoch": 10.601163542340013, + "grad_norm": 1.0654981136322021, + "learning_rate": 0.001, + "loss": 2.6346, + "step": 32800 + }, + { + "epoch": 10.633484162895927, + "grad_norm": 0.7902207374572754, + "learning_rate": 0.001, + "loss": 2.6188, + "step": 32900 + }, + { + "epoch": 10.665804783451842, + "grad_norm": 1.250271201133728, + "learning_rate": 0.001, + "loss": 2.6563, + "step": 33000 + }, + { + "epoch": 10.698125404007756, + "grad_norm": 0.969681978225708, + "learning_rate": 0.001, + "loss": 2.6384, + "step": 33100 + }, + { + "epoch": 10.73044602456367, + "grad_norm": 1.1124166250228882, + "learning_rate": 0.001, + "loss": 2.6356, + "step": 33200 + }, + { + "epoch": 10.762766645119587, + "grad_norm": 0.918755292892456, + "learning_rate": 0.001, + "loss": 2.6393, + "step": 33300 + }, + { + "epoch": 10.795087265675502, + "grad_norm": 0.9233816862106323, + "learning_rate": 0.001, + "loss": 2.6507, + "step": 33400 + }, + { + "epoch": 10.827407886231416, + "grad_norm": 1.036242127418518, + "learning_rate": 0.001, + "loss": 2.6541, + "step": 33500 + }, + { + "epoch": 10.85972850678733, + "grad_norm": 1.00826895236969, + "learning_rate": 0.001, + "loss": 2.6496, + "step": 33600 + }, + { + "epoch": 10.892049127343245, + "grad_norm": 1.0117528438568115, + "learning_rate": 0.001, + "loss": 2.6466, + "step": 33700 + }, + { + "epoch": 10.92436974789916, + "grad_norm": 0.9768591523170471, + "learning_rate": 0.001, + "loss": 2.6448, + "step": 33800 + }, + { + "epoch": 10.956690368455074, + "grad_norm": 1.2766749858856201, + "learning_rate": 0.001, + "loss": 2.647, + "step": 33900 + }, + { + "epoch": 10.989010989010989, + "grad_norm": 1.2299177646636963, + "learning_rate": 0.001, + "loss": 2.6419, + "step": 34000 + }, + { + "epoch": 11.021331609566904, + "grad_norm": 1.1423105001449585, + "learning_rate": 0.001, + "loss": 2.5704, + "step": 34100 + }, + { + "epoch": 11.053652230122818, + "grad_norm": 0.8941395282745361, + "learning_rate": 0.001, + "loss": 2.4834, + "step": 34200 + }, + { + "epoch": 11.085972850678733, + "grad_norm": 1.1033960580825806, + "learning_rate": 0.001, + "loss": 2.513, + "step": 34300 + }, + { + "epoch": 11.118293471234647, + "grad_norm": 1.0653290748596191, + "learning_rate": 0.001, + "loss": 2.541, + "step": 34400 + }, + { + "epoch": 11.150614091790562, + "grad_norm": 1.4494647979736328, + "learning_rate": 0.001, + "loss": 2.5199, + "step": 34500 + }, + { + "epoch": 11.182934712346476, + "grad_norm": 1.2029805183410645, + "learning_rate": 0.001, + "loss": 2.5371, + "step": 34600 + }, + { + "epoch": 11.215255332902391, + "grad_norm": 0.9592697024345398, + "learning_rate": 0.001, + "loss": 2.5513, + "step": 34700 + }, + { + "epoch": 11.247575953458306, + "grad_norm": 0.9625367522239685, + "learning_rate": 0.001, + "loss": 2.5316, + "step": 34800 + }, + { + "epoch": 11.279896574014222, + "grad_norm": 1.119964361190796, + "learning_rate": 0.001, + "loss": 2.5531, + "step": 34900 + }, + { + "epoch": 11.312217194570136, + "grad_norm": 0.9373201131820679, + "learning_rate": 0.001, + "loss": 2.5427, + "step": 35000 + }, + { + "epoch": 11.344537815126051, + "grad_norm": 0.9922090172767639, + "learning_rate": 0.001, + "loss": 2.5593, + "step": 35100 + }, + { + "epoch": 11.376858435681966, + "grad_norm": 0.949802577495575, + "learning_rate": 0.001, + "loss": 2.5581, + "step": 35200 + }, + { + "epoch": 11.40917905623788, + "grad_norm": 1.0595334768295288, + "learning_rate": 0.001, + "loss": 2.572, + "step": 35300 + }, + { + "epoch": 11.441499676793795, + "grad_norm": 0.883158802986145, + "learning_rate": 0.001, + "loss": 2.584, + "step": 35400 + }, + { + "epoch": 11.47382029734971, + "grad_norm": 0.983586311340332, + "learning_rate": 0.001, + "loss": 2.5704, + "step": 35500 + }, + { + "epoch": 11.506140917905624, + "grad_norm": 0.814781129360199, + "learning_rate": 0.001, + "loss": 2.5732, + "step": 35600 + }, + { + "epoch": 11.538461538461538, + "grad_norm": 1.2671406269073486, + "learning_rate": 0.001, + "loss": 2.5872, + "step": 35700 + }, + { + "epoch": 11.570782159017453, + "grad_norm": 1.1636486053466797, + "learning_rate": 0.001, + "loss": 2.5662, + "step": 35800 + }, + { + "epoch": 11.603102779573367, + "grad_norm": 0.9227073192596436, + "learning_rate": 0.001, + "loss": 2.5826, + "step": 35900 + }, + { + "epoch": 11.635423400129282, + "grad_norm": 1.394180178642273, + "learning_rate": 0.001, + "loss": 2.5915, + "step": 36000 + }, + { + "epoch": 11.667744020685197, + "grad_norm": 1.2034887075424194, + "learning_rate": 0.001, + "loss": 2.5843, + "step": 36100 + }, + { + "epoch": 11.700064641241111, + "grad_norm": 1.2949236631393433, + "learning_rate": 0.001, + "loss": 2.617, + "step": 36200 + }, + { + "epoch": 11.732385261797026, + "grad_norm": 0.9753849506378174, + "learning_rate": 0.001, + "loss": 2.5988, + "step": 36300 + }, + { + "epoch": 11.764705882352942, + "grad_norm": 0.8794882893562317, + "learning_rate": 0.001, + "loss": 2.602, + "step": 36400 + }, + { + "epoch": 11.797026502908857, + "grad_norm": 0.9771369099617004, + "learning_rate": 0.001, + "loss": 2.5793, + "step": 36500 + }, + { + "epoch": 11.829347123464771, + "grad_norm": 1.5032073259353638, + "learning_rate": 0.001, + "loss": 2.5841, + "step": 36600 + }, + { + "epoch": 11.861667744020686, + "grad_norm": 1.11965012550354, + "learning_rate": 0.001, + "loss": 2.5914, + "step": 36700 + }, + { + "epoch": 11.8939883645766, + "grad_norm": 1.1560014486312866, + "learning_rate": 0.001, + "loss": 2.5853, + "step": 36800 + }, + { + "epoch": 11.926308985132515, + "grad_norm": 1.01851224899292, + "learning_rate": 0.001, + "loss": 2.6131, + "step": 36900 + }, + { + "epoch": 11.95862960568843, + "grad_norm": 1.0631927251815796, + "learning_rate": 0.001, + "loss": 2.6137, + "step": 37000 + }, + { + "epoch": 11.990950226244344, + "grad_norm": 1.0793895721435547, + "learning_rate": 0.001, + "loss": 2.5941, + "step": 37100 + }, + { + "epoch": 12.023270846800258, + "grad_norm": 1.1296499967575073, + "learning_rate": 0.001, + "loss": 2.5119, + "step": 37200 + }, + { + "epoch": 12.055591467356173, + "grad_norm": 1.054184913635254, + "learning_rate": 0.001, + "loss": 2.4729, + "step": 37300 + }, + { + "epoch": 12.087912087912088, + "grad_norm": 21.742403030395508, + "learning_rate": 0.001, + "loss": 2.4631, + "step": 37400 + }, + { + "epoch": 12.120232708468002, + "grad_norm": 1.0219130516052246, + "learning_rate": 0.001, + "loss": 2.4652, + "step": 37500 + }, + { + "epoch": 12.152553329023917, + "grad_norm": 1.1618340015411377, + "learning_rate": 0.001, + "loss": 2.4674, + "step": 37600 + }, + { + "epoch": 12.184873949579831, + "grad_norm": 1.190769910812378, + "learning_rate": 0.001, + "loss": 2.4804, + "step": 37700 + }, + { + "epoch": 12.217194570135746, + "grad_norm": 1.1756348609924316, + "learning_rate": 0.001, + "loss": 2.5027, + "step": 37800 + }, + { + "epoch": 12.24951519069166, + "grad_norm": 0.9078492522239685, + "learning_rate": 0.001, + "loss": 2.4863, + "step": 37900 + }, + { + "epoch": 12.281835811247577, + "grad_norm": 0.9652780294418335, + "learning_rate": 0.001, + "loss": 2.4849, + "step": 38000 + }, + { + "epoch": 12.314156431803491, + "grad_norm": 1.3750672340393066, + "learning_rate": 0.001, + "loss": 2.4803, + "step": 38100 + }, + { + "epoch": 12.346477052359406, + "grad_norm": 1.0233724117279053, + "learning_rate": 0.001, + "loss": 2.5161, + "step": 38200 + }, + { + "epoch": 12.37879767291532, + "grad_norm": 1.130647897720337, + "learning_rate": 0.001, + "loss": 2.5181, + "step": 38300 + }, + { + "epoch": 12.411118293471235, + "grad_norm": 1.1900297403335571, + "learning_rate": 0.001, + "loss": 2.5251, + "step": 38400 + }, + { + "epoch": 12.44343891402715, + "grad_norm": 0.9599136710166931, + "learning_rate": 0.001, + "loss": 2.5308, + "step": 38500 + }, + { + "epoch": 12.475759534583064, + "grad_norm": 1.0950437784194946, + "learning_rate": 0.001, + "loss": 2.528, + "step": 38600 + }, + { + "epoch": 12.508080155138979, + "grad_norm": 2.040606737136841, + "learning_rate": 0.001, + "loss": 2.5159, + "step": 38700 + }, + { + "epoch": 12.540400775694893, + "grad_norm": 0.9455929398536682, + "learning_rate": 0.001, + "loss": 2.5413, + "step": 38800 + }, + { + "epoch": 12.572721396250808, + "grad_norm": 1.0487362146377563, + "learning_rate": 0.001, + "loss": 2.5331, + "step": 38900 + }, + { + "epoch": 12.605042016806722, + "grad_norm": 1.202513337135315, + "learning_rate": 0.001, + "loss": 2.5273, + "step": 39000 + }, + { + "epoch": 12.637362637362637, + "grad_norm": 0.8983702063560486, + "learning_rate": 0.001, + "loss": 2.5342, + "step": 39100 + }, + { + "epoch": 12.669683257918551, + "grad_norm": 1.1663144826889038, + "learning_rate": 0.001, + "loss": 2.5271, + "step": 39200 + }, + { + "epoch": 12.702003878474466, + "grad_norm": 1.0637140274047852, + "learning_rate": 0.001, + "loss": 2.5429, + "step": 39300 + }, + { + "epoch": 12.73432449903038, + "grad_norm": 0.9071537852287292, + "learning_rate": 0.001, + "loss": 2.5431, + "step": 39400 + }, + { + "epoch": 12.766645119586297, + "grad_norm": 1.0884722471237183, + "learning_rate": 0.001, + "loss": 2.5415, + "step": 39500 + }, + { + "epoch": 12.798965740142211, + "grad_norm": 1.1432896852493286, + "learning_rate": 0.001, + "loss": 2.5633, + "step": 39600 + }, + { + "epoch": 12.831286360698126, + "grad_norm": 1.1623923778533936, + "learning_rate": 0.001, + "loss": 2.5508, + "step": 39700 + }, + { + "epoch": 12.86360698125404, + "grad_norm": 0.9450523257255554, + "learning_rate": 0.001, + "loss": 2.5332, + "step": 39800 + }, + { + "epoch": 12.895927601809955, + "grad_norm": 1.2209385633468628, + "learning_rate": 0.001, + "loss": 2.538, + "step": 39900 + }, + { + "epoch": 12.92824822236587, + "grad_norm": 0.8747568726539612, + "learning_rate": 0.001, + "loss": 2.5379, + "step": 40000 + }, + { + "epoch": 12.960568842921784, + "grad_norm": 0.8547672629356384, + "learning_rate": 0.001, + "loss": 2.531, + "step": 40100 + }, + { + "epoch": 12.992889463477699, + "grad_norm": 1.1148180961608887, + "learning_rate": 0.001, + "loss": 2.5493, + "step": 40200 + }, + { + "epoch": 13.025210084033613, + "grad_norm": 1.0299571752548218, + "learning_rate": 0.001, + "loss": 2.455, + "step": 40300 + }, + { + "epoch": 13.057530704589528, + "grad_norm": 1.5488170385360718, + "learning_rate": 0.001, + "loss": 2.4235, + "step": 40400 + }, + { + "epoch": 13.089851325145442, + "grad_norm": 0.9480970501899719, + "learning_rate": 0.001, + "loss": 2.4149, + "step": 40500 + }, + { + "epoch": 13.122171945701357, + "grad_norm": 0.9796513319015503, + "learning_rate": 0.001, + "loss": 2.4012, + "step": 40600 + }, + { + "epoch": 13.154492566257272, + "grad_norm": 1.253645658493042, + "learning_rate": 0.001, + "loss": 2.4345, + "step": 40700 + }, + { + "epoch": 13.186813186813186, + "grad_norm": 0.9671187996864319, + "learning_rate": 0.001, + "loss": 2.421, + "step": 40800 + }, + { + "epoch": 13.2191338073691, + "grad_norm": 1.2620867490768433, + "learning_rate": 0.001, + "loss": 2.4489, + "step": 40900 + }, + { + "epoch": 13.251454427925015, + "grad_norm": 1.3267464637756348, + "learning_rate": 0.001, + "loss": 2.4219, + "step": 41000 + }, + { + "epoch": 13.283775048480932, + "grad_norm": 0.949113130569458, + "learning_rate": 0.001, + "loss": 2.4586, + "step": 41100 + }, + { + "epoch": 13.316095669036846, + "grad_norm": 0.9057651162147522, + "learning_rate": 0.001, + "loss": 2.4504, + "step": 41200 + }, + { + "epoch": 13.34841628959276, + "grad_norm": 0.9519304633140564, + "learning_rate": 0.001, + "loss": 2.4565, + "step": 41300 + }, + { + "epoch": 13.380736910148675, + "grad_norm": 0.8539422154426575, + "learning_rate": 0.001, + "loss": 2.4649, + "step": 41400 + }, + { + "epoch": 13.41305753070459, + "grad_norm": 0.9074021577835083, + "learning_rate": 0.001, + "loss": 2.4422, + "step": 41500 + }, + { + "epoch": 13.445378151260504, + "grad_norm": 0.9951125383377075, + "learning_rate": 0.001, + "loss": 2.474, + "step": 41600 + }, + { + "epoch": 13.477698771816419, + "grad_norm": 0.884623646736145, + "learning_rate": 0.001, + "loss": 2.4678, + "step": 41700 + }, + { + "epoch": 13.510019392372334, + "grad_norm": 1.0569515228271484, + "learning_rate": 0.001, + "loss": 2.4836, + "step": 41800 + }, + { + "epoch": 13.542340012928248, + "grad_norm": 1.202636957168579, + "learning_rate": 0.001, + "loss": 2.4781, + "step": 41900 + }, + { + "epoch": 13.574660633484163, + "grad_norm": 0.9308040738105774, + "learning_rate": 0.001, + "loss": 2.4682, + "step": 42000 + }, + { + "epoch": 13.606981254040077, + "grad_norm": 1.0900559425354004, + "learning_rate": 0.001, + "loss": 2.504, + "step": 42100 + }, + { + "epoch": 13.639301874595992, + "grad_norm": 1.135162591934204, + "learning_rate": 0.001, + "loss": 2.4863, + "step": 42200 + }, + { + "epoch": 13.671622495151906, + "grad_norm": 1.2860257625579834, + "learning_rate": 0.001, + "loss": 2.4946, + "step": 42300 + }, + { + "epoch": 13.70394311570782, + "grad_norm": 1.5642868280410767, + "learning_rate": 0.001, + "loss": 2.4893, + "step": 42400 + }, + { + "epoch": 13.736263736263737, + "grad_norm": 1.355553388595581, + "learning_rate": 0.001, + "loss": 2.4914, + "step": 42500 + }, + { + "epoch": 13.768584356819652, + "grad_norm": 1.0944535732269287, + "learning_rate": 0.001, + "loss": 2.4979, + "step": 42600 + }, + { + "epoch": 13.800904977375566, + "grad_norm": 1.1084762811660767, + "learning_rate": 0.001, + "loss": 2.4889, + "step": 42700 + }, + { + "epoch": 13.83322559793148, + "grad_norm": 1.0795499086380005, + "learning_rate": 0.001, + "loss": 2.5077, + "step": 42800 + }, + { + "epoch": 13.865546218487395, + "grad_norm": 1.1181540489196777, + "learning_rate": 0.001, + "loss": 2.5115, + "step": 42900 + }, + { + "epoch": 13.89786683904331, + "grad_norm": 0.8543340563774109, + "learning_rate": 0.001, + "loss": 2.4785, + "step": 43000 + }, + { + "epoch": 13.930187459599225, + "grad_norm": 0.9921061396598816, + "learning_rate": 0.001, + "loss": 2.5062, + "step": 43100 + }, + { + "epoch": 13.96250808015514, + "grad_norm": 1.1608710289001465, + "learning_rate": 0.001, + "loss": 2.5112, + "step": 43200 + }, + { + "epoch": 13.994828700711054, + "grad_norm": 0.9125173687934875, + "learning_rate": 0.001, + "loss": 2.4806, + "step": 43300 + }, + { + "epoch": 14.027149321266968, + "grad_norm": 1.0954087972640991, + "learning_rate": 0.001, + "loss": 2.3845, + "step": 43400 + }, + { + "epoch": 14.059469941822883, + "grad_norm": 1.1624791622161865, + "learning_rate": 0.001, + "loss": 2.3829, + "step": 43500 + }, + { + "epoch": 14.091790562378797, + "grad_norm": 1.3029100894927979, + "learning_rate": 0.001, + "loss": 2.3835, + "step": 43600 + }, + { + "epoch": 14.124111182934712, + "grad_norm": 1.024627923965454, + "learning_rate": 0.001, + "loss": 2.3614, + "step": 43700 + }, + { + "epoch": 14.156431803490626, + "grad_norm": 1.298632264137268, + "learning_rate": 0.001, + "loss": 2.3911, + "step": 43800 + }, + { + "epoch": 14.188752424046541, + "grad_norm": 1.313515543937683, + "learning_rate": 0.001, + "loss": 2.4011, + "step": 43900 + }, + { + "epoch": 14.221073044602456, + "grad_norm": 1.3333510160446167, + "learning_rate": 0.001, + "loss": 2.3881, + "step": 44000 + }, + { + "epoch": 14.25339366515837, + "grad_norm": 1.04513680934906, + "learning_rate": 0.001, + "loss": 2.3704, + "step": 44100 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 1.4951848983764648, + "learning_rate": 0.001, + "loss": 2.386, + "step": 44200 + }, + { + "epoch": 14.318034906270201, + "grad_norm": 1.0465465784072876, + "learning_rate": 0.001, + "loss": 2.3972, + "step": 44300 + }, + { + "epoch": 14.350355526826116, + "grad_norm": 1.1519221067428589, + "learning_rate": 0.001, + "loss": 2.4056, + "step": 44400 + }, + { + "epoch": 14.38267614738203, + "grad_norm": 1.315697431564331, + "learning_rate": 0.001, + "loss": 2.4214, + "step": 44500 + }, + { + "epoch": 14.414996767937945, + "grad_norm": 1.0481849908828735, + "learning_rate": 0.001, + "loss": 2.4211, + "step": 44600 + }, + { + "epoch": 14.44731738849386, + "grad_norm": 1.1554055213928223, + "learning_rate": 0.001, + "loss": 2.4134, + "step": 44700 + }, + { + "epoch": 14.479638009049774, + "grad_norm": 1.0574384927749634, + "learning_rate": 0.001, + "loss": 2.4197, + "step": 44800 + }, + { + "epoch": 14.511958629605688, + "grad_norm": 1.1386839151382446, + "learning_rate": 0.001, + "loss": 2.414, + "step": 44900 + }, + { + "epoch": 14.544279250161603, + "grad_norm": 1.327596664428711, + "learning_rate": 0.001, + "loss": 2.4182, + "step": 45000 + }, + { + "epoch": 14.576599870717518, + "grad_norm": 0.963005542755127, + "learning_rate": 0.001, + "loss": 2.4251, + "step": 45100 + }, + { + "epoch": 14.608920491273432, + "grad_norm": 1.0058512687683105, + "learning_rate": 0.001, + "loss": 2.4233, + "step": 45200 + }, + { + "epoch": 14.641241111829347, + "grad_norm": 1.0360257625579834, + "learning_rate": 0.001, + "loss": 2.4353, + "step": 45300 + }, + { + "epoch": 14.673561732385261, + "grad_norm": 1.2501556873321533, + "learning_rate": 0.001, + "loss": 2.4511, + "step": 45400 + }, + { + "epoch": 14.705882352941176, + "grad_norm": 1.212724208831787, + "learning_rate": 0.001, + "loss": 2.4493, + "step": 45500 + }, + { + "epoch": 14.738202973497092, + "grad_norm": 1.4460214376449585, + "learning_rate": 0.001, + "loss": 2.4591, + "step": 45600 + }, + { + "epoch": 14.770523594053007, + "grad_norm": 1.2264606952667236, + "learning_rate": 0.001, + "loss": 2.431, + "step": 45700 + }, + { + "epoch": 14.802844214608921, + "grad_norm": 0.9162919521331787, + "learning_rate": 0.001, + "loss": 2.4497, + "step": 45800 + }, + { + "epoch": 14.835164835164836, + "grad_norm": 1.2006787061691284, + "learning_rate": 0.001, + "loss": 2.4343, + "step": 45900 + }, + { + "epoch": 14.86748545572075, + "grad_norm": 1.3091291189193726, + "learning_rate": 0.001, + "loss": 2.4543, + "step": 46000 + }, + { + "epoch": 14.899806076276665, + "grad_norm": 1.2788023948669434, + "learning_rate": 0.001, + "loss": 2.4441, + "step": 46100 + }, + { + "epoch": 14.93212669683258, + "grad_norm": 1.4079340696334839, + "learning_rate": 0.001, + "loss": 2.467, + "step": 46200 + }, + { + "epoch": 14.964447317388494, + "grad_norm": 1.0543346405029297, + "learning_rate": 0.001, + "loss": 2.4757, + "step": 46300 + }, + { + "epoch": 14.996767937944409, + "grad_norm": 1.0508509874343872, + "learning_rate": 0.001, + "loss": 2.448, + "step": 46400 + }, + { + "epoch": 15.029088558500323, + "grad_norm": 1.4893016815185547, + "learning_rate": 0.001, + "loss": 2.3238, + "step": 46500 + }, + { + "epoch": 15.061409179056238, + "grad_norm": 1.837750792503357, + "learning_rate": 0.001, + "loss": 2.3131, + "step": 46600 + }, + { + "epoch": 15.093729799612152, + "grad_norm": 1.2078869342803955, + "learning_rate": 0.001, + "loss": 2.3283, + "step": 46700 + }, + { + "epoch": 15.126050420168067, + "grad_norm": 1.6027250289916992, + "learning_rate": 0.001, + "loss": 2.3327, + "step": 46800 + }, + { + "epoch": 15.158371040723981, + "grad_norm": 1.5826632976531982, + "learning_rate": 0.001, + "loss": 2.3439, + "step": 46900 + }, + { + "epoch": 15.190691661279896, + "grad_norm": 1.4874987602233887, + "learning_rate": 0.001, + "loss": 2.3284, + "step": 47000 + }, + { + "epoch": 15.22301228183581, + "grad_norm": 1.3203476667404175, + "learning_rate": 0.001, + "loss": 2.3384, + "step": 47100 + }, + { + "epoch": 15.255332902391725, + "grad_norm": 1.3712375164031982, + "learning_rate": 0.001, + "loss": 2.3232, + "step": 47200 + }, + { + "epoch": 15.287653522947641, + "grad_norm": 1.7889151573181152, + "learning_rate": 0.001, + "loss": 2.3565, + "step": 47300 + }, + { + "epoch": 15.319974143503556, + "grad_norm": 1.271273136138916, + "learning_rate": 0.001, + "loss": 2.3609, + "step": 47400 + }, + { + "epoch": 15.35229476405947, + "grad_norm": 1.26175856590271, + "learning_rate": 0.001, + "loss": 2.3648, + "step": 47500 + }, + { + "epoch": 15.384615384615385, + "grad_norm": 1.1784569025039673, + "learning_rate": 0.001, + "loss": 2.3543, + "step": 47600 + }, + { + "epoch": 15.4169360051713, + "grad_norm": 1.213889241218567, + "learning_rate": 0.001, + "loss": 2.3653, + "step": 47700 + }, + { + "epoch": 15.449256625727214, + "grad_norm": 1.2286897897720337, + "learning_rate": 0.001, + "loss": 2.3669, + "step": 47800 + }, + { + "epoch": 15.481577246283129, + "grad_norm": 1.0805023908615112, + "learning_rate": 0.001, + "loss": 2.3646, + "step": 47900 + }, + { + "epoch": 15.513897866839043, + "grad_norm": 1.3555302619934082, + "learning_rate": 0.001, + "loss": 2.4106, + "step": 48000 + }, + { + "epoch": 15.546218487394958, + "grad_norm": 1.2589572668075562, + "learning_rate": 0.001, + "loss": 2.3889, + "step": 48100 + }, + { + "epoch": 15.578539107950872, + "grad_norm": 1.3926182985305786, + "learning_rate": 0.001, + "loss": 2.3845, + "step": 48200 + }, + { + "epoch": 15.610859728506787, + "grad_norm": 1.1518105268478394, + "learning_rate": 0.001, + "loss": 2.3971, + "step": 48300 + }, + { + "epoch": 15.643180349062701, + "grad_norm": 3.0709455013275146, + "learning_rate": 0.001, + "loss": 2.3771, + "step": 48400 + }, + { + "epoch": 15.675500969618616, + "grad_norm": 1.155657172203064, + "learning_rate": 0.001, + "loss": 2.3964, + "step": 48500 + }, + { + "epoch": 15.70782159017453, + "grad_norm": 1.1553372144699097, + "learning_rate": 0.001, + "loss": 2.3915, + "step": 48600 + }, + { + "epoch": 15.740142210730447, + "grad_norm": 1.320940613746643, + "learning_rate": 0.001, + "loss": 2.4046, + "step": 48700 + }, + { + "epoch": 15.772462831286362, + "grad_norm": 1.5498952865600586, + "learning_rate": 0.001, + "loss": 2.3993, + "step": 48800 + }, + { + "epoch": 15.804783451842276, + "grad_norm": 1.2019668817520142, + "learning_rate": 0.001, + "loss": 2.3913, + "step": 48900 + }, + { + "epoch": 15.83710407239819, + "grad_norm": 1.5592061281204224, + "learning_rate": 0.001, + "loss": 2.3928, + "step": 49000 + }, + { + "epoch": 15.869424692954105, + "grad_norm": 1.2563676834106445, + "learning_rate": 0.001, + "loss": 2.4179, + "step": 49100 + }, + { + "epoch": 15.90174531351002, + "grad_norm": 1.7604995965957642, + "learning_rate": 0.001, + "loss": 2.4154, + "step": 49200 + }, + { + "epoch": 15.934065934065934, + "grad_norm": 2.444636583328247, + "learning_rate": 0.001, + "loss": 2.4203, + "step": 49300 + }, + { + "epoch": 15.966386554621849, + "grad_norm": 1.1105613708496094, + "learning_rate": 0.001, + "loss": 2.4068, + "step": 49400 + }, + { + "epoch": 15.998707175177763, + "grad_norm": 1.4374933242797852, + "learning_rate": 0.001, + "loss": 2.4238, + "step": 49500 + }, + { + "epoch": 16.031027795733678, + "grad_norm": 1.5967682600021362, + "learning_rate": 0.001, + "loss": 2.265, + "step": 49600 + }, + { + "epoch": 16.063348416289593, + "grad_norm": 1.267298698425293, + "learning_rate": 0.001, + "loss": 2.2788, + "step": 49700 + }, + { + "epoch": 16.095669036845507, + "grad_norm": 1.7938473224639893, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 49800 + }, + { + "epoch": 16.12798965740142, + "grad_norm": 1.4659450054168701, + "learning_rate": 0.001, + "loss": 2.2731, + "step": 49900 + }, + { + "epoch": 16.160310277957336, + "grad_norm": 1.5971636772155762, + "learning_rate": 0.001, + "loss": 2.3012, + "step": 50000 + }, + { + "epoch": 16.19263089851325, + "grad_norm": 1.6608117818832397, + "learning_rate": 0.001, + "loss": 2.309, + "step": 50100 + }, + { + "epoch": 16.224951519069165, + "grad_norm": 1.403343915939331, + "learning_rate": 0.001, + "loss": 2.3204, + "step": 50200 + }, + { + "epoch": 16.25727213962508, + "grad_norm": 1.548507809638977, + "learning_rate": 0.001, + "loss": 2.307, + "step": 50300 + }, + { + "epoch": 16.289592760180994, + "grad_norm": 1.3704252243041992, + "learning_rate": 0.001, + "loss": 2.3057, + "step": 50400 + }, + { + "epoch": 16.32191338073691, + "grad_norm": 1.283632755279541, + "learning_rate": 0.001, + "loss": 2.3067, + "step": 50500 + }, + { + "epoch": 16.354234001292824, + "grad_norm": 1.857095718383789, + "learning_rate": 0.001, + "loss": 2.3188, + "step": 50600 + }, + { + "epoch": 16.386554621848738, + "grad_norm": 1.4724833965301514, + "learning_rate": 0.001, + "loss": 2.3249, + "step": 50700 + }, + { + "epoch": 16.418875242404653, + "grad_norm": 1.4150060415267944, + "learning_rate": 0.001, + "loss": 2.3166, + "step": 50800 + }, + { + "epoch": 16.451195862960567, + "grad_norm": 1.3506375551223755, + "learning_rate": 0.001, + "loss": 2.332, + "step": 50900 + }, + { + "epoch": 16.483516483516482, + "grad_norm": 1.4680278301239014, + "learning_rate": 0.001, + "loss": 2.3305, + "step": 51000 + }, + { + "epoch": 16.5158371040724, + "grad_norm": 1.3896517753601074, + "learning_rate": 0.001, + "loss": 2.3371, + "step": 51100 + }, + { + "epoch": 16.548157724628314, + "grad_norm": 1.4641127586364746, + "learning_rate": 0.001, + "loss": 2.3332, + "step": 51200 + }, + { + "epoch": 16.58047834518423, + "grad_norm": 1.54449462890625, + "learning_rate": 0.001, + "loss": 2.3543, + "step": 51300 + }, + { + "epoch": 16.612798965740144, + "grad_norm": 1.260672688484192, + "learning_rate": 0.001, + "loss": 2.3588, + "step": 51400 + }, + { + "epoch": 16.645119586296058, + "grad_norm": 1.3852020502090454, + "learning_rate": 0.001, + "loss": 2.3545, + "step": 51500 + }, + { + "epoch": 16.677440206851973, + "grad_norm": 1.5494886636734009, + "learning_rate": 0.001, + "loss": 2.358, + "step": 51600 + }, + { + "epoch": 16.709760827407887, + "grad_norm": 1.8374618291854858, + "learning_rate": 0.001, + "loss": 2.3726, + "step": 51700 + }, + { + "epoch": 16.742081447963802, + "grad_norm": 1.3113868236541748, + "learning_rate": 0.001, + "loss": 2.3415, + "step": 51800 + }, + { + "epoch": 16.774402068519716, + "grad_norm": 1.4161752462387085, + "learning_rate": 0.001, + "loss": 2.3594, + "step": 51900 + }, + { + "epoch": 16.80672268907563, + "grad_norm": 1.4049732685089111, + "learning_rate": 0.001, + "loss": 2.3403, + "step": 52000 + }, + { + "epoch": 16.839043309631545, + "grad_norm": 1.5107018947601318, + "learning_rate": 0.001, + "loss": 2.3809, + "step": 52100 + }, + { + "epoch": 16.87136393018746, + "grad_norm": 1.6911265850067139, + "learning_rate": 0.001, + "loss": 2.3562, + "step": 52200 + }, + { + "epoch": 16.903684550743375, + "grad_norm": 1.6009875535964966, + "learning_rate": 0.001, + "loss": 2.3589, + "step": 52300 + }, + { + "epoch": 16.93600517129929, + "grad_norm": 1.415225863456726, + "learning_rate": 0.001, + "loss": 2.3723, + "step": 52400 + }, + { + "epoch": 16.968325791855204, + "grad_norm": 1.5780458450317383, + "learning_rate": 0.001, + "loss": 2.3631, + "step": 52500 + }, + { + "epoch": 17.00064641241112, + "grad_norm": 1.3046797513961792, + "learning_rate": 0.001, + "loss": 2.3591, + "step": 52600 + }, + { + "epoch": 17.032967032967033, + "grad_norm": 1.631547212600708, + "learning_rate": 0.001, + "loss": 2.213, + "step": 52700 + }, + { + "epoch": 17.065287653522947, + "grad_norm": 1.5670453310012817, + "learning_rate": 0.001, + "loss": 2.231, + "step": 52800 + }, + { + "epoch": 17.097608274078862, + "grad_norm": 1.5162924528121948, + "learning_rate": 0.001, + "loss": 2.2282, + "step": 52900 + }, + { + "epoch": 17.129928894634777, + "grad_norm": 1.8685030937194824, + "learning_rate": 0.001, + "loss": 2.2464, + "step": 53000 + }, + { + "epoch": 17.16224951519069, + "grad_norm": 1.8752682209014893, + "learning_rate": 0.001, + "loss": 2.2316, + "step": 53100 + }, + { + "epoch": 17.194570135746606, + "grad_norm": 1.5304337739944458, + "learning_rate": 0.001, + "loss": 2.2437, + "step": 53200 + }, + { + "epoch": 17.22689075630252, + "grad_norm": 1.8339931964874268, + "learning_rate": 0.001, + "loss": 2.2524, + "step": 53300 + }, + { + "epoch": 17.259211376858435, + "grad_norm": 1.6601121425628662, + "learning_rate": 0.001, + "loss": 2.2821, + "step": 53400 + }, + { + "epoch": 17.29153199741435, + "grad_norm": 1.037027359008789, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 53500 + }, + { + "epoch": 17.323852617970264, + "grad_norm": 1.4101696014404297, + "learning_rate": 0.001, + "loss": 2.2894, + "step": 53600 + }, + { + "epoch": 17.35617323852618, + "grad_norm": 1.715714931488037, + "learning_rate": 0.001, + "loss": 2.2673, + "step": 53700 + }, + { + "epoch": 17.388493859082093, + "grad_norm": 1.6918067932128906, + "learning_rate": 0.001, + "loss": 2.277, + "step": 53800 + }, + { + "epoch": 17.420814479638008, + "grad_norm": 1.513771653175354, + "learning_rate": 0.001, + "loss": 2.2822, + "step": 53900 + }, + { + "epoch": 17.453135100193922, + "grad_norm": 2.0623667240142822, + "learning_rate": 0.001, + "loss": 2.3088, + "step": 54000 + }, + { + "epoch": 17.485455720749837, + "grad_norm": 1.43783700466156, + "learning_rate": 0.001, + "loss": 2.2943, + "step": 54100 + }, + { + "epoch": 17.517776341305755, + "grad_norm": 1.387234091758728, + "learning_rate": 0.001, + "loss": 2.3021, + "step": 54200 + }, + { + "epoch": 17.55009696186167, + "grad_norm": 1.8661473989486694, + "learning_rate": 0.001, + "loss": 2.2701, + "step": 54300 + }, + { + "epoch": 17.582417582417584, + "grad_norm": 1.76520836353302, + "learning_rate": 0.001, + "loss": 2.2823, + "step": 54400 + }, + { + "epoch": 17.6147382029735, + "grad_norm": 1.5826014280319214, + "learning_rate": 0.001, + "loss": 2.3244, + "step": 54500 + }, + { + "epoch": 17.647058823529413, + "grad_norm": 1.3721729516983032, + "learning_rate": 0.001, + "loss": 2.318, + "step": 54600 + }, + { + "epoch": 17.679379444085328, + "grad_norm": 1.4153558015823364, + "learning_rate": 0.001, + "loss": 2.3211, + "step": 54700 + }, + { + "epoch": 17.711700064641242, + "grad_norm": 1.6873489618301392, + "learning_rate": 0.001, + "loss": 2.3211, + "step": 54800 + }, + { + "epoch": 17.744020685197157, + "grad_norm": 1.48008131980896, + "learning_rate": 0.001, + "loss": 2.3298, + "step": 54900 + }, + { + "epoch": 17.77634130575307, + "grad_norm": 1.2169060707092285, + "learning_rate": 0.001, + "loss": 2.3117, + "step": 55000 + }, + { + "epoch": 17.808661926308986, + "grad_norm": 2.0541675090789795, + "learning_rate": 0.001, + "loss": 2.3168, + "step": 55100 + }, + { + "epoch": 17.8409825468649, + "grad_norm": 1.6494852304458618, + "learning_rate": 0.001, + "loss": 2.3136, + "step": 55200 + }, + { + "epoch": 17.873303167420815, + "grad_norm": 1.9559639692306519, + "learning_rate": 0.001, + "loss": 2.3385, + "step": 55300 + }, + { + "epoch": 17.90562378797673, + "grad_norm": 1.883894443511963, + "learning_rate": 0.001, + "loss": 2.3241, + "step": 55400 + }, + { + "epoch": 17.937944408532644, + "grad_norm": 1.4204341173171997, + "learning_rate": 0.001, + "loss": 2.3306, + "step": 55500 + }, + { + "epoch": 17.97026502908856, + "grad_norm": 1.837131142616272, + "learning_rate": 0.001, + "loss": 2.3515, + "step": 55600 + }, + { + "epoch": 18.002585649644473, + "grad_norm": 1.2758315801620483, + "learning_rate": 0.001, + "loss": 2.3336, + "step": 55700 + }, + { + "epoch": 18.034906270200388, + "grad_norm": 1.0778571367263794, + "learning_rate": 0.001, + "loss": 2.1599, + "step": 55800 + }, + { + "epoch": 18.067226890756302, + "grad_norm": 1.2033774852752686, + "learning_rate": 0.001, + "loss": 2.1879, + "step": 55900 + }, + { + "epoch": 18.099547511312217, + "grad_norm": 1.5203527212142944, + "learning_rate": 0.001, + "loss": 2.1859, + "step": 56000 + }, + { + "epoch": 18.13186813186813, + "grad_norm": 1.2778196334838867, + "learning_rate": 0.001, + "loss": 2.2118, + "step": 56100 + }, + { + "epoch": 18.164188752424046, + "grad_norm": 1.490444302558899, + "learning_rate": 0.001, + "loss": 2.215, + "step": 56200 + }, + { + "epoch": 18.19650937297996, + "grad_norm": 1.25520658493042, + "learning_rate": 0.001, + "loss": 2.2096, + "step": 56300 + }, + { + "epoch": 18.228829993535875, + "grad_norm": 1.3420361280441284, + "learning_rate": 0.001, + "loss": 2.2346, + "step": 56400 + }, + { + "epoch": 18.26115061409179, + "grad_norm": 1.4662959575653076, + "learning_rate": 0.001, + "loss": 2.2047, + "step": 56500 + }, + { + "epoch": 18.293471234647704, + "grad_norm": 1.3517006635665894, + "learning_rate": 0.001, + "loss": 2.2302, + "step": 56600 + }, + { + "epoch": 18.32579185520362, + "grad_norm": 1.6744149923324585, + "learning_rate": 0.001, + "loss": 2.2548, + "step": 56700 + }, + { + "epoch": 18.358112475759533, + "grad_norm": 1.6994774341583252, + "learning_rate": 0.001, + "loss": 2.2184, + "step": 56800 + }, + { + "epoch": 18.390433096315448, + "grad_norm": 1.2075378894805908, + "learning_rate": 0.001, + "loss": 2.2467, + "step": 56900 + }, + { + "epoch": 18.422753716871362, + "grad_norm": 1.0433144569396973, + "learning_rate": 0.001, + "loss": 2.2499, + "step": 57000 + }, + { + "epoch": 18.455074337427277, + "grad_norm": 1.2884716987609863, + "learning_rate": 0.001, + "loss": 2.2475, + "step": 57100 + }, + { + "epoch": 18.48739495798319, + "grad_norm": 1.8086559772491455, + "learning_rate": 0.001, + "loss": 2.2572, + "step": 57200 + }, + { + "epoch": 18.51971557853911, + "grad_norm": 1.1635278463363647, + "learning_rate": 0.001, + "loss": 2.2554, + "step": 57300 + }, + { + "epoch": 18.552036199095024, + "grad_norm": 1.3635642528533936, + "learning_rate": 0.001, + "loss": 2.2633, + "step": 57400 + }, + { + "epoch": 18.58435681965094, + "grad_norm": 1.2767882347106934, + "learning_rate": 0.001, + "loss": 2.2519, + "step": 57500 + }, + { + "epoch": 18.616677440206853, + "grad_norm": 1.571807861328125, + "learning_rate": 0.001, + "loss": 2.2582, + "step": 57600 + }, + { + "epoch": 18.648998060762768, + "grad_norm": 1.5809171199798584, + "learning_rate": 0.001, + "loss": 2.2612, + "step": 57700 + }, + { + "epoch": 18.681318681318682, + "grad_norm": 1.2579069137573242, + "learning_rate": 0.001, + "loss": 2.2713, + "step": 57800 + }, + { + "epoch": 18.713639301874597, + "grad_norm": 1.2632404565811157, + "learning_rate": 0.001, + "loss": 2.2876, + "step": 57900 + }, + { + "epoch": 18.74595992243051, + "grad_norm": 1.0768790245056152, + "learning_rate": 0.001, + "loss": 2.2794, + "step": 58000 + }, + { + "epoch": 18.778280542986426, + "grad_norm": 1.4682295322418213, + "learning_rate": 0.001, + "loss": 2.2766, + "step": 58100 + }, + { + "epoch": 18.81060116354234, + "grad_norm": 1.269097089767456, + "learning_rate": 0.001, + "loss": 2.2587, + "step": 58200 + }, + { + "epoch": 18.842921784098255, + "grad_norm": 1.7296055555343628, + "learning_rate": 0.001, + "loss": 2.2853, + "step": 58300 + }, + { + "epoch": 18.87524240465417, + "grad_norm": 1.5035419464111328, + "learning_rate": 0.001, + "loss": 2.2967, + "step": 58400 + }, + { + "epoch": 18.907563025210084, + "grad_norm": 1.2617650032043457, + "learning_rate": 0.001, + "loss": 2.3184, + "step": 58500 + }, + { + "epoch": 18.939883645766, + "grad_norm": 1.4061576128005981, + "learning_rate": 0.001, + "loss": 2.2902, + "step": 58600 + }, + { + "epoch": 18.972204266321913, + "grad_norm": 1.2522116899490356, + "learning_rate": 0.001, + "loss": 2.2897, + "step": 58700 + }, + { + "epoch": 19.004524886877828, + "grad_norm": 1.2318428754806519, + "learning_rate": 0.001, + "loss": 2.295, + "step": 58800 + }, + { + "epoch": 19.036845507433743, + "grad_norm": 1.2215492725372314, + "learning_rate": 0.001, + "loss": 2.1301, + "step": 58900 + }, + { + "epoch": 19.069166127989657, + "grad_norm": 1.204942226409912, + "learning_rate": 0.001, + "loss": 2.1383, + "step": 59000 + }, + { + "epoch": 19.10148674854557, + "grad_norm": 1.343122124671936, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 59100 + }, + { + "epoch": 19.133807369101486, + "grad_norm": 1.4247043132781982, + "learning_rate": 0.001, + "loss": 2.17, + "step": 59200 + }, + { + "epoch": 19.1661279896574, + "grad_norm": 1.212086796760559, + "learning_rate": 0.001, + "loss": 2.1771, + "step": 59300 + }, + { + "epoch": 19.198448610213315, + "grad_norm": 0.9887686371803284, + "learning_rate": 0.001, + "loss": 2.1871, + "step": 59400 + }, + { + "epoch": 19.23076923076923, + "grad_norm": 0.9896878600120544, + "learning_rate": 0.001, + "loss": 2.1768, + "step": 59500 + }, + { + "epoch": 19.263089851325145, + "grad_norm": 1.0798989534378052, + "learning_rate": 0.001, + "loss": 2.1963, + "step": 59600 + }, + { + "epoch": 19.29541047188106, + "grad_norm": 1.0032464265823364, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 59700 + }, + { + "epoch": 19.327731092436974, + "grad_norm": 1.21811044216156, + "learning_rate": 0.001, + "loss": 2.204, + "step": 59800 + }, + { + "epoch": 19.360051712992888, + "grad_norm": 1.1439648866653442, + "learning_rate": 0.001, + "loss": 2.2006, + "step": 59900 + }, + { + "epoch": 19.392372333548803, + "grad_norm": 1.0855740308761597, + "learning_rate": 0.001, + "loss": 2.2165, + "step": 60000 + }, + { + "epoch": 19.424692954104717, + "grad_norm": 1.388441562652588, + "learning_rate": 0.001, + "loss": 2.2109, + "step": 60100 + }, + { + "epoch": 19.457013574660632, + "grad_norm": 1.4667842388153076, + "learning_rate": 0.001, + "loss": 2.1972, + "step": 60200 + }, + { + "epoch": 19.489334195216546, + "grad_norm": 1.7039697170257568, + "learning_rate": 0.001, + "loss": 2.2221, + "step": 60300 + }, + { + "epoch": 19.521654815772465, + "grad_norm": 1.1940791606903076, + "learning_rate": 0.001, + "loss": 2.2167, + "step": 60400 + }, + { + "epoch": 19.55397543632838, + "grad_norm": 1.150011420249939, + "learning_rate": 0.001, + "loss": 2.2246, + "step": 60500 + }, + { + "epoch": 19.586296056884294, + "grad_norm": 1.097654104232788, + "learning_rate": 0.001, + "loss": 2.2212, + "step": 60600 + }, + { + "epoch": 19.618616677440208, + "grad_norm": 1.1845519542694092, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 60700 + }, + { + "epoch": 19.650937297996123, + "grad_norm": 1.1336361169815063, + "learning_rate": 0.001, + "loss": 2.234, + "step": 60800 + }, + { + "epoch": 19.683257918552037, + "grad_norm": 1.1724891662597656, + "learning_rate": 0.001, + "loss": 2.2089, + "step": 60900 + }, + { + "epoch": 19.715578539107952, + "grad_norm": 0.9693626165390015, + "learning_rate": 0.001, + "loss": 2.2348, + "step": 61000 + }, + { + "epoch": 19.747899159663866, + "grad_norm": 1.1252988576889038, + "learning_rate": 0.001, + "loss": 2.2443, + "step": 61100 + }, + { + "epoch": 19.78021978021978, + "grad_norm": 0.9875534772872925, + "learning_rate": 0.001, + "loss": 2.2378, + "step": 61200 + }, + { + "epoch": 19.812540400775696, + "grad_norm": 1.3839106559753418, + "learning_rate": 0.001, + "loss": 2.2329, + "step": 61300 + }, + { + "epoch": 19.84486102133161, + "grad_norm": 1.5243983268737793, + "learning_rate": 0.001, + "loss": 2.252, + "step": 61400 + }, + { + "epoch": 19.877181641887525, + "grad_norm": 1.1300511360168457, + "learning_rate": 0.001, + "loss": 2.2568, + "step": 61500 + }, + { + "epoch": 19.90950226244344, + "grad_norm": 1.2548259496688843, + "learning_rate": 0.001, + "loss": 2.2585, + "step": 61600 + }, + { + "epoch": 19.941822882999354, + "grad_norm": 1.2727535963058472, + "learning_rate": 0.001, + "loss": 2.236, + "step": 61700 + }, + { + "epoch": 19.97414350355527, + "grad_norm": 1.0166510343551636, + "learning_rate": 0.001, + "loss": 2.2477, + "step": 61800 + }, + { + "epoch": 20.006464124111183, + "grad_norm": 1.0059797763824463, + "learning_rate": 0.001, + "loss": 2.2228, + "step": 61900 + }, + { + "epoch": 20.038784744667097, + "grad_norm": 1.5406187772750854, + "learning_rate": 0.001, + "loss": 2.1074, + "step": 62000 + }, + { + "epoch": 20.071105365223012, + "grad_norm": 1.2194257974624634, + "learning_rate": 0.001, + "loss": 2.128, + "step": 62100 + }, + { + "epoch": 20.103425985778927, + "grad_norm": 1.000863790512085, + "learning_rate": 0.001, + "loss": 2.1116, + "step": 62200 + }, + { + "epoch": 20.13574660633484, + "grad_norm": 1.1983182430267334, + "learning_rate": 0.001, + "loss": 2.1301, + "step": 62300 + }, + { + "epoch": 20.168067226890756, + "grad_norm": 1.2805134057998657, + "learning_rate": 0.001, + "loss": 2.1357, + "step": 62400 + }, + { + "epoch": 20.20038784744667, + "grad_norm": 1.5315334796905518, + "learning_rate": 0.001, + "loss": 2.127, + "step": 62500 + }, + { + "epoch": 20.232708468002585, + "grad_norm": 1.239235520362854, + "learning_rate": 0.001, + "loss": 2.1294, + "step": 62600 + }, + { + "epoch": 20.2650290885585, + "grad_norm": 1.214128851890564, + "learning_rate": 0.001, + "loss": 2.1297, + "step": 62700 + }, + { + "epoch": 20.297349709114414, + "grad_norm": 1.1846396923065186, + "learning_rate": 0.001, + "loss": 2.1661, + "step": 62800 + }, + { + "epoch": 20.32967032967033, + "grad_norm": 1.3728803396224976, + "learning_rate": 0.001, + "loss": 2.1509, + "step": 62900 + }, + { + "epoch": 20.361990950226243, + "grad_norm": 1.369428038597107, + "learning_rate": 0.001, + "loss": 2.1814, + "step": 63000 + }, + { + "epoch": 20.394311570782158, + "grad_norm": 1.49150812625885, + "learning_rate": 0.001, + "loss": 2.156, + "step": 63100 + }, + { + "epoch": 20.426632191338072, + "grad_norm": 1.056602954864502, + "learning_rate": 0.001, + "loss": 2.1691, + "step": 63200 + }, + { + "epoch": 20.458952811893987, + "grad_norm": 1.071666955947876, + "learning_rate": 0.001, + "loss": 2.1601, + "step": 63300 + }, + { + "epoch": 20.4912734324499, + "grad_norm": 1.1927623748779297, + "learning_rate": 0.001, + "loss": 2.1771, + "step": 63400 + }, + { + "epoch": 20.52359405300582, + "grad_norm": 1.1696590185165405, + "learning_rate": 0.001, + "loss": 2.1663, + "step": 63500 + }, + { + "epoch": 20.555914673561734, + "grad_norm": 1.454006314277649, + "learning_rate": 0.001, + "loss": 2.1836, + "step": 63600 + }, + { + "epoch": 20.58823529411765, + "grad_norm": 1.0862654447555542, + "learning_rate": 0.001, + "loss": 2.1822, + "step": 63700 + }, + { + "epoch": 20.620555914673563, + "grad_norm": 1.1355229616165161, + "learning_rate": 0.001, + "loss": 2.1819, + "step": 63800 + }, + { + "epoch": 20.652876535229478, + "grad_norm": 1.1009161472320557, + "learning_rate": 0.001, + "loss": 2.1741, + "step": 63900 + }, + { + "epoch": 20.685197155785392, + "grad_norm": 1.2176330089569092, + "learning_rate": 0.001, + "loss": 2.1882, + "step": 64000 + }, + { + "epoch": 20.717517776341307, + "grad_norm": 1.3134511709213257, + "learning_rate": 0.001, + "loss": 2.1978, + "step": 64100 + }, + { + "epoch": 20.74983839689722, + "grad_norm": 1.0069459676742554, + "learning_rate": 0.001, + "loss": 2.2084, + "step": 64200 + }, + { + "epoch": 20.782159017453136, + "grad_norm": 1.4432331323623657, + "learning_rate": 0.001, + "loss": 2.1979, + "step": 64300 + }, + { + "epoch": 20.81447963800905, + "grad_norm": 1.021673321723938, + "learning_rate": 0.001, + "loss": 2.2206, + "step": 64400 + }, + { + "epoch": 20.846800258564965, + "grad_norm": 1.5848689079284668, + "learning_rate": 0.001, + "loss": 2.2038, + "step": 64500 + }, + { + "epoch": 20.87912087912088, + "grad_norm": 1.2562905550003052, + "learning_rate": 0.001, + "loss": 2.2304, + "step": 64600 + }, + { + "epoch": 20.911441499676794, + "grad_norm": 1.084649920463562, + "learning_rate": 0.001, + "loss": 2.2199, + "step": 64700 + }, + { + "epoch": 20.94376212023271, + "grad_norm": 1.2564764022827148, + "learning_rate": 0.001, + "loss": 2.2264, + "step": 64800 + }, + { + "epoch": 20.976082740788623, + "grad_norm": 1.4754396677017212, + "learning_rate": 0.001, + "loss": 2.2299, + "step": 64900 + }, + { + "epoch": 21.008403361344538, + "grad_norm": 1.1850255727767944, + "learning_rate": 0.001, + "loss": 2.1639, + "step": 65000 + }, + { + "epoch": 21.040723981900452, + "grad_norm": 1.1101585626602173, + "learning_rate": 0.001, + "loss": 2.0785, + "step": 65100 + }, + { + "epoch": 21.073044602456367, + "grad_norm": 1.2771697044372559, + "learning_rate": 0.001, + "loss": 2.084, + "step": 65200 + }, + { + "epoch": 21.10536522301228, + "grad_norm": 1.1103767156600952, + "learning_rate": 0.001, + "loss": 2.0889, + "step": 65300 + }, + { + "epoch": 21.137685843568196, + "grad_norm": 1.2200546264648438, + "learning_rate": 0.001, + "loss": 2.0993, + "step": 65400 + }, + { + "epoch": 21.17000646412411, + "grad_norm": 1.3447659015655518, + "learning_rate": 0.001, + "loss": 2.0916, + "step": 65500 + }, + { + "epoch": 21.202327084680025, + "grad_norm": 2.29350209236145, + "learning_rate": 0.001, + "loss": 2.1049, + "step": 65600 + }, + { + "epoch": 21.23464770523594, + "grad_norm": 1.195257306098938, + "learning_rate": 0.001, + "loss": 2.1069, + "step": 65700 + }, + { + "epoch": 21.266968325791854, + "grad_norm": 1.0652481317520142, + "learning_rate": 0.001, + "loss": 2.1048, + "step": 65800 + }, + { + "epoch": 21.29928894634777, + "grad_norm": 1.1504040956497192, + "learning_rate": 0.001, + "loss": 2.1239, + "step": 65900 + }, + { + "epoch": 21.331609566903683, + "grad_norm": 1.2053735256195068, + "learning_rate": 0.001, + "loss": 2.1326, + "step": 66000 + }, + { + "epoch": 21.363930187459598, + "grad_norm": 39.07048034667969, + "learning_rate": 0.001, + "loss": 2.112, + "step": 66100 + }, + { + "epoch": 21.396250808015512, + "grad_norm": 1.1385326385498047, + "learning_rate": 0.001, + "loss": 2.1137, + "step": 66200 + }, + { + "epoch": 21.428571428571427, + "grad_norm": 1.2207857370376587, + "learning_rate": 0.001, + "loss": 2.125, + "step": 66300 + }, + { + "epoch": 21.46089204912734, + "grad_norm": 1.2614213228225708, + "learning_rate": 0.001, + "loss": 2.1434, + "step": 66400 + }, + { + "epoch": 21.49321266968326, + "grad_norm": 1.3514631986618042, + "learning_rate": 0.001, + "loss": 2.1106, + "step": 66500 + }, + { + "epoch": 21.525533290239174, + "grad_norm": 1.471451759338379, + "learning_rate": 0.001, + "loss": 2.1038, + "step": 66600 + }, + { + "epoch": 21.55785391079509, + "grad_norm": 1.3486419916152954, + "learning_rate": 0.001, + "loss": 2.1398, + "step": 66700 + }, + { + "epoch": 21.590174531351003, + "grad_norm": 3.350062847137451, + "learning_rate": 0.001, + "loss": 2.1478, + "step": 66800 + }, + { + "epoch": 21.622495151906918, + "grad_norm": 1.3389320373535156, + "learning_rate": 0.001, + "loss": 2.1472, + "step": 66900 + }, + { + "epoch": 21.654815772462833, + "grad_norm": 1.0626788139343262, + "learning_rate": 0.001, + "loss": 2.1595, + "step": 67000 + }, + { + "epoch": 21.687136393018747, + "grad_norm": 1.2367748022079468, + "learning_rate": 0.001, + "loss": 2.1655, + "step": 67100 + }, + { + "epoch": 21.71945701357466, + "grad_norm": 1.1122276782989502, + "learning_rate": 0.001, + "loss": 2.179, + "step": 67200 + }, + { + "epoch": 21.751777634130576, + "grad_norm": 1.179870367050171, + "learning_rate": 0.001, + "loss": 2.1743, + "step": 67300 + }, + { + "epoch": 21.78409825468649, + "grad_norm": 1.1807243824005127, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 67400 + }, + { + "epoch": 21.816418875242405, + "grad_norm": 1.0804619789123535, + "learning_rate": 0.001, + "loss": 2.192, + "step": 67500 + }, + { + "epoch": 21.84873949579832, + "grad_norm": 1.6039589643478394, + "learning_rate": 0.001, + "loss": 2.1808, + "step": 67600 + }, + { + "epoch": 21.881060116354234, + "grad_norm": 1.2812756299972534, + "learning_rate": 0.001, + "loss": 2.1729, + "step": 67700 + }, + { + "epoch": 21.91338073691015, + "grad_norm": 1.1737068891525269, + "learning_rate": 0.001, + "loss": 2.1592, + "step": 67800 + }, + { + "epoch": 21.945701357466064, + "grad_norm": 1.1612744331359863, + "learning_rate": 0.001, + "loss": 2.1783, + "step": 67900 + }, + { + "epoch": 21.978021978021978, + "grad_norm": 1.238431692123413, + "learning_rate": 0.001, + "loss": 2.1802, + "step": 68000 + }, + { + "epoch": 22.010342598577893, + "grad_norm": 1.3498260974884033, + "learning_rate": 0.001, + "loss": 2.1187, + "step": 68100 + }, + { + "epoch": 22.042663219133807, + "grad_norm": 1.307900309562683, + "learning_rate": 0.001, + "loss": 2.0347, + "step": 68200 + }, + { + "epoch": 22.07498383968972, + "grad_norm": 1.265341877937317, + "learning_rate": 0.001, + "loss": 2.0262, + "step": 68300 + }, + { + "epoch": 22.107304460245636, + "grad_norm": 1.0917607545852661, + "learning_rate": 0.001, + "loss": 2.0515, + "step": 68400 + }, + { + "epoch": 22.13962508080155, + "grad_norm": 1.6194117069244385, + "learning_rate": 0.001, + "loss": 2.0563, + "step": 68500 + }, + { + "epoch": 22.171945701357465, + "grad_norm": 1.8932991027832031, + "learning_rate": 0.001, + "loss": 2.0652, + "step": 68600 + }, + { + "epoch": 22.20426632191338, + "grad_norm": 1.2356934547424316, + "learning_rate": 0.001, + "loss": 2.0741, + "step": 68700 + }, + { + "epoch": 22.236586942469295, + "grad_norm": 1.2971307039260864, + "learning_rate": 0.001, + "loss": 2.0682, + "step": 68800 + }, + { + "epoch": 22.26890756302521, + "grad_norm": 1.5780755281448364, + "learning_rate": 0.001, + "loss": 2.0567, + "step": 68900 + }, + { + "epoch": 22.301228183581124, + "grad_norm": 1.664420247077942, + "learning_rate": 0.001, + "loss": 2.0788, + "step": 69000 + }, + { + "epoch": 22.33354880413704, + "grad_norm": 1.3689608573913574, + "learning_rate": 0.001, + "loss": 2.0612, + "step": 69100 + }, + { + "epoch": 22.365869424692953, + "grad_norm": 1.2644816637039185, + "learning_rate": 0.001, + "loss": 2.1076, + "step": 69200 + }, + { + "epoch": 22.398190045248867, + "grad_norm": 1.6748441457748413, + "learning_rate": 0.001, + "loss": 2.1104, + "step": 69300 + }, + { + "epoch": 22.430510665804782, + "grad_norm": 1.0100698471069336, + "learning_rate": 0.001, + "loss": 2.1049, + "step": 69400 + }, + { + "epoch": 22.462831286360696, + "grad_norm": 1.4298042058944702, + "learning_rate": 0.001, + "loss": 2.1022, + "step": 69500 + }, + { + "epoch": 22.49515190691661, + "grad_norm": 1.8333765268325806, + "learning_rate": 0.001, + "loss": 2.12, + "step": 69600 + }, + { + "epoch": 22.52747252747253, + "grad_norm": 1.4487437009811401, + "learning_rate": 0.001, + "loss": 2.1007, + "step": 69700 + }, + { + "epoch": 22.559793148028444, + "grad_norm": 1.6013681888580322, + "learning_rate": 0.001, + "loss": 2.1079, + "step": 69800 + }, + { + "epoch": 22.59211376858436, + "grad_norm": 2.3832428455352783, + "learning_rate": 0.001, + "loss": 2.1165, + "step": 69900 + }, + { + "epoch": 22.624434389140273, + "grad_norm": 1.600501298904419, + "learning_rate": 0.001, + "loss": 2.1214, + "step": 70000 + }, + { + "epoch": 22.656755009696187, + "grad_norm": 1.5591310262680054, + "learning_rate": 0.001, + "loss": 2.115, + "step": 70100 + }, + { + "epoch": 22.689075630252102, + "grad_norm": 1.2109787464141846, + "learning_rate": 0.001, + "loss": 2.1442, + "step": 70200 + }, + { + "epoch": 22.721396250808017, + "grad_norm": 1.465110421180725, + "learning_rate": 0.001, + "loss": 2.1413, + "step": 70300 + }, + { + "epoch": 22.75371687136393, + "grad_norm": 1.2152010202407837, + "learning_rate": 0.001, + "loss": 2.1242, + "step": 70400 + }, + { + "epoch": 22.786037491919846, + "grad_norm": 1.4363352060317993, + "learning_rate": 0.001, + "loss": 2.1379, + "step": 70500 + }, + { + "epoch": 22.81835811247576, + "grad_norm": 1.399573564529419, + "learning_rate": 0.001, + "loss": 2.138, + "step": 70600 + }, + { + "epoch": 22.850678733031675, + "grad_norm": 1.378006100654602, + "learning_rate": 0.001, + "loss": 2.1285, + "step": 70700 + }, + { + "epoch": 22.88299935358759, + "grad_norm": 1.274100422859192, + "learning_rate": 0.001, + "loss": 2.1528, + "step": 70800 + }, + { + "epoch": 22.915319974143504, + "grad_norm": 1.2786856889724731, + "learning_rate": 0.001, + "loss": 2.1371, + "step": 70900 + }, + { + "epoch": 22.94764059469942, + "grad_norm": 1.3367137908935547, + "learning_rate": 0.001, + "loss": 2.1356, + "step": 71000 + }, + { + "epoch": 22.979961215255333, + "grad_norm": 1.1747994422912598, + "learning_rate": 0.001, + "loss": 2.1513, + "step": 71100 + }, + { + "epoch": 23.012281835811248, + "grad_norm": 1.472936987876892, + "learning_rate": 0.001, + "loss": 2.0644, + "step": 71200 + }, + { + "epoch": 23.044602456367162, + "grad_norm": 1.5637643337249756, + "learning_rate": 0.001, + "loss": 2.0074, + "step": 71300 + }, + { + "epoch": 23.076923076923077, + "grad_norm": 1.5909124612808228, + "learning_rate": 0.001, + "loss": 1.99, + "step": 71400 + }, + { + "epoch": 23.10924369747899, + "grad_norm": 1.779449462890625, + "learning_rate": 0.001, + "loss": 2.0129, + "step": 71500 + }, + { + "epoch": 23.141564318034906, + "grad_norm": 1.4273805618286133, + "learning_rate": 0.001, + "loss": 2.0336, + "step": 71600 + }, + { + "epoch": 23.17388493859082, + "grad_norm": 1.4807401895523071, + "learning_rate": 0.001, + "loss": 2.0258, + "step": 71700 + }, + { + "epoch": 23.206205559146735, + "grad_norm": 1.5846195220947266, + "learning_rate": 0.001, + "loss": 2.0398, + "step": 71800 + }, + { + "epoch": 23.23852617970265, + "grad_norm": 1.2522149085998535, + "learning_rate": 0.001, + "loss": 2.0358, + "step": 71900 + }, + { + "epoch": 23.270846800258564, + "grad_norm": 1.8011384010314941, + "learning_rate": 0.001, + "loss": 2.0368, + "step": 72000 + }, + { + "epoch": 23.30316742081448, + "grad_norm": 1.339313268661499, + "learning_rate": 0.001, + "loss": 2.0504, + "step": 72100 + }, + { + "epoch": 23.335488041370393, + "grad_norm": 1.317734956741333, + "learning_rate": 0.001, + "loss": 2.0429, + "step": 72200 + }, + { + "epoch": 23.367808661926308, + "grad_norm": 1.351259708404541, + "learning_rate": 0.001, + "loss": 2.0514, + "step": 72300 + }, + { + "epoch": 23.400129282482222, + "grad_norm": 1.3582738637924194, + "learning_rate": 0.001, + "loss": 2.0577, + "step": 72400 + }, + { + "epoch": 23.432449903038137, + "grad_norm": 1.813624382019043, + "learning_rate": 0.001, + "loss": 2.0445, + "step": 72500 + }, + { + "epoch": 23.46477052359405, + "grad_norm": 1.531417727470398, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 72600 + }, + { + "epoch": 23.49709114414997, + "grad_norm": 1.5919642448425293, + "learning_rate": 0.001, + "loss": 2.0691, + "step": 72700 + }, + { + "epoch": 23.529411764705884, + "grad_norm": 1.5767107009887695, + "learning_rate": 0.001, + "loss": 2.0789, + "step": 72800 + }, + { + "epoch": 23.5617323852618, + "grad_norm": 1.5957386493682861, + "learning_rate": 0.001, + "loss": 2.067, + "step": 72900 + }, + { + "epoch": 23.594053005817713, + "grad_norm": 1.8179656267166138, + "learning_rate": 0.001, + "loss": 2.0862, + "step": 73000 + }, + { + "epoch": 23.626373626373628, + "grad_norm": 1.5586670637130737, + "learning_rate": 0.001, + "loss": 2.0848, + "step": 73100 + }, + { + "epoch": 23.658694246929542, + "grad_norm": 1.4760098457336426, + "learning_rate": 0.001, + "loss": 2.106, + "step": 73200 + }, + { + "epoch": 23.691014867485457, + "grad_norm": 1.4071135520935059, + "learning_rate": 0.001, + "loss": 2.0928, + "step": 73300 + }, + { + "epoch": 23.72333548804137, + "grad_norm": 1.3541771173477173, + "learning_rate": 0.001, + "loss": 2.1112, + "step": 73400 + }, + { + "epoch": 23.755656108597286, + "grad_norm": 1.6055703163146973, + "learning_rate": 0.001, + "loss": 2.1058, + "step": 73500 + }, + { + "epoch": 23.7879767291532, + "grad_norm": 1.7289507389068604, + "learning_rate": 0.001, + "loss": 2.1219, + "step": 73600 + }, + { + "epoch": 23.820297349709115, + "grad_norm": 1.8029732704162598, + "learning_rate": 0.001, + "loss": 2.083, + "step": 73700 + }, + { + "epoch": 23.85261797026503, + "grad_norm": 1.8605992794036865, + "learning_rate": 0.001, + "loss": 2.103, + "step": 73800 + }, + { + "epoch": 23.884938590820944, + "grad_norm": 1.1460589170455933, + "learning_rate": 0.001, + "loss": 2.0971, + "step": 73900 + }, + { + "epoch": 23.91725921137686, + "grad_norm": 1.4249467849731445, + "learning_rate": 0.001, + "loss": 2.1331, + "step": 74000 + }, + { + "epoch": 23.949579831932773, + "grad_norm": 1.4224003553390503, + "learning_rate": 0.001, + "loss": 2.1221, + "step": 74100 + }, + { + "epoch": 23.981900452488688, + "grad_norm": 1.3229635953903198, + "learning_rate": 0.001, + "loss": 2.1182, + "step": 74200 + }, + { + "epoch": 24.014221073044602, + "grad_norm": 1.3930984735488892, + "learning_rate": 0.001, + "loss": 2.0503, + "step": 74300 + }, + { + "epoch": 24.046541693600517, + "grad_norm": 1.4770911931991577, + "learning_rate": 0.001, + "loss": 1.9697, + "step": 74400 + }, + { + "epoch": 24.07886231415643, + "grad_norm": 1.6999855041503906, + "learning_rate": 0.001, + "loss": 1.9748, + "step": 74500 + }, + { + "epoch": 24.111182934712346, + "grad_norm": 1.63511061668396, + "learning_rate": 0.001, + "loss": 1.9672, + "step": 74600 + }, + { + "epoch": 24.14350355526826, + "grad_norm": 1.4599164724349976, + "learning_rate": 0.001, + "loss": 1.9993, + "step": 74700 + }, + { + "epoch": 24.175824175824175, + "grad_norm": 1.9511414766311646, + "learning_rate": 0.001, + "loss": 2.0237, + "step": 74800 + }, + { + "epoch": 24.20814479638009, + "grad_norm": 1.46794593334198, + "learning_rate": 0.001, + "loss": 1.985, + "step": 74900 + }, + { + "epoch": 24.240465416936004, + "grad_norm": 1.2807059288024902, + "learning_rate": 0.001, + "loss": 2.0034, + "step": 75000 + }, + { + "epoch": 24.27278603749192, + "grad_norm": 1.7410281896591187, + "learning_rate": 0.001, + "loss": 2.024, + "step": 75100 + }, + { + "epoch": 24.305106658047833, + "grad_norm": 1.439113736152649, + "learning_rate": 0.001, + "loss": 2.0233, + "step": 75200 + }, + { + "epoch": 24.337427278603748, + "grad_norm": 1.419325351715088, + "learning_rate": 0.001, + "loss": 2.0295, + "step": 75300 + }, + { + "epoch": 24.369747899159663, + "grad_norm": 1.564228892326355, + "learning_rate": 0.001, + "loss": 2.0153, + "step": 75400 + }, + { + "epoch": 24.402068519715577, + "grad_norm": 1.38504159450531, + "learning_rate": 0.001, + "loss": 2.0205, + "step": 75500 + }, + { + "epoch": 24.43438914027149, + "grad_norm": 1.453363060951233, + "learning_rate": 0.001, + "loss": 2.0271, + "step": 75600 + }, + { + "epoch": 24.466709760827406, + "grad_norm": 1.6083861589431763, + "learning_rate": 0.001, + "loss": 2.014, + "step": 75700 + }, + { + "epoch": 24.49903038138332, + "grad_norm": 1.7092853784561157, + "learning_rate": 0.001, + "loss": 2.0424, + "step": 75800 + }, + { + "epoch": 24.53135100193924, + "grad_norm": 1.4002851247787476, + "learning_rate": 0.001, + "loss": 2.0249, + "step": 75900 + }, + { + "epoch": 24.563671622495153, + "grad_norm": 1.5491043329238892, + "learning_rate": 0.001, + "loss": 2.0572, + "step": 76000 + }, + { + "epoch": 24.595992243051068, + "grad_norm": 1.3030824661254883, + "learning_rate": 0.001, + "loss": 2.0439, + "step": 76100 + }, + { + "epoch": 24.628312863606983, + "grad_norm": 1.2916637659072876, + "learning_rate": 0.001, + "loss": 2.0471, + "step": 76200 + }, + { + "epoch": 24.660633484162897, + "grad_norm": 1.4240041971206665, + "learning_rate": 0.001, + "loss": 2.0634, + "step": 76300 + }, + { + "epoch": 24.69295410471881, + "grad_norm": 1.753969669342041, + "learning_rate": 0.001, + "loss": 2.0657, + "step": 76400 + }, + { + "epoch": 24.725274725274726, + "grad_norm": 1.2906912565231323, + "learning_rate": 0.001, + "loss": 2.0597, + "step": 76500 + }, + { + "epoch": 24.75759534583064, + "grad_norm": 1.5350031852722168, + "learning_rate": 0.001, + "loss": 2.0839, + "step": 76600 + }, + { + "epoch": 24.789915966386555, + "grad_norm": 1.9816159009933472, + "learning_rate": 0.001, + "loss": 2.057, + "step": 76700 + }, + { + "epoch": 24.82223658694247, + "grad_norm": 1.9255553483963013, + "learning_rate": 0.001, + "loss": 2.0699, + "step": 76800 + }, + { + "epoch": 24.854557207498384, + "grad_norm": 1.4984052181243896, + "learning_rate": 0.001, + "loss": 2.0713, + "step": 76900 + }, + { + "epoch": 24.8868778280543, + "grad_norm": 4.322579860687256, + "learning_rate": 0.001, + "loss": 2.1062, + "step": 77000 + }, + { + "epoch": 24.919198448610214, + "grad_norm": 1.597294569015503, + "learning_rate": 0.001, + "loss": 2.0879, + "step": 77100 + }, + { + "epoch": 24.951519069166128, + "grad_norm": 1.4483535289764404, + "learning_rate": 0.001, + "loss": 2.0873, + "step": 77200 + }, + { + "epoch": 24.983839689722043, + "grad_norm": 1.4084569215774536, + "learning_rate": 0.001, + "loss": 2.0913, + "step": 77300 + }, + { + "epoch": 25.016160310277957, + "grad_norm": 1.9849538803100586, + "learning_rate": 0.001, + "loss": 1.9734, + "step": 77400 + }, + { + "epoch": 25.048480930833872, + "grad_norm": 2.1516408920288086, + "learning_rate": 0.001, + "loss": 1.9801, + "step": 77500 + }, + { + "epoch": 25.080801551389786, + "grad_norm": 2.2661306858062744, + "learning_rate": 0.001, + "loss": 1.937, + "step": 77600 + }, + { + "epoch": 25.1131221719457, + "grad_norm": 2.510815382003784, + "learning_rate": 0.001, + "loss": 1.9473, + "step": 77700 + }, + { + "epoch": 25.145442792501616, + "grad_norm": 2.2111470699310303, + "learning_rate": 0.001, + "loss": 1.96, + "step": 77800 + }, + { + "epoch": 25.17776341305753, + "grad_norm": 2.111010789871216, + "learning_rate": 0.001, + "loss": 1.9912, + "step": 77900 + }, + { + "epoch": 25.210084033613445, + "grad_norm": 2.753941297531128, + "learning_rate": 0.001, + "loss": 1.9919, + "step": 78000 + }, + { + "epoch": 25.24240465416936, + "grad_norm": 2.347527027130127, + "learning_rate": 0.001, + "loss": 1.9843, + "step": 78100 + }, + { + "epoch": 25.274725274725274, + "grad_norm": 2.777312755584717, + "learning_rate": 0.001, + "loss": 1.9692, + "step": 78200 + }, + { + "epoch": 25.30704589528119, + "grad_norm": 1.7811833620071411, + "learning_rate": 0.001, + "loss": 2.0007, + "step": 78300 + }, + { + "epoch": 25.339366515837103, + "grad_norm": 2.1932690143585205, + "learning_rate": 0.001, + "loss": 2.0012, + "step": 78400 + }, + { + "epoch": 25.371687136393017, + "grad_norm": 2.195629596710205, + "learning_rate": 0.001, + "loss": 1.9921, + "step": 78500 + }, + { + "epoch": 25.404007756948932, + "grad_norm": 2.693999767303467, + "learning_rate": 0.001, + "loss": 2.0042, + "step": 78600 + }, + { + "epoch": 25.436328377504847, + "grad_norm": 3.0932207107543945, + "learning_rate": 0.001, + "loss": 1.9884, + "step": 78700 + }, + { + "epoch": 25.46864899806076, + "grad_norm": 2.486372232437134, + "learning_rate": 0.001, + "loss": 2.0082, + "step": 78800 + }, + { + "epoch": 25.50096961861668, + "grad_norm": 2.309953212738037, + "learning_rate": 0.001, + "loss": 1.9836, + "step": 78900 + }, + { + "epoch": 25.533290239172594, + "grad_norm": 2.481170892715454, + "learning_rate": 0.001, + "loss": 2.0401, + "step": 79000 + }, + { + "epoch": 25.56561085972851, + "grad_norm": 6.894639492034912, + "learning_rate": 0.001, + "loss": 2.0187, + "step": 79100 + }, + { + "epoch": 25.597931480284423, + "grad_norm": 2.154688596725464, + "learning_rate": 0.001, + "loss": 2.0334, + "step": 79200 + }, + { + "epoch": 25.630252100840337, + "grad_norm": 2.5269687175750732, + "learning_rate": 0.001, + "loss": 2.0062, + "step": 79300 + }, + { + "epoch": 25.662572721396252, + "grad_norm": 2.0479533672332764, + "learning_rate": 0.001, + "loss": 2.0332, + "step": 79400 + }, + { + "epoch": 25.694893341952167, + "grad_norm": 2.8446123600006104, + "learning_rate": 0.001, + "loss": 2.0332, + "step": 79500 + }, + { + "epoch": 25.72721396250808, + "grad_norm": 1.9865922927856445, + "learning_rate": 0.001, + "loss": 2.0408, + "step": 79600 + }, + { + "epoch": 25.759534583063996, + "grad_norm": 2.4870991706848145, + "learning_rate": 0.001, + "loss": 2.0251, + "step": 79700 + }, + { + "epoch": 25.79185520361991, + "grad_norm": 2.0632777214050293, + "learning_rate": 0.001, + "loss": 2.0453, + "step": 79800 + }, + { + "epoch": 25.824175824175825, + "grad_norm": 2.7556283473968506, + "learning_rate": 0.001, + "loss": 2.0285, + "step": 79900 + }, + { + "epoch": 25.85649644473174, + "grad_norm": 2.0263702869415283, + "learning_rate": 0.001, + "loss": 2.0623, + "step": 80000 + }, + { + "epoch": 25.888817065287654, + "grad_norm": 3.15863299369812, + "learning_rate": 0.001, + "loss": 2.0461, + "step": 80100 + }, + { + "epoch": 25.92113768584357, + "grad_norm": 2.3202335834503174, + "learning_rate": 0.001, + "loss": 2.0489, + "step": 80200 + }, + { + "epoch": 25.953458306399483, + "grad_norm": 2.275404691696167, + "learning_rate": 0.001, + "loss": 2.0559, + "step": 80300 + }, + { + "epoch": 25.985778926955398, + "grad_norm": 1.9701374769210815, + "learning_rate": 0.001, + "loss": 2.0521, + "step": 80400 + }, + { + "epoch": 26.018099547511312, + "grad_norm": 2.0686566829681396, + "learning_rate": 0.001, + "loss": 1.982, + "step": 80500 + }, + { + "epoch": 26.050420168067227, + "grad_norm": 1.7882318496704102, + "learning_rate": 0.001, + "loss": 1.9092, + "step": 80600 + }, + { + "epoch": 26.08274078862314, + "grad_norm": 1.3887428045272827, + "learning_rate": 0.001, + "loss": 1.9209, + "step": 80700 + }, + { + "epoch": 26.115061409179056, + "grad_norm": 1.775455117225647, + "learning_rate": 0.001, + "loss": 1.935, + "step": 80800 + }, + { + "epoch": 26.14738202973497, + "grad_norm": 1.9676622152328491, + "learning_rate": 0.001, + "loss": 1.9301, + "step": 80900 + }, + { + "epoch": 26.179702650290885, + "grad_norm": 1.7484667301177979, + "learning_rate": 0.001, + "loss": 1.9365, + "step": 81000 + }, + { + "epoch": 26.2120232708468, + "grad_norm": 1.333925485610962, + "learning_rate": 0.001, + "loss": 1.9396, + "step": 81100 + }, + { + "epoch": 26.244343891402714, + "grad_norm": 1.3024888038635254, + "learning_rate": 0.001, + "loss": 1.9488, + "step": 81200 + }, + { + "epoch": 26.27666451195863, + "grad_norm": 1.3314354419708252, + "learning_rate": 0.001, + "loss": 1.9625, + "step": 81300 + }, + { + "epoch": 26.308985132514543, + "grad_norm": 2.3453688621520996, + "learning_rate": 0.001, + "loss": 1.9588, + "step": 81400 + }, + { + "epoch": 26.341305753070458, + "grad_norm": 1.459130048751831, + "learning_rate": 0.001, + "loss": 1.9716, + "step": 81500 + }, + { + "epoch": 26.373626373626372, + "grad_norm": 1.3043862581253052, + "learning_rate": 0.001, + "loss": 1.9662, + "step": 81600 + }, + { + "epoch": 26.405946994182287, + "grad_norm": 1.2890630960464478, + "learning_rate": 0.001, + "loss": 1.9729, + "step": 81700 + }, + { + "epoch": 26.4382676147382, + "grad_norm": 1.2323054075241089, + "learning_rate": 0.001, + "loss": 1.9562, + "step": 81800 + }, + { + "epoch": 26.470588235294116, + "grad_norm": 1.9107179641723633, + "learning_rate": 0.001, + "loss": 1.9665, + "step": 81900 + }, + { + "epoch": 26.50290885585003, + "grad_norm": 1.5742831230163574, + "learning_rate": 0.001, + "loss": 1.9809, + "step": 82000 + }, + { + "epoch": 26.53522947640595, + "grad_norm": 1.3814857006072998, + "learning_rate": 0.001, + "loss": 1.975, + "step": 82100 + }, + { + "epoch": 26.567550096961863, + "grad_norm": 1.3042103052139282, + "learning_rate": 0.001, + "loss": 1.9745, + "step": 82200 + }, + { + "epoch": 26.599870717517778, + "grad_norm": 1.6151447296142578, + "learning_rate": 0.001, + "loss": 1.9872, + "step": 82300 + }, + { + "epoch": 26.632191338073692, + "grad_norm": 1.6068259477615356, + "learning_rate": 0.001, + "loss": 1.9933, + "step": 82400 + }, + { + "epoch": 26.664511958629607, + "grad_norm": 1.3208508491516113, + "learning_rate": 0.001, + "loss": 2.0022, + "step": 82500 + }, + { + "epoch": 26.69683257918552, + "grad_norm": 1.5930817127227783, + "learning_rate": 0.001, + "loss": 1.9939, + "step": 82600 + }, + { + "epoch": 26.729153199741436, + "grad_norm": 1.5000683069229126, + "learning_rate": 0.001, + "loss": 2.0076, + "step": 82700 + }, + { + "epoch": 26.76147382029735, + "grad_norm": 1.692630410194397, + "learning_rate": 0.001, + "loss": 2.002, + "step": 82800 + }, + { + "epoch": 26.793794440853265, + "grad_norm": 2.1297543048858643, + "learning_rate": 0.001, + "loss": 2.024, + "step": 82900 + }, + { + "epoch": 26.82611506140918, + "grad_norm": 1.2182215452194214, + "learning_rate": 0.001, + "loss": 2.0362, + "step": 83000 + }, + { + "epoch": 26.858435681965094, + "grad_norm": 1.3465772867202759, + "learning_rate": 0.001, + "loss": 2.0525, + "step": 83100 + }, + { + "epoch": 26.89075630252101, + "grad_norm": 1.9355134963989258, + "learning_rate": 0.001, + "loss": 2.0377, + "step": 83200 + }, + { + "epoch": 26.923076923076923, + "grad_norm": 1.3993531465530396, + "learning_rate": 0.001, + "loss": 2.0204, + "step": 83300 + }, + { + "epoch": 26.955397543632838, + "grad_norm": 1.3632115125656128, + "learning_rate": 0.001, + "loss": 2.0247, + "step": 83400 + }, + { + "epoch": 26.987718164188752, + "grad_norm": 1.70760977268219, + "learning_rate": 0.001, + "loss": 2.0275, + "step": 83500 + }, + { + "epoch": 27.020038784744667, + "grad_norm": 1.203029751777649, + "learning_rate": 0.001, + "loss": 1.951, + "step": 83600 + }, + { + "epoch": 27.05235940530058, + "grad_norm": 1.2251530885696411, + "learning_rate": 0.001, + "loss": 1.8761, + "step": 83700 + }, + { + "epoch": 27.084680025856496, + "grad_norm": 1.2010034322738647, + "learning_rate": 0.001, + "loss": 1.8784, + "step": 83800 + }, + { + "epoch": 27.11700064641241, + "grad_norm": 1.628166913986206, + "learning_rate": 0.001, + "loss": 1.9073, + "step": 83900 + }, + { + "epoch": 27.149321266968325, + "grad_norm": 1.467311143875122, + "learning_rate": 0.001, + "loss": 1.9048, + "step": 84000 + }, + { + "epoch": 27.18164188752424, + "grad_norm": 1.3212134838104248, + "learning_rate": 0.001, + "loss": 1.9009, + "step": 84100 + }, + { + "epoch": 27.213962508080154, + "grad_norm": 1.3070082664489746, + "learning_rate": 0.001, + "loss": 1.9234, + "step": 84200 + }, + { + "epoch": 27.24628312863607, + "grad_norm": 1.4710814952850342, + "learning_rate": 0.001, + "loss": 1.897, + "step": 84300 + }, + { + "epoch": 27.278603749191983, + "grad_norm": 1.5833498239517212, + "learning_rate": 0.001, + "loss": 1.9441, + "step": 84400 + }, + { + "epoch": 27.310924369747898, + "grad_norm": 1.346295952796936, + "learning_rate": 0.001, + "loss": 1.9243, + "step": 84500 + }, + { + "epoch": 27.343244990303813, + "grad_norm": 1.4825193881988525, + "learning_rate": 0.001, + "loss": 1.948, + "step": 84600 + }, + { + "epoch": 27.375565610859727, + "grad_norm": 1.489837884902954, + "learning_rate": 0.001, + "loss": 1.933, + "step": 84700 + }, + { + "epoch": 27.40788623141564, + "grad_norm": 1.3613611459732056, + "learning_rate": 0.001, + "loss": 1.9417, + "step": 84800 + }, + { + "epoch": 27.440206851971556, + "grad_norm": 1.3851776123046875, + "learning_rate": 0.001, + "loss": 1.9535, + "step": 84900 + }, + { + "epoch": 27.47252747252747, + "grad_norm": 1.228777289390564, + "learning_rate": 0.001, + "loss": 1.9639, + "step": 85000 + }, + { + "epoch": 27.50484809308339, + "grad_norm": 1.362752914428711, + "learning_rate": 0.001, + "loss": 1.9513, + "step": 85100 + }, + { + "epoch": 27.537168713639304, + "grad_norm": 1.3782377243041992, + "learning_rate": 0.001, + "loss": 1.9598, + "step": 85200 + }, + { + "epoch": 27.569489334195218, + "grad_norm": 1.31719970703125, + "learning_rate": 0.001, + "loss": 1.961, + "step": 85300 + }, + { + "epoch": 27.601809954751133, + "grad_norm": 1.4611492156982422, + "learning_rate": 0.001, + "loss": 1.9581, + "step": 85400 + }, + { + "epoch": 27.634130575307047, + "grad_norm": 1.6541672945022583, + "learning_rate": 0.001, + "loss": 1.9674, + "step": 85500 + }, + { + "epoch": 27.66645119586296, + "grad_norm": 1.4168950319290161, + "learning_rate": 0.001, + "loss": 1.9783, + "step": 85600 + }, + { + "epoch": 27.698771816418876, + "grad_norm": 1.565339207649231, + "learning_rate": 0.001, + "loss": 1.9772, + "step": 85700 + }, + { + "epoch": 27.73109243697479, + "grad_norm": 1.4636658430099487, + "learning_rate": 0.001, + "loss": 1.9662, + "step": 85800 + }, + { + "epoch": 27.763413057530705, + "grad_norm": 1.1653151512145996, + "learning_rate": 0.001, + "loss": 1.9753, + "step": 85900 + }, + { + "epoch": 27.79573367808662, + "grad_norm": 1.3461968898773193, + "learning_rate": 0.001, + "loss": 1.9591, + "step": 86000 + }, + { + "epoch": 27.828054298642535, + "grad_norm": 1.2734227180480957, + "learning_rate": 0.001, + "loss": 1.975, + "step": 86100 + }, + { + "epoch": 27.86037491919845, + "grad_norm": 1.3196417093276978, + "learning_rate": 0.001, + "loss": 1.9847, + "step": 86200 + }, + { + "epoch": 27.892695539754364, + "grad_norm": 1.2176880836486816, + "learning_rate": 0.001, + "loss": 1.9904, + "step": 86300 + }, + { + "epoch": 27.92501616031028, + "grad_norm": 1.1071490049362183, + "learning_rate": 0.001, + "loss": 1.9906, + "step": 86400 + }, + { + "epoch": 27.957336780866193, + "grad_norm": 1.5895119905471802, + "learning_rate": 0.001, + "loss": 2.0066, + "step": 86500 + }, + { + "epoch": 27.989657401422107, + "grad_norm": 1.3792082071304321, + "learning_rate": 0.001, + "loss": 1.9941, + "step": 86600 + }, + { + "epoch": 28.021978021978022, + "grad_norm": 1.2426801919937134, + "learning_rate": 0.001, + "loss": 1.9242, + "step": 86700 + }, + { + "epoch": 28.054298642533936, + "grad_norm": 1.7085320949554443, + "learning_rate": 0.001, + "loss": 1.8433, + "step": 86800 + }, + { + "epoch": 28.08661926308985, + "grad_norm": 1.3165888786315918, + "learning_rate": 0.001, + "loss": 1.8714, + "step": 86900 + }, + { + "epoch": 28.118939883645766, + "grad_norm": 1.4661237001419067, + "learning_rate": 0.001, + "loss": 1.8749, + "step": 87000 + }, + { + "epoch": 28.15126050420168, + "grad_norm": 1.6082890033721924, + "learning_rate": 0.001, + "loss": 1.8612, + "step": 87100 + }, + { + "epoch": 28.183581124757595, + "grad_norm": 1.3356757164001465, + "learning_rate": 0.001, + "loss": 1.8897, + "step": 87200 + }, + { + "epoch": 28.21590174531351, + "grad_norm": 1.557093620300293, + "learning_rate": 0.001, + "loss": 1.8871, + "step": 87300 + }, + { + "epoch": 28.248222365869424, + "grad_norm": 1.7916589975357056, + "learning_rate": 0.001, + "loss": 1.8883, + "step": 87400 + }, + { + "epoch": 28.28054298642534, + "grad_norm": 1.2493481636047363, + "learning_rate": 0.001, + "loss": 1.9002, + "step": 87500 + }, + { + "epoch": 28.312863606981253, + "grad_norm": 1.3864846229553223, + "learning_rate": 0.001, + "loss": 1.9133, + "step": 87600 + }, + { + "epoch": 28.345184227537167, + "grad_norm": 1.6483922004699707, + "learning_rate": 0.001, + "loss": 1.8825, + "step": 87700 + }, + { + "epoch": 28.377504848093082, + "grad_norm": 1.1912819147109985, + "learning_rate": 0.001, + "loss": 1.91, + "step": 87800 + }, + { + "epoch": 28.409825468648997, + "grad_norm": 1.1268420219421387, + "learning_rate": 0.001, + "loss": 1.9175, + "step": 87900 + }, + { + "epoch": 28.44214608920491, + "grad_norm": 1.364435076713562, + "learning_rate": 0.001, + "loss": 1.9084, + "step": 88000 + }, + { + "epoch": 28.474466709760826, + "grad_norm": 1.3538644313812256, + "learning_rate": 0.001, + "loss": 1.8976, + "step": 88100 + }, + { + "epoch": 28.50678733031674, + "grad_norm": 1.308135986328125, + "learning_rate": 0.001, + "loss": 1.9203, + "step": 88200 + }, + { + "epoch": 28.53910795087266, + "grad_norm": 1.5055309534072876, + "learning_rate": 0.001, + "loss": 1.9177, + "step": 88300 + }, + { + "epoch": 28.571428571428573, + "grad_norm": 1.4888206720352173, + "learning_rate": 0.001, + "loss": 1.9213, + "step": 88400 + }, + { + "epoch": 28.603749191984488, + "grad_norm": 1.112297773361206, + "learning_rate": 0.001, + "loss": 1.939, + "step": 88500 + }, + { + "epoch": 28.636069812540402, + "grad_norm": 1.3420555591583252, + "learning_rate": 0.001, + "loss": 1.9181, + "step": 88600 + }, + { + "epoch": 28.668390433096317, + "grad_norm": 1.143880009651184, + "learning_rate": 0.001, + "loss": 1.9568, + "step": 88700 + }, + { + "epoch": 28.70071105365223, + "grad_norm": 1.6893914937973022, + "learning_rate": 0.001, + "loss": 1.9262, + "step": 88800 + }, + { + "epoch": 28.733031674208146, + "grad_norm": 1.6080857515335083, + "learning_rate": 0.001, + "loss": 1.929, + "step": 88900 + }, + { + "epoch": 28.76535229476406, + "grad_norm": 1.342633605003357, + "learning_rate": 0.001, + "loss": 1.9665, + "step": 89000 + }, + { + "epoch": 28.797672915319975, + "grad_norm": 1.5504629611968994, + "learning_rate": 0.001, + "loss": 1.9502, + "step": 89100 + }, + { + "epoch": 28.82999353587589, + "grad_norm": 1.4633890390396118, + "learning_rate": 0.001, + "loss": 1.9644, + "step": 89200 + }, + { + "epoch": 28.862314156431804, + "grad_norm": 1.3921465873718262, + "learning_rate": 0.001, + "loss": 1.9698, + "step": 89300 + }, + { + "epoch": 28.89463477698772, + "grad_norm": 1.5433467626571655, + "learning_rate": 0.001, + "loss": 1.9694, + "step": 89400 + }, + { + "epoch": 28.926955397543633, + "grad_norm": 1.159615397453308, + "learning_rate": 0.001, + "loss": 1.968, + "step": 89500 + }, + { + "epoch": 28.959276018099548, + "grad_norm": 1.3793103694915771, + "learning_rate": 0.001, + "loss": 1.9649, + "step": 89600 + }, + { + "epoch": 28.991596638655462, + "grad_norm": 2.6745166778564453, + "learning_rate": 0.001, + "loss": 1.9738, + "step": 89700 + }, + { + "epoch": 29.023917259211377, + "grad_norm": 1.4499655961990356, + "learning_rate": 0.001, + "loss": 1.8535, + "step": 89800 + }, + { + "epoch": 29.05623787976729, + "grad_norm": 1.5100798606872559, + "learning_rate": 0.001, + "loss": 1.8261, + "step": 89900 + }, + { + "epoch": 29.088558500323206, + "grad_norm": 1.518397569656372, + "learning_rate": 0.001, + "loss": 1.8219, + "step": 90000 + }, + { + "epoch": 29.12087912087912, + "grad_norm": 1.3293129205703735, + "learning_rate": 0.001, + "loss": 1.8459, + "step": 90100 + }, + { + "epoch": 29.153199741435035, + "grad_norm": 1.8555089235305786, + "learning_rate": 0.001, + "loss": 1.8459, + "step": 90200 + }, + { + "epoch": 29.18552036199095, + "grad_norm": 1.4512817859649658, + "learning_rate": 0.001, + "loss": 1.8584, + "step": 90300 + }, + { + "epoch": 29.217840982546864, + "grad_norm": 1.735163688659668, + "learning_rate": 0.001, + "loss": 1.8434, + "step": 90400 + }, + { + "epoch": 29.25016160310278, + "grad_norm": 1.3833225965499878, + "learning_rate": 0.001, + "loss": 1.884, + "step": 90500 + }, + { + "epoch": 29.282482223658693, + "grad_norm": 1.7664813995361328, + "learning_rate": 0.001, + "loss": 1.88, + "step": 90600 + }, + { + "epoch": 29.314802844214608, + "grad_norm": 1.440193772315979, + "learning_rate": 0.001, + "loss": 1.8567, + "step": 90700 + }, + { + "epoch": 29.347123464770522, + "grad_norm": 1.239136815071106, + "learning_rate": 0.001, + "loss": 1.8746, + "step": 90800 + }, + { + "epoch": 29.379444085326437, + "grad_norm": 1.3825310468673706, + "learning_rate": 0.001, + "loss": 1.8684, + "step": 90900 + }, + { + "epoch": 29.41176470588235, + "grad_norm": 1.4752728939056396, + "learning_rate": 0.001, + "loss": 1.8587, + "step": 91000 + }, + { + "epoch": 29.444085326438266, + "grad_norm": 1.495429515838623, + "learning_rate": 0.001, + "loss": 1.8849, + "step": 91100 + }, + { + "epoch": 29.47640594699418, + "grad_norm": 1.4592987298965454, + "learning_rate": 0.001, + "loss": 1.8892, + "step": 91200 + }, + { + "epoch": 29.5087265675501, + "grad_norm": 1.321603536605835, + "learning_rate": 0.001, + "loss": 1.9044, + "step": 91300 + }, + { + "epoch": 29.541047188106013, + "grad_norm": 1.2586690187454224, + "learning_rate": 0.001, + "loss": 1.9141, + "step": 91400 + }, + { + "epoch": 29.573367808661928, + "grad_norm": 1.3730324506759644, + "learning_rate": 0.001, + "loss": 1.8934, + "step": 91500 + }, + { + "epoch": 29.605688429217842, + "grad_norm": 1.6763105392456055, + "learning_rate": 0.001, + "loss": 1.905, + "step": 91600 + }, + { + "epoch": 29.638009049773757, + "grad_norm": 1.6392866373062134, + "learning_rate": 0.001, + "loss": 1.9155, + "step": 91700 + }, + { + "epoch": 29.67032967032967, + "grad_norm": 1.2820043563842773, + "learning_rate": 0.001, + "loss": 1.9181, + "step": 91800 + }, + { + "epoch": 29.702650290885586, + "grad_norm": 1.6354836225509644, + "learning_rate": 0.001, + "loss": 1.9207, + "step": 91900 + }, + { + "epoch": 29.7349709114415, + "grad_norm": 1.3978163003921509, + "learning_rate": 0.001, + "loss": 1.9039, + "step": 92000 + }, + { + "epoch": 29.767291531997415, + "grad_norm": 1.3554919958114624, + "learning_rate": 0.001, + "loss": 1.9283, + "step": 92100 + }, + { + "epoch": 29.79961215255333, + "grad_norm": 1.4941645860671997, + "learning_rate": 0.001, + "loss": 1.9183, + "step": 92200 + }, + { + "epoch": 29.831932773109244, + "grad_norm": 1.5266228914260864, + "learning_rate": 0.001, + "loss": 1.9186, + "step": 92300 + }, + { + "epoch": 29.86425339366516, + "grad_norm": 1.4845457077026367, + "learning_rate": 0.001, + "loss": 1.9399, + "step": 92400 + }, + { + "epoch": 29.896574014221073, + "grad_norm": 1.1934114694595337, + "learning_rate": 0.001, + "loss": 1.9314, + "step": 92500 + }, + { + "epoch": 29.928894634776988, + "grad_norm": 1.3535517454147339, + "learning_rate": 0.001, + "loss": 1.9399, + "step": 92600 + }, + { + "epoch": 29.961215255332903, + "grad_norm": 1.230025291442871, + "learning_rate": 0.001, + "loss": 1.9475, + "step": 92700 + }, + { + "epoch": 29.993535875888817, + "grad_norm": 1.4865684509277344, + "learning_rate": 0.001, + "loss": 1.9565, + "step": 92800 + }, + { + "epoch": 30.02585649644473, + "grad_norm": 1.4620200395584106, + "learning_rate": 0.001, + "loss": 1.828, + "step": 92900 + }, + { + "epoch": 30.058177117000646, + "grad_norm": 3.006803512573242, + "learning_rate": 0.001, + "loss": 1.7962, + "step": 93000 + }, + { + "epoch": 30.09049773755656, + "grad_norm": 1.6281250715255737, + "learning_rate": 0.001, + "loss": 1.8051, + "step": 93100 + }, + { + "epoch": 30.122818358112475, + "grad_norm": 1.3403794765472412, + "learning_rate": 0.001, + "loss": 1.8096, + "step": 93200 + }, + { + "epoch": 30.15513897866839, + "grad_norm": 1.3069578409194946, + "learning_rate": 0.001, + "loss": 1.8239, + "step": 93300 + }, + { + "epoch": 30.187459599224304, + "grad_norm": 1.467483401298523, + "learning_rate": 0.001, + "loss": 1.8257, + "step": 93400 + }, + { + "epoch": 30.21978021978022, + "grad_norm": 3.415764570236206, + "learning_rate": 0.001, + "loss": 1.8274, + "step": 93500 + }, + { + "epoch": 30.252100840336134, + "grad_norm": 2.0394747257232666, + "learning_rate": 0.001, + "loss": 1.8256, + "step": 93600 + }, + { + "epoch": 30.284421460892048, + "grad_norm": 1.498351812362671, + "learning_rate": 0.001, + "loss": 1.855, + "step": 93700 + }, + { + "epoch": 30.316742081447963, + "grad_norm": 1.360203742980957, + "learning_rate": 0.001, + "loss": 1.8346, + "step": 93800 + }, + { + "epoch": 30.349062702003877, + "grad_norm": 1.4011281728744507, + "learning_rate": 0.001, + "loss": 1.8497, + "step": 93900 + }, + { + "epoch": 30.381383322559792, + "grad_norm": 1.6812119483947754, + "learning_rate": 0.001, + "loss": 1.8551, + "step": 94000 + }, + { + "epoch": 30.413703943115706, + "grad_norm": 1.4505479335784912, + "learning_rate": 0.001, + "loss": 1.8439, + "step": 94100 + }, + { + "epoch": 30.44602456367162, + "grad_norm": 1.6102886199951172, + "learning_rate": 0.001, + "loss": 1.8579, + "step": 94200 + }, + { + "epoch": 30.478345184227535, + "grad_norm": 1.5858819484710693, + "learning_rate": 0.001, + "loss": 1.8701, + "step": 94300 + }, + { + "epoch": 30.51066580478345, + "grad_norm": 2.008108139038086, + "learning_rate": 0.001, + "loss": 1.8719, + "step": 94400 + }, + { + "epoch": 30.542986425339368, + "grad_norm": 1.247879147529602, + "learning_rate": 0.001, + "loss": 1.8679, + "step": 94500 + }, + { + "epoch": 30.575307045895283, + "grad_norm": 1.4479765892028809, + "learning_rate": 0.001, + "loss": 1.8821, + "step": 94600 + }, + { + "epoch": 30.607627666451197, + "grad_norm": 1.325579047203064, + "learning_rate": 0.001, + "loss": 1.8679, + "step": 94700 + }, + { + "epoch": 30.639948287007112, + "grad_norm": 1.6809526681900024, + "learning_rate": 0.001, + "loss": 1.8784, + "step": 94800 + }, + { + "epoch": 30.672268907563026, + "grad_norm": 1.5918498039245605, + "learning_rate": 0.001, + "loss": 1.8975, + "step": 94900 + }, + { + "epoch": 30.70458952811894, + "grad_norm": 1.6501222848892212, + "learning_rate": 0.001, + "loss": 1.8868, + "step": 95000 + }, + { + "epoch": 30.736910148674855, + "grad_norm": 2.2188880443573, + "learning_rate": 0.001, + "loss": 1.8769, + "step": 95100 + }, + { + "epoch": 30.76923076923077, + "grad_norm": 1.1673107147216797, + "learning_rate": 0.001, + "loss": 1.9076, + "step": 95200 + }, + { + "epoch": 30.801551389786685, + "grad_norm": 1.7042407989501953, + "learning_rate": 0.001, + "loss": 1.8965, + "step": 95300 + }, + { + "epoch": 30.8338720103426, + "grad_norm": 1.224590539932251, + "learning_rate": 0.001, + "loss": 1.8954, + "step": 95400 + }, + { + "epoch": 30.866192630898514, + "grad_norm": 1.4763602018356323, + "learning_rate": 0.001, + "loss": 1.9105, + "step": 95500 + }, + { + "epoch": 30.89851325145443, + "grad_norm": 1.6836724281311035, + "learning_rate": 0.001, + "loss": 1.8827, + "step": 95600 + }, + { + "epoch": 30.930833872010343, + "grad_norm": 1.3475334644317627, + "learning_rate": 0.001, + "loss": 1.9173, + "step": 95700 + }, + { + "epoch": 30.963154492566257, + "grad_norm": 1.347589373588562, + "learning_rate": 0.001, + "loss": 1.889, + "step": 95800 + }, + { + "epoch": 30.995475113122172, + "grad_norm": 1.473758339881897, + "learning_rate": 0.001, + "loss": 1.8965, + "step": 95900 + }, + { + "epoch": 31.027795733678087, + "grad_norm": 1.728955864906311, + "learning_rate": 0.001, + "loss": 1.8083, + "step": 96000 + }, + { + "epoch": 31.060116354234, + "grad_norm": 1.5232839584350586, + "learning_rate": 0.001, + "loss": 1.7555, + "step": 96100 + }, + { + "epoch": 31.092436974789916, + "grad_norm": 1.8657755851745605, + "learning_rate": 0.001, + "loss": 1.7685, + "step": 96200 + }, + { + "epoch": 31.12475759534583, + "grad_norm": 1.4750165939331055, + "learning_rate": 0.001, + "loss": 1.7735, + "step": 96300 + }, + { + "epoch": 31.157078215901745, + "grad_norm": 1.5432738065719604, + "learning_rate": 0.001, + "loss": 1.7882, + "step": 96400 + }, + { + "epoch": 31.18939883645766, + "grad_norm": 1.422799825668335, + "learning_rate": 0.001, + "loss": 1.7952, + "step": 96500 + }, + { + "epoch": 31.221719457013574, + "grad_norm": 1.6849409341812134, + "learning_rate": 0.001, + "loss": 1.784, + "step": 96600 + }, + { + "epoch": 31.25404007756949, + "grad_norm": 1.4621268510818481, + "learning_rate": 0.001, + "loss": 1.788, + "step": 96700 + }, + { + "epoch": 31.286360698125403, + "grad_norm": 1.3027772903442383, + "learning_rate": 0.001, + "loss": 1.8025, + "step": 96800 + }, + { + "epoch": 31.318681318681318, + "grad_norm": 1.5838264226913452, + "learning_rate": 0.001, + "loss": 1.8274, + "step": 96900 + }, + { + "epoch": 31.351001939237232, + "grad_norm": 1.368515968322754, + "learning_rate": 0.001, + "loss": 1.8323, + "step": 97000 + }, + { + "epoch": 31.383322559793147, + "grad_norm": 1.6732114553451538, + "learning_rate": 0.001, + "loss": 1.8276, + "step": 97100 + }, + { + "epoch": 31.41564318034906, + "grad_norm": 1.4694806337356567, + "learning_rate": 0.001, + "loss": 1.8157, + "step": 97200 + }, + { + "epoch": 31.447963800904976, + "grad_norm": 2.3622124195098877, + "learning_rate": 0.001, + "loss": 1.8327, + "step": 97300 + }, + { + "epoch": 31.48028442146089, + "grad_norm": 1.6618635654449463, + "learning_rate": 0.001, + "loss": 1.8298, + "step": 97400 + }, + { + "epoch": 31.51260504201681, + "grad_norm": 1.743264079093933, + "learning_rate": 0.001, + "loss": 1.8448, + "step": 97500 + }, + { + "epoch": 31.544925662572723, + "grad_norm": 1.2837010622024536, + "learning_rate": 0.001, + "loss": 1.8467, + "step": 97600 + }, + { + "epoch": 31.577246283128638, + "grad_norm": 1.7191213369369507, + "learning_rate": 0.001, + "loss": 1.8323, + "step": 97700 + }, + { + "epoch": 31.609566903684552, + "grad_norm": 2.304013729095459, + "learning_rate": 0.001, + "loss": 1.8629, + "step": 97800 + }, + { + "epoch": 31.641887524240467, + "grad_norm": 1.6232678890228271, + "learning_rate": 0.001, + "loss": 1.8508, + "step": 97900 + }, + { + "epoch": 31.67420814479638, + "grad_norm": 2.126199722290039, + "learning_rate": 0.001, + "loss": 1.858, + "step": 98000 + }, + { + "epoch": 31.706528765352296, + "grad_norm": 1.7926188707351685, + "learning_rate": 0.001, + "loss": 1.8507, + "step": 98100 + }, + { + "epoch": 31.73884938590821, + "grad_norm": 1.4954445362091064, + "learning_rate": 0.001, + "loss": 1.8566, + "step": 98200 + }, + { + "epoch": 31.771170006464125, + "grad_norm": 1.5035561323165894, + "learning_rate": 0.001, + "loss": 1.8696, + "step": 98300 + }, + { + "epoch": 31.80349062702004, + "grad_norm": 1.323290467262268, + "learning_rate": 0.001, + "loss": 1.8866, + "step": 98400 + }, + { + "epoch": 31.835811247575954, + "grad_norm": 1.6078685522079468, + "learning_rate": 0.001, + "loss": 1.872, + "step": 98500 + }, + { + "epoch": 31.86813186813187, + "grad_norm": 1.5674740076065063, + "learning_rate": 0.001, + "loss": 1.8907, + "step": 98600 + }, + { + "epoch": 31.900452488687783, + "grad_norm": 1.6643004417419434, + "learning_rate": 0.001, + "loss": 1.8766, + "step": 98700 + }, + { + "epoch": 31.932773109243698, + "grad_norm": 1.5275764465332031, + "learning_rate": 0.001, + "loss": 1.8751, + "step": 98800 + }, + { + "epoch": 31.965093729799612, + "grad_norm": 1.471692681312561, + "learning_rate": 0.001, + "loss": 1.8686, + "step": 98900 + }, + { + "epoch": 31.997414350355527, + "grad_norm": 1.378650188446045, + "learning_rate": 0.001, + "loss": 1.8815, + "step": 99000 + }, + { + "epoch": 32.02973497091144, + "grad_norm": 1.5829144716262817, + "learning_rate": 0.001, + "loss": 1.7734, + "step": 99100 + }, + { + "epoch": 32.062055591467356, + "grad_norm": 1.8891956806182861, + "learning_rate": 0.001, + "loss": 1.7384, + "step": 99200 + }, + { + "epoch": 32.09437621202327, + "grad_norm": 1.706789493560791, + "learning_rate": 0.001, + "loss": 1.7708, + "step": 99300 + }, + { + "epoch": 32.126696832579185, + "grad_norm": 1.6363348960876465, + "learning_rate": 0.001, + "loss": 1.751, + "step": 99400 + }, + { + "epoch": 32.1590174531351, + "grad_norm": 1.6457102298736572, + "learning_rate": 0.001, + "loss": 1.765, + "step": 99500 + }, + { + "epoch": 32.191338073691014, + "grad_norm": 1.5266406536102295, + "learning_rate": 0.001, + "loss": 1.7812, + "step": 99600 + }, + { + "epoch": 32.22365869424693, + "grad_norm": 1.9714754819869995, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 99700 + }, + { + "epoch": 32.25597931480284, + "grad_norm": 1.6170477867126465, + "learning_rate": 0.001, + "loss": 1.7729, + "step": 99800 + }, + { + "epoch": 32.28829993535876, + "grad_norm": 1.3753221035003662, + "learning_rate": 0.001, + "loss": 1.7805, + "step": 99900 + }, + { + "epoch": 32.32062055591467, + "grad_norm": 2.1753334999084473, + "learning_rate": 0.001, + "loss": 1.7737, + "step": 100000 + }, + { + "epoch": 32.35294117647059, + "grad_norm": 1.8860663175582886, + "learning_rate": 0.001, + "loss": 1.7906, + "step": 100100 + }, + { + "epoch": 32.3852617970265, + "grad_norm": 2.0654399394989014, + "learning_rate": 0.001, + "loss": 1.7885, + "step": 100200 + }, + { + "epoch": 32.417582417582416, + "grad_norm": 1.7325553894042969, + "learning_rate": 0.001, + "loss": 1.8063, + "step": 100300 + }, + { + "epoch": 32.44990303813833, + "grad_norm": 1.588712215423584, + "learning_rate": 0.001, + "loss": 1.8004, + "step": 100400 + }, + { + "epoch": 32.482223658694245, + "grad_norm": 1.6810243129730225, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 100500 + }, + { + "epoch": 32.51454427925016, + "grad_norm": 1.6769487857818604, + "learning_rate": 0.001, + "loss": 1.8212, + "step": 100600 + }, + { + "epoch": 32.546864899806074, + "grad_norm": 1.9445384740829468, + "learning_rate": 0.001, + "loss": 1.7917, + "step": 100700 + }, + { + "epoch": 32.57918552036199, + "grad_norm": 1.6605241298675537, + "learning_rate": 0.001, + "loss": 1.8362, + "step": 100800 + }, + { + "epoch": 32.6115061409179, + "grad_norm": 2.058520793914795, + "learning_rate": 0.001, + "loss": 1.8341, + "step": 100900 + }, + { + "epoch": 32.64382676147382, + "grad_norm": 1.3105531930923462, + "learning_rate": 0.001, + "loss": 1.8431, + "step": 101000 + }, + { + "epoch": 32.67614738202973, + "grad_norm": 1.626055359840393, + "learning_rate": 0.001, + "loss": 1.824, + "step": 101100 + }, + { + "epoch": 32.70846800258565, + "grad_norm": 1.5629281997680664, + "learning_rate": 0.001, + "loss": 1.8388, + "step": 101200 + }, + { + "epoch": 32.74078862314156, + "grad_norm": 1.8073807954788208, + "learning_rate": 0.001, + "loss": 1.8373, + "step": 101300 + }, + { + "epoch": 32.773109243697476, + "grad_norm": 2.0889344215393066, + "learning_rate": 0.001, + "loss": 1.8304, + "step": 101400 + }, + { + "epoch": 32.80542986425339, + "grad_norm": 1.771985650062561, + "learning_rate": 0.001, + "loss": 1.8436, + "step": 101500 + }, + { + "epoch": 32.837750484809305, + "grad_norm": 1.631714940071106, + "learning_rate": 0.001, + "loss": 1.8502, + "step": 101600 + }, + { + "epoch": 32.87007110536522, + "grad_norm": 2.003098487854004, + "learning_rate": 0.001, + "loss": 1.8608, + "step": 101700 + }, + { + "epoch": 32.902391725921134, + "grad_norm": 1.7163927555084229, + "learning_rate": 0.001, + "loss": 1.8409, + "step": 101800 + }, + { + "epoch": 32.93471234647705, + "grad_norm": 1.5179773569107056, + "learning_rate": 0.001, + "loss": 1.836, + "step": 101900 + }, + { + "epoch": 32.967032967032964, + "grad_norm": 1.826611042022705, + "learning_rate": 0.001, + "loss": 1.8544, + "step": 102000 + }, + { + "epoch": 32.999353587588885, + "grad_norm": 2.0512094497680664, + "learning_rate": 0.001, + "loss": 1.8587, + "step": 102100 + }, + { + "epoch": 33.0316742081448, + "grad_norm": 2.0743322372436523, + "learning_rate": 0.001, + "loss": 1.6994, + "step": 102200 + }, + { + "epoch": 33.063994828700714, + "grad_norm": 2.1216299533843994, + "learning_rate": 0.001, + "loss": 1.7167, + "step": 102300 + }, + { + "epoch": 33.09631544925663, + "grad_norm": 2.0061864852905273, + "learning_rate": 0.001, + "loss": 1.7269, + "step": 102400 + }, + { + "epoch": 33.12863606981254, + "grad_norm": 1.8516168594360352, + "learning_rate": 0.001, + "loss": 1.7471, + "step": 102500 + }, + { + "epoch": 33.16095669036846, + "grad_norm": 2.0900888442993164, + "learning_rate": 0.001, + "loss": 1.7444, + "step": 102600 + }, + { + "epoch": 33.19327731092437, + "grad_norm": 1.9788792133331299, + "learning_rate": 0.001, + "loss": 1.7557, + "step": 102700 + }, + { + "epoch": 33.22559793148029, + "grad_norm": 2.034575939178467, + "learning_rate": 0.001, + "loss": 1.7563, + "step": 102800 + }, + { + "epoch": 33.2579185520362, + "grad_norm": 2.135305881500244, + "learning_rate": 0.001, + "loss": 1.7338, + "step": 102900 + }, + { + "epoch": 33.290239172592116, + "grad_norm": 1.8343353271484375, + "learning_rate": 0.001, + "loss": 1.7662, + "step": 103000 + }, + { + "epoch": 33.32255979314803, + "grad_norm": 2.277712106704712, + "learning_rate": 0.001, + "loss": 1.7439, + "step": 103100 + }, + { + "epoch": 33.354880413703945, + "grad_norm": 1.983909010887146, + "learning_rate": 0.001, + "loss": 1.7626, + "step": 103200 + }, + { + "epoch": 33.38720103425986, + "grad_norm": 1.6377862691879272, + "learning_rate": 0.001, + "loss": 1.7789, + "step": 103300 + }, + { + "epoch": 33.419521654815775, + "grad_norm": 1.8458659648895264, + "learning_rate": 0.001, + "loss": 1.7728, + "step": 103400 + }, + { + "epoch": 33.45184227537169, + "grad_norm": 1.9960947036743164, + "learning_rate": 0.001, + "loss": 1.7659, + "step": 103500 + }, + { + "epoch": 33.484162895927604, + "grad_norm": 2.0840041637420654, + "learning_rate": 0.001, + "loss": 1.763, + "step": 103600 + }, + { + "epoch": 33.51648351648352, + "grad_norm": 2.215972661972046, + "learning_rate": 0.001, + "loss": 1.7933, + "step": 103700 + }, + { + "epoch": 33.54880413703943, + "grad_norm": 1.8777416944503784, + "learning_rate": 0.001, + "loss": 1.7918, + "step": 103800 + }, + { + "epoch": 33.58112475759535, + "grad_norm": 1.7286779880523682, + "learning_rate": 0.001, + "loss": 1.792, + "step": 103900 + }, + { + "epoch": 33.61344537815126, + "grad_norm": 2.0685818195343018, + "learning_rate": 0.001, + "loss": 1.7941, + "step": 104000 + }, + { + "epoch": 33.645765998707176, + "grad_norm": 1.928328037261963, + "learning_rate": 0.001, + "loss": 1.8152, + "step": 104100 + }, + { + "epoch": 33.67808661926309, + "grad_norm": 1.7642704248428345, + "learning_rate": 0.001, + "loss": 1.8128, + "step": 104200 + }, + { + "epoch": 33.710407239819006, + "grad_norm": 2.2432053089141846, + "learning_rate": 0.001, + "loss": 1.8034, + "step": 104300 + }, + { + "epoch": 33.74272786037492, + "grad_norm": 2.012679100036621, + "learning_rate": 0.001, + "loss": 1.8132, + "step": 104400 + }, + { + "epoch": 33.775048480930835, + "grad_norm": 2.067655086517334, + "learning_rate": 0.001, + "loss": 1.8267, + "step": 104500 + }, + { + "epoch": 33.80736910148675, + "grad_norm": 1.9166897535324097, + "learning_rate": 0.001, + "loss": 1.8191, + "step": 104600 + }, + { + "epoch": 33.839689722042664, + "grad_norm": 1.8261538743972778, + "learning_rate": 0.001, + "loss": 1.8396, + "step": 104700 + }, + { + "epoch": 33.87201034259858, + "grad_norm": 1.6121453046798706, + "learning_rate": 0.001, + "loss": 1.8192, + "step": 104800 + }, + { + "epoch": 33.90433096315449, + "grad_norm": 1.7790288925170898, + "learning_rate": 0.001, + "loss": 1.8441, + "step": 104900 + }, + { + "epoch": 33.93665158371041, + "grad_norm": 2.033315658569336, + "learning_rate": 0.001, + "loss": 1.8407, + "step": 105000 + }, + { + "epoch": 33.96897220426632, + "grad_norm": 1.9757274389266968, + "learning_rate": 0.001, + "loss": 1.8312, + "step": 105100 + }, + { + "epoch": 34.00129282482224, + "grad_norm": 1.8282935619354248, + "learning_rate": 0.001, + "loss": 1.824, + "step": 105200 + }, + { + "epoch": 34.03361344537815, + "grad_norm": 1.3721950054168701, + "learning_rate": 0.001, + "loss": 1.6864, + "step": 105300 + }, + { + "epoch": 34.065934065934066, + "grad_norm": 2.061631679534912, + "learning_rate": 0.001, + "loss": 1.6903, + "step": 105400 + }, + { + "epoch": 34.09825468648998, + "grad_norm": 2.0354113578796387, + "learning_rate": 0.001, + "loss": 1.708, + "step": 105500 + }, + { + "epoch": 34.130575307045895, + "grad_norm": 1.6836439371109009, + "learning_rate": 0.001, + "loss": 1.7179, + "step": 105600 + }, + { + "epoch": 34.16289592760181, + "grad_norm": 2.024641513824463, + "learning_rate": 0.001, + "loss": 1.7056, + "step": 105700 + }, + { + "epoch": 34.195216548157724, + "grad_norm": 1.5095860958099365, + "learning_rate": 0.001, + "loss": 1.705, + "step": 105800 + }, + { + "epoch": 34.22753716871364, + "grad_norm": 2.405456781387329, + "learning_rate": 0.001, + "loss": 1.7241, + "step": 105900 + }, + { + "epoch": 34.25985778926955, + "grad_norm": 1.871866226196289, + "learning_rate": 0.001, + "loss": 1.7392, + "step": 106000 + }, + { + "epoch": 34.29217840982547, + "grad_norm": 2.0286736488342285, + "learning_rate": 0.001, + "loss": 1.7312, + "step": 106100 + }, + { + "epoch": 34.32449903038138, + "grad_norm": 1.6875593662261963, + "learning_rate": 0.001, + "loss": 1.747, + "step": 106200 + }, + { + "epoch": 34.3568196509373, + "grad_norm": 1.6652581691741943, + "learning_rate": 0.001, + "loss": 1.7525, + "step": 106300 + }, + { + "epoch": 34.38914027149321, + "grad_norm": 1.8134232759475708, + "learning_rate": 0.001, + "loss": 1.7527, + "step": 106400 + }, + { + "epoch": 34.421460892049126, + "grad_norm": 1.7698140144348145, + "learning_rate": 0.001, + "loss": 1.762, + "step": 106500 + }, + { + "epoch": 34.45378151260504, + "grad_norm": 1.5082416534423828, + "learning_rate": 0.001, + "loss": 1.7687, + "step": 106600 + }, + { + "epoch": 34.486102133160955, + "grad_norm": 1.5613726377487183, + "learning_rate": 0.001, + "loss": 1.7655, + "step": 106700 + }, + { + "epoch": 34.51842275371687, + "grad_norm": 1.7272530794143677, + "learning_rate": 0.001, + "loss": 1.7483, + "step": 106800 + }, + { + "epoch": 34.550743374272784, + "grad_norm": 1.644972801208496, + "learning_rate": 0.001, + "loss": 1.7832, + "step": 106900 + }, + { + "epoch": 34.5830639948287, + "grad_norm": 1.644237756729126, + "learning_rate": 0.001, + "loss": 1.7609, + "step": 107000 + }, + { + "epoch": 34.61538461538461, + "grad_norm": 1.6201183795928955, + "learning_rate": 0.001, + "loss": 1.772, + "step": 107100 + }, + { + "epoch": 34.64770523594053, + "grad_norm": 1.9227070808410645, + "learning_rate": 0.001, + "loss": 1.7954, + "step": 107200 + }, + { + "epoch": 34.68002585649644, + "grad_norm": 1.4974156618118286, + "learning_rate": 0.001, + "loss": 1.7881, + "step": 107300 + }, + { + "epoch": 34.71234647705236, + "grad_norm": 1.9709665775299072, + "learning_rate": 0.001, + "loss": 1.7943, + "step": 107400 + }, + { + "epoch": 34.74466709760827, + "grad_norm": 1.6651779413223267, + "learning_rate": 0.001, + "loss": 1.803, + "step": 107500 + }, + { + "epoch": 34.776987718164186, + "grad_norm": 1.9187260866165161, + "learning_rate": 0.001, + "loss": 1.7989, + "step": 107600 + }, + { + "epoch": 34.8093083387201, + "grad_norm": 1.8573428392410278, + "learning_rate": 0.001, + "loss": 1.8145, + "step": 107700 + }, + { + "epoch": 34.841628959276015, + "grad_norm": 1.4682703018188477, + "learning_rate": 0.001, + "loss": 1.8096, + "step": 107800 + }, + { + "epoch": 34.87394957983193, + "grad_norm": 2.2076940536499023, + "learning_rate": 0.001, + "loss": 1.8052, + "step": 107900 + }, + { + "epoch": 34.906270200387844, + "grad_norm": 1.7180118560791016, + "learning_rate": 0.001, + "loss": 1.8129, + "step": 108000 + }, + { + "epoch": 34.93859082094376, + "grad_norm": 1.665969729423523, + "learning_rate": 0.001, + "loss": 1.8067, + "step": 108100 + }, + { + "epoch": 34.97091144149967, + "grad_norm": 5.9310622215271, + "learning_rate": 0.001, + "loss": 1.8212, + "step": 108200 + }, + { + "epoch": 35.003232062055595, + "grad_norm": 1.5211377143859863, + "learning_rate": 0.001, + "loss": 1.8044, + "step": 108300 + }, + { + "epoch": 35.03555268261151, + "grad_norm": 1.2844172716140747, + "learning_rate": 0.001, + "loss": 1.6721, + "step": 108400 + }, + { + "epoch": 35.067873303167424, + "grad_norm": 1.5357627868652344, + "learning_rate": 0.001, + "loss": 1.6904, + "step": 108500 + }, + { + "epoch": 35.10019392372334, + "grad_norm": 1.5204764604568481, + "learning_rate": 0.001, + "loss": 1.6687, + "step": 108600 + }, + { + "epoch": 35.13251454427925, + "grad_norm": 2.441347599029541, + "learning_rate": 0.001, + "loss": 1.675, + "step": 108700 + }, + { + "epoch": 35.16483516483517, + "grad_norm": 1.9407317638397217, + "learning_rate": 0.001, + "loss": 1.7092, + "step": 108800 + }, + { + "epoch": 35.19715578539108, + "grad_norm": 1.4935519695281982, + "learning_rate": 0.001, + "loss": 1.7088, + "step": 108900 + }, + { + "epoch": 35.229476405947, + "grad_norm": 1.2903261184692383, + "learning_rate": 0.001, + "loss": 1.6767, + "step": 109000 + }, + { + "epoch": 35.26179702650291, + "grad_norm": 2.764295816421509, + "learning_rate": 0.001, + "loss": 1.7007, + "step": 109100 + }, + { + "epoch": 35.294117647058826, + "grad_norm": 1.714455008506775, + "learning_rate": 0.001, + "loss": 1.7017, + "step": 109200 + }, + { + "epoch": 35.32643826761474, + "grad_norm": 1.885419487953186, + "learning_rate": 0.001, + "loss": 1.7402, + "step": 109300 + }, + { + "epoch": 35.358758888170655, + "grad_norm": 1.5595377683639526, + "learning_rate": 0.001, + "loss": 1.7365, + "step": 109400 + }, + { + "epoch": 35.39107950872657, + "grad_norm": 1.3263633251190186, + "learning_rate": 0.001, + "loss": 1.7513, + "step": 109500 + }, + { + "epoch": 35.423400129282484, + "grad_norm": 1.623655080795288, + "learning_rate": 0.001, + "loss": 1.7408, + "step": 109600 + }, + { + "epoch": 35.4557207498384, + "grad_norm": 1.5252445936203003, + "learning_rate": 0.001, + "loss": 1.7426, + "step": 109700 + }, + { + "epoch": 35.48804137039431, + "grad_norm": 1.5424871444702148, + "learning_rate": 0.001, + "loss": 1.7424, + "step": 109800 + }, + { + "epoch": 35.52036199095023, + "grad_norm": 1.604275107383728, + "learning_rate": 0.001, + "loss": 1.7404, + "step": 109900 + }, + { + "epoch": 35.55268261150614, + "grad_norm": 1.3040121793746948, + "learning_rate": 0.001, + "loss": 1.7377, + "step": 110000 + }, + { + "epoch": 35.58500323206206, + "grad_norm": 1.4881088733673096, + "learning_rate": 0.001, + "loss": 1.7472, + "step": 110100 + }, + { + "epoch": 35.61732385261797, + "grad_norm": 1.2945785522460938, + "learning_rate": 0.001, + "loss": 1.7656, + "step": 110200 + }, + { + "epoch": 35.649644473173886, + "grad_norm": 1.5668212175369263, + "learning_rate": 0.001, + "loss": 1.7522, + "step": 110300 + }, + { + "epoch": 35.6819650937298, + "grad_norm": 1.6249139308929443, + "learning_rate": 0.001, + "loss": 1.7661, + "step": 110400 + }, + { + "epoch": 35.714285714285715, + "grad_norm": 1.5623595714569092, + "learning_rate": 0.001, + "loss": 1.7587, + "step": 110500 + }, + { + "epoch": 35.74660633484163, + "grad_norm": 1.6370724439620972, + "learning_rate": 0.001, + "loss": 1.7873, + "step": 110600 + }, + { + "epoch": 35.778926955397544, + "grad_norm": 1.387434959411621, + "learning_rate": 0.001, + "loss": 1.7579, + "step": 110700 + }, + { + "epoch": 35.81124757595346, + "grad_norm": 1.4653477668762207, + "learning_rate": 0.001, + "loss": 1.7723, + "step": 110800 + }, + { + "epoch": 35.84356819650937, + "grad_norm": 1.6415297985076904, + "learning_rate": 0.001, + "loss": 1.7719, + "step": 110900 + }, + { + "epoch": 35.87588881706529, + "grad_norm": 1.447180151939392, + "learning_rate": 0.001, + "loss": 1.7818, + "step": 111000 + }, + { + "epoch": 35.9082094376212, + "grad_norm": 1.4582403898239136, + "learning_rate": 0.001, + "loss": 1.7835, + "step": 111100 + }, + { + "epoch": 35.94053005817712, + "grad_norm": 1.6619702577590942, + "learning_rate": 0.001, + "loss": 1.7904, + "step": 111200 + }, + { + "epoch": 35.97285067873303, + "grad_norm": 1.6435052156448364, + "learning_rate": 0.001, + "loss": 1.7815, + "step": 111300 + }, + { + "epoch": 36.005171299288946, + "grad_norm": 1.2920403480529785, + "learning_rate": 0.001, + "loss": 1.7803, + "step": 111400 + }, + { + "epoch": 36.03749191984486, + "grad_norm": 1.1500316858291626, + "learning_rate": 0.001, + "loss": 1.6365, + "step": 111500 + }, + { + "epoch": 36.069812540400775, + "grad_norm": 1.4132637977600098, + "learning_rate": 0.001, + "loss": 1.6589, + "step": 111600 + }, + { + "epoch": 36.10213316095669, + "grad_norm": 1.4161231517791748, + "learning_rate": 0.001, + "loss": 1.6784, + "step": 111700 + }, + { + "epoch": 36.134453781512605, + "grad_norm": 1.4937621355056763, + "learning_rate": 0.001, + "loss": 1.6499, + "step": 111800 + }, + { + "epoch": 36.16677440206852, + "grad_norm": 1.2990803718566895, + "learning_rate": 0.001, + "loss": 1.6766, + "step": 111900 + }, + { + "epoch": 36.199095022624434, + "grad_norm": 1.636744737625122, + "learning_rate": 0.001, + "loss": 1.6621, + "step": 112000 + }, + { + "epoch": 36.23141564318035, + "grad_norm": 1.783856987953186, + "learning_rate": 0.001, + "loss": 1.6684, + "step": 112100 + }, + { + "epoch": 36.26373626373626, + "grad_norm": 1.3172340393066406, + "learning_rate": 0.001, + "loss": 1.6902, + "step": 112200 + }, + { + "epoch": 36.29605688429218, + "grad_norm": 1.2710336446762085, + "learning_rate": 0.001, + "loss": 1.6858, + "step": 112300 + }, + { + "epoch": 36.32837750484809, + "grad_norm": 2.070700168609619, + "learning_rate": 0.001, + "loss": 1.6842, + "step": 112400 + }, + { + "epoch": 36.36069812540401, + "grad_norm": 1.738664984703064, + "learning_rate": 0.001, + "loss": 1.7021, + "step": 112500 + }, + { + "epoch": 36.39301874595992, + "grad_norm": 1.4381351470947266, + "learning_rate": 0.001, + "loss": 1.7221, + "step": 112600 + }, + { + "epoch": 36.425339366515836, + "grad_norm": 1.3279963731765747, + "learning_rate": 0.001, + "loss": 1.7187, + "step": 112700 + }, + { + "epoch": 36.45765998707175, + "grad_norm": 1.7753669023513794, + "learning_rate": 0.001, + "loss": 1.7147, + "step": 112800 + }, + { + "epoch": 36.489980607627665, + "grad_norm": 2.194094657897949, + "learning_rate": 0.001, + "loss": 1.7264, + "step": 112900 + }, + { + "epoch": 36.52230122818358, + "grad_norm": 1.360891342163086, + "learning_rate": 0.001, + "loss": 1.7423, + "step": 113000 + }, + { + "epoch": 36.554621848739494, + "grad_norm": 1.8281059265136719, + "learning_rate": 0.001, + "loss": 1.7389, + "step": 113100 + }, + { + "epoch": 36.58694246929541, + "grad_norm": 7.225008964538574, + "learning_rate": 0.001, + "loss": 1.7317, + "step": 113200 + }, + { + "epoch": 36.61926308985132, + "grad_norm": 1.3751835823059082, + "learning_rate": 0.001, + "loss": 1.7178, + "step": 113300 + }, + { + "epoch": 36.65158371040724, + "grad_norm": 1.3303834199905396, + "learning_rate": 0.001, + "loss": 1.7376, + "step": 113400 + }, + { + "epoch": 36.68390433096315, + "grad_norm": 1.3303159475326538, + "learning_rate": 0.001, + "loss": 1.7494, + "step": 113500 + }, + { + "epoch": 36.71622495151907, + "grad_norm": 1.3926934003829956, + "learning_rate": 0.001, + "loss": 1.7545, + "step": 113600 + }, + { + "epoch": 36.74854557207498, + "grad_norm": 1.3599934577941895, + "learning_rate": 0.001, + "loss": 1.7438, + "step": 113700 + }, + { + "epoch": 36.780866192630896, + "grad_norm": 17.112966537475586, + "learning_rate": 0.001, + "loss": 1.7566, + "step": 113800 + }, + { + "epoch": 36.81318681318681, + "grad_norm": 1.1081054210662842, + "learning_rate": 0.001, + "loss": 1.7545, + "step": 113900 + }, + { + "epoch": 36.845507433742725, + "grad_norm": 1.49489426612854, + "learning_rate": 0.001, + "loss": 1.7553, + "step": 114000 + }, + { + "epoch": 36.87782805429864, + "grad_norm": 1.4801154136657715, + "learning_rate": 0.001, + "loss": 1.739, + "step": 114100 + }, + { + "epoch": 36.910148674854554, + "grad_norm": 1.4519503116607666, + "learning_rate": 0.001, + "loss": 1.7696, + "step": 114200 + }, + { + "epoch": 36.94246929541047, + "grad_norm": 1.424436092376709, + "learning_rate": 0.001, + "loss": 1.7624, + "step": 114300 + }, + { + "epoch": 36.97478991596638, + "grad_norm": 1.5529747009277344, + "learning_rate": 0.001, + "loss": 1.7481, + "step": 114400 + }, + { + "epoch": 37.007110536522305, + "grad_norm": 1.3554753065109253, + "learning_rate": 0.001, + "loss": 1.7531, + "step": 114500 + }, + { + "epoch": 37.03943115707822, + "grad_norm": 1.5860496759414673, + "learning_rate": 0.001, + "loss": 1.6318, + "step": 114600 + }, + { + "epoch": 37.071751777634134, + "grad_norm": 2.0900566577911377, + "learning_rate": 0.001, + "loss": 1.623, + "step": 114700 + }, + { + "epoch": 37.10407239819005, + "grad_norm": 1.452841877937317, + "learning_rate": 0.001, + "loss": 1.6301, + "step": 114800 + }, + { + "epoch": 37.13639301874596, + "grad_norm": 2.630765914916992, + "learning_rate": 0.001, + "loss": 1.6386, + "step": 114900 + }, + { + "epoch": 37.16871363930188, + "grad_norm": 1.6086002588272095, + "learning_rate": 0.001, + "loss": 1.6525, + "step": 115000 + }, + { + "epoch": 37.20103425985779, + "grad_norm": 1.5856311321258545, + "learning_rate": 0.001, + "loss": 1.6452, + "step": 115100 + }, + { + "epoch": 37.23335488041371, + "grad_norm": 1.3888436555862427, + "learning_rate": 0.001, + "loss": 1.6605, + "step": 115200 + }, + { + "epoch": 37.26567550096962, + "grad_norm": 1.6696892976760864, + "learning_rate": 0.001, + "loss": 1.6635, + "step": 115300 + }, + { + "epoch": 37.297996121525536, + "grad_norm": 1.3410964012145996, + "learning_rate": 0.001, + "loss": 1.6879, + "step": 115400 + }, + { + "epoch": 37.33031674208145, + "grad_norm": 1.9728986024856567, + "learning_rate": 0.001, + "loss": 1.6818, + "step": 115500 + }, + { + "epoch": 37.362637362637365, + "grad_norm": 1.2899627685546875, + "learning_rate": 0.001, + "loss": 1.6637, + "step": 115600 + }, + { + "epoch": 37.39495798319328, + "grad_norm": 1.4178438186645508, + "learning_rate": 0.001, + "loss": 1.6829, + "step": 115700 + }, + { + "epoch": 37.427278603749194, + "grad_norm": 1.2930041551589966, + "learning_rate": 0.001, + "loss": 1.6989, + "step": 115800 + }, + { + "epoch": 37.45959922430511, + "grad_norm": 1.5607823133468628, + "learning_rate": 0.001, + "loss": 1.6994, + "step": 115900 + }, + { + "epoch": 37.49191984486102, + "grad_norm": 1.3192517757415771, + "learning_rate": 0.001, + "loss": 1.7011, + "step": 116000 + }, + { + "epoch": 37.52424046541694, + "grad_norm": 1.385068655014038, + "learning_rate": 0.001, + "loss": 1.715, + "step": 116100 + }, + { + "epoch": 37.55656108597285, + "grad_norm": 1.4092013835906982, + "learning_rate": 0.001, + "loss": 1.7085, + "step": 116200 + }, + { + "epoch": 37.58888170652877, + "grad_norm": 1.5187952518463135, + "learning_rate": 0.001, + "loss": 1.6988, + "step": 116300 + }, + { + "epoch": 37.62120232708468, + "grad_norm": 1.5603824853897095, + "learning_rate": 0.001, + "loss": 1.6981, + "step": 116400 + }, + { + "epoch": 37.653522947640596, + "grad_norm": 1.3722445964813232, + "learning_rate": 0.001, + "loss": 1.7052, + "step": 116500 + }, + { + "epoch": 37.68584356819651, + "grad_norm": 1.2925854921340942, + "learning_rate": 0.001, + "loss": 1.7185, + "step": 116600 + }, + { + "epoch": 37.718164188752425, + "grad_norm": 1.8050371408462524, + "learning_rate": 0.001, + "loss": 1.7142, + "step": 116700 + }, + { + "epoch": 37.75048480930834, + "grad_norm": 1.6540824174880981, + "learning_rate": 0.001, + "loss": 1.7186, + "step": 116800 + }, + { + "epoch": 37.782805429864254, + "grad_norm": 1.7223883867263794, + "learning_rate": 0.001, + "loss": 1.7336, + "step": 116900 + }, + { + "epoch": 37.81512605042017, + "grad_norm": 1.4943795204162598, + "learning_rate": 0.001, + "loss": 1.7443, + "step": 117000 + }, + { + "epoch": 37.84744667097608, + "grad_norm": 1.3059314489364624, + "learning_rate": 0.001, + "loss": 1.7356, + "step": 117100 + }, + { + "epoch": 37.879767291532, + "grad_norm": 1.4005022048950195, + "learning_rate": 0.001, + "loss": 1.7175, + "step": 117200 + }, + { + "epoch": 37.91208791208791, + "grad_norm": 1.3799285888671875, + "learning_rate": 0.001, + "loss": 1.7465, + "step": 117300 + }, + { + "epoch": 37.94440853264383, + "grad_norm": 1.4844553470611572, + "learning_rate": 0.001, + "loss": 1.7428, + "step": 117400 + }, + { + "epoch": 37.97672915319974, + "grad_norm": 1.2771897315979004, + "learning_rate": 0.001, + "loss": 1.746, + "step": 117500 + }, + { + "epoch": 38.009049773755656, + "grad_norm": 1.4916632175445557, + "learning_rate": 0.001, + "loss": 1.7074, + "step": 117600 + }, + { + "epoch": 38.04137039431157, + "grad_norm": 1.445297360420227, + "learning_rate": 0.001, + "loss": 1.5948, + "step": 117700 + }, + { + "epoch": 38.073691014867485, + "grad_norm": 1.513931155204773, + "learning_rate": 0.001, + "loss": 1.61, + "step": 117800 + }, + { + "epoch": 38.1060116354234, + "grad_norm": 1.5125148296356201, + "learning_rate": 0.001, + "loss": 1.6239, + "step": 117900 + }, + { + "epoch": 38.138332255979314, + "grad_norm": 1.384259819984436, + "learning_rate": 0.001, + "loss": 1.6232, + "step": 118000 + }, + { + "epoch": 38.17065287653523, + "grad_norm": 1.5894758701324463, + "learning_rate": 0.001, + "loss": 1.6157, + "step": 118100 + }, + { + "epoch": 38.20297349709114, + "grad_norm": 1.6156755685806274, + "learning_rate": 0.001, + "loss": 1.638, + "step": 118200 + }, + { + "epoch": 38.23529411764706, + "grad_norm": 1.5429646968841553, + "learning_rate": 0.001, + "loss": 1.6424, + "step": 118300 + }, + { + "epoch": 38.26761473820297, + "grad_norm": 2.129305839538574, + "learning_rate": 0.001, + "loss": 1.6344, + "step": 118400 + }, + { + "epoch": 38.29993535875889, + "grad_norm": 1.6252347230911255, + "learning_rate": 0.001, + "loss": 1.6621, + "step": 118500 + }, + { + "epoch": 38.3322559793148, + "grad_norm": 1.6291354894638062, + "learning_rate": 0.001, + "loss": 1.6543, + "step": 118600 + }, + { + "epoch": 38.364576599870716, + "grad_norm": 1.9039782285690308, + "learning_rate": 0.001, + "loss": 1.6766, + "step": 118700 + }, + { + "epoch": 38.39689722042663, + "grad_norm": 1.4002729654312134, + "learning_rate": 0.001, + "loss": 1.6732, + "step": 118800 + }, + { + "epoch": 38.429217840982545, + "grad_norm": 1.6930391788482666, + "learning_rate": 0.001, + "loss": 1.6552, + "step": 118900 + }, + { + "epoch": 38.46153846153846, + "grad_norm": 1.3052587509155273, + "learning_rate": 0.001, + "loss": 1.6699, + "step": 119000 + }, + { + "epoch": 38.493859082094374, + "grad_norm": 1.9328466653823853, + "learning_rate": 0.001, + "loss": 1.6617, + "step": 119100 + }, + { + "epoch": 38.52617970265029, + "grad_norm": 1.5333701372146606, + "learning_rate": 0.001, + "loss": 1.6919, + "step": 119200 + }, + { + "epoch": 38.558500323206204, + "grad_norm": 1.3492674827575684, + "learning_rate": 0.001, + "loss": 1.6642, + "step": 119300 + }, + { + "epoch": 38.59082094376212, + "grad_norm": 1.2537891864776611, + "learning_rate": 0.001, + "loss": 1.6928, + "step": 119400 + }, + { + "epoch": 38.62314156431803, + "grad_norm": 1.5430333614349365, + "learning_rate": 0.001, + "loss": 1.6925, + "step": 119500 + }, + { + "epoch": 38.65546218487395, + "grad_norm": 1.3039238452911377, + "learning_rate": 0.001, + "loss": 1.6964, + "step": 119600 + }, + { + "epoch": 38.68778280542986, + "grad_norm": 1.9416965246200562, + "learning_rate": 0.001, + "loss": 1.6892, + "step": 119700 + }, + { + "epoch": 38.720103425985776, + "grad_norm": 1.5908054113388062, + "learning_rate": 0.001, + "loss": 1.6994, + "step": 119800 + }, + { + "epoch": 38.75242404654169, + "grad_norm": 1.4828917980194092, + "learning_rate": 0.001, + "loss": 1.678, + "step": 119900 + }, + { + "epoch": 38.784744667097605, + "grad_norm": 1.236195683479309, + "learning_rate": 0.001, + "loss": 1.7161, + "step": 120000 + }, + { + "epoch": 38.81706528765352, + "grad_norm": 1.6175165176391602, + "learning_rate": 0.001, + "loss": 1.7076, + "step": 120100 + }, + { + "epoch": 38.849385908209435, + "grad_norm": 1.4187142848968506, + "learning_rate": 0.001, + "loss": 1.7213, + "step": 120200 + }, + { + "epoch": 38.88170652876535, + "grad_norm": 1.615424633026123, + "learning_rate": 0.001, + "loss": 1.6994, + "step": 120300 + }, + { + "epoch": 38.914027149321264, + "grad_norm": 1.3184314966201782, + "learning_rate": 0.001, + "loss": 1.7017, + "step": 120400 + }, + { + "epoch": 38.94634776987718, + "grad_norm": 2.0182273387908936, + "learning_rate": 0.001, + "loss": 1.7112, + "step": 120500 + }, + { + "epoch": 38.97866839043309, + "grad_norm": 1.53033447265625, + "learning_rate": 0.001, + "loss": 1.7221, + "step": 120600 + }, + { + "epoch": 39.010989010989015, + "grad_norm": 1.5735342502593994, + "learning_rate": 0.001, + "loss": 1.6814, + "step": 120700 + }, + { + "epoch": 39.04330963154493, + "grad_norm": 1.396195411682129, + "learning_rate": 0.001, + "loss": 1.5669, + "step": 120800 + }, + { + "epoch": 39.075630252100844, + "grad_norm": 1.510571002960205, + "learning_rate": 0.001, + "loss": 1.5803, + "step": 120900 + }, + { + "epoch": 39.10795087265676, + "grad_norm": 1.533634066581726, + "learning_rate": 0.001, + "loss": 1.5828, + "step": 121000 + }, + { + "epoch": 39.14027149321267, + "grad_norm": 1.4806658029556274, + "learning_rate": 0.001, + "loss": 1.5861, + "step": 121100 + }, + { + "epoch": 39.17259211376859, + "grad_norm": 1.7448620796203613, + "learning_rate": 0.001, + "loss": 1.6208, + "step": 121200 + }, + { + "epoch": 39.2049127343245, + "grad_norm": 1.800340175628662, + "learning_rate": 0.001, + "loss": 1.6287, + "step": 121300 + }, + { + "epoch": 39.237233354880416, + "grad_norm": 1.4154140949249268, + "learning_rate": 0.001, + "loss": 1.6252, + "step": 121400 + }, + { + "epoch": 39.26955397543633, + "grad_norm": 1.729437232017517, + "learning_rate": 0.001, + "loss": 1.6358, + "step": 121500 + }, + { + "epoch": 39.301874595992246, + "grad_norm": 1.3439644575119019, + "learning_rate": 0.001, + "loss": 1.6278, + "step": 121600 + }, + { + "epoch": 39.33419521654816, + "grad_norm": 2.249131441116333, + "learning_rate": 0.001, + "loss": 1.6345, + "step": 121700 + }, + { + "epoch": 39.366515837104075, + "grad_norm": 1.8861782550811768, + "learning_rate": 0.001, + "loss": 1.6142, + "step": 121800 + }, + { + "epoch": 39.39883645765999, + "grad_norm": 1.7923866510391235, + "learning_rate": 0.001, + "loss": 1.6408, + "step": 121900 + }, + { + "epoch": 39.431157078215904, + "grad_norm": 1.8089905977249146, + "learning_rate": 0.001, + "loss": 1.625, + "step": 122000 + }, + { + "epoch": 39.46347769877182, + "grad_norm": 1.6108099222183228, + "learning_rate": 0.001, + "loss": 1.6471, + "step": 122100 + }, + { + "epoch": 39.49579831932773, + "grad_norm": 1.8288142681121826, + "learning_rate": 0.001, + "loss": 1.6492, + "step": 122200 + }, + { + "epoch": 39.52811893988365, + "grad_norm": 1.6648753881454468, + "learning_rate": 0.001, + "loss": 1.6743, + "step": 122300 + }, + { + "epoch": 39.56043956043956, + "grad_norm": 1.447311282157898, + "learning_rate": 0.001, + "loss": 1.6649, + "step": 122400 + }, + { + "epoch": 39.59276018099548, + "grad_norm": 1.5929150581359863, + "learning_rate": 0.001, + "loss": 1.6768, + "step": 122500 + }, + { + "epoch": 39.62508080155139, + "grad_norm": 1.5306599140167236, + "learning_rate": 0.001, + "loss": 1.6823, + "step": 122600 + }, + { + "epoch": 39.657401422107306, + "grad_norm": 2.1185336112976074, + "learning_rate": 0.001, + "loss": 1.647, + "step": 122700 + }, + { + "epoch": 39.68972204266322, + "grad_norm": 1.5616239309310913, + "learning_rate": 0.001, + "loss": 1.6811, + "step": 122800 + }, + { + "epoch": 39.722042663219135, + "grad_norm": 1.610378623008728, + "learning_rate": 0.001, + "loss": 1.682, + "step": 122900 + }, + { + "epoch": 39.75436328377505, + "grad_norm": 1.679341197013855, + "learning_rate": 0.001, + "loss": 1.6773, + "step": 123000 + }, + { + "epoch": 39.786683904330964, + "grad_norm": 1.6334840059280396, + "learning_rate": 0.001, + "loss": 1.6717, + "step": 123100 + }, + { + "epoch": 39.81900452488688, + "grad_norm": 1.4154548645019531, + "learning_rate": 0.001, + "loss": 1.6878, + "step": 123200 + }, + { + "epoch": 39.85132514544279, + "grad_norm": 1.5215219259262085, + "learning_rate": 0.001, + "loss": 1.6776, + "step": 123300 + }, + { + "epoch": 39.88364576599871, + "grad_norm": 1.4763894081115723, + "learning_rate": 0.001, + "loss": 1.6989, + "step": 123400 + }, + { + "epoch": 39.91596638655462, + "grad_norm": 1.5995749235153198, + "learning_rate": 0.001, + "loss": 1.6769, + "step": 123500 + }, + { + "epoch": 39.94828700711054, + "grad_norm": 1.726880431175232, + "learning_rate": 0.001, + "loss": 1.7047, + "step": 123600 + }, + { + "epoch": 39.98060762766645, + "grad_norm": 2.136265754699707, + "learning_rate": 0.001, + "loss": 1.7003, + "step": 123700 + }, + { + "epoch": 40.012928248222366, + "grad_norm": 1.5406869649887085, + "learning_rate": 0.001, + "loss": 1.6333, + "step": 123800 + }, + { + "epoch": 40.04524886877828, + "grad_norm": 1.669394850730896, + "learning_rate": 0.001, + "loss": 1.5618, + "step": 123900 + }, + { + "epoch": 40.077569489334195, + "grad_norm": 1.9656615257263184, + "learning_rate": 0.001, + "loss": 1.5684, + "step": 124000 + }, + { + "epoch": 40.10989010989011, + "grad_norm": 1.5017229318618774, + "learning_rate": 0.001, + "loss": 1.5721, + "step": 124100 + }, + { + "epoch": 40.142210730446024, + "grad_norm": 1.9601807594299316, + "learning_rate": 0.001, + "loss": 1.5763, + "step": 124200 + }, + { + "epoch": 40.17453135100194, + "grad_norm": 1.748874545097351, + "learning_rate": 0.001, + "loss": 1.5934, + "step": 124300 + }, + { + "epoch": 40.20685197155785, + "grad_norm": 1.5721076726913452, + "learning_rate": 0.001, + "loss": 1.5867, + "step": 124400 + }, + { + "epoch": 40.23917259211377, + "grad_norm": 2.065415620803833, + "learning_rate": 0.001, + "loss": 1.5856, + "step": 124500 + }, + { + "epoch": 40.27149321266968, + "grad_norm": 1.926713466644287, + "learning_rate": 0.001, + "loss": 1.6055, + "step": 124600 + }, + { + "epoch": 40.3038138332256, + "grad_norm": 1.827108383178711, + "learning_rate": 0.001, + "loss": 1.5936, + "step": 124700 + }, + { + "epoch": 40.33613445378151, + "grad_norm": 2.123699426651001, + "learning_rate": 0.001, + "loss": 1.6068, + "step": 124800 + }, + { + "epoch": 40.368455074337426, + "grad_norm": 2.119037628173828, + "learning_rate": 0.001, + "loss": 1.6113, + "step": 124900 + }, + { + "epoch": 40.40077569489334, + "grad_norm": 1.7764804363250732, + "learning_rate": 0.001, + "loss": 1.6156, + "step": 125000 + }, + { + "epoch": 40.433096315449255, + "grad_norm": 1.7340418100357056, + "learning_rate": 0.001, + "loss": 1.6309, + "step": 125100 + }, + { + "epoch": 40.46541693600517, + "grad_norm": 2.1750295162200928, + "learning_rate": 0.001, + "loss": 1.6165, + "step": 125200 + }, + { + "epoch": 40.497737556561084, + "grad_norm": 1.8627874851226807, + "learning_rate": 0.001, + "loss": 1.6342, + "step": 125300 + }, + { + "epoch": 40.530058177117, + "grad_norm": 2.254770517349243, + "learning_rate": 0.001, + "loss": 1.64, + "step": 125400 + }, + { + "epoch": 40.56237879767291, + "grad_norm": 1.959664225578308, + "learning_rate": 0.001, + "loss": 1.6451, + "step": 125500 + }, + { + "epoch": 40.59469941822883, + "grad_norm": 1.7636991739273071, + "learning_rate": 0.001, + "loss": 1.6386, + "step": 125600 + }, + { + "epoch": 40.62702003878474, + "grad_norm": 1.5677493810653687, + "learning_rate": 0.001, + "loss": 1.6454, + "step": 125700 + }, + { + "epoch": 40.65934065934066, + "grad_norm": 2.089933156967163, + "learning_rate": 0.001, + "loss": 1.6412, + "step": 125800 + }, + { + "epoch": 40.69166127989657, + "grad_norm": 1.8187321424484253, + "learning_rate": 0.001, + "loss": 1.6429, + "step": 125900 + }, + { + "epoch": 40.723981900452486, + "grad_norm": 1.580985426902771, + "learning_rate": 0.001, + "loss": 1.6749, + "step": 126000 + }, + { + "epoch": 40.7563025210084, + "grad_norm": 1.7752652168273926, + "learning_rate": 0.001, + "loss": 1.6616, + "step": 126100 + }, + { + "epoch": 40.788623141564315, + "grad_norm": 2.0827362537384033, + "learning_rate": 0.001, + "loss": 1.6555, + "step": 126200 + }, + { + "epoch": 40.82094376212023, + "grad_norm": 1.8399465084075928, + "learning_rate": 0.001, + "loss": 1.6663, + "step": 126300 + }, + { + "epoch": 40.853264382676144, + "grad_norm": 1.5653818845748901, + "learning_rate": 0.001, + "loss": 1.6752, + "step": 126400 + }, + { + "epoch": 40.88558500323206, + "grad_norm": 2.2567121982574463, + "learning_rate": 0.001, + "loss": 1.6658, + "step": 126500 + }, + { + "epoch": 40.91790562378797, + "grad_norm": 1.8558109998703003, + "learning_rate": 0.001, + "loss": 1.6766, + "step": 126600 + }, + { + "epoch": 40.95022624434389, + "grad_norm": 1.7659657001495361, + "learning_rate": 0.001, + "loss": 1.6761, + "step": 126700 + }, + { + "epoch": 40.9825468648998, + "grad_norm": 1.8757858276367188, + "learning_rate": 0.001, + "loss": 1.6769, + "step": 126800 + }, + { + "epoch": 41.014867485455724, + "grad_norm": 2.512212038040161, + "learning_rate": 0.001, + "loss": 1.6044, + "step": 126900 + }, + { + "epoch": 41.04718810601164, + "grad_norm": 2.0472733974456787, + "learning_rate": 0.001, + "loss": 1.541, + "step": 127000 + }, + { + "epoch": 41.07950872656755, + "grad_norm": 1.9753613471984863, + "learning_rate": 0.001, + "loss": 1.5501, + "step": 127100 + }, + { + "epoch": 41.11182934712347, + "grad_norm": 1.7416068315505981, + "learning_rate": 0.001, + "loss": 1.5391, + "step": 127200 + }, + { + "epoch": 41.14414996767938, + "grad_norm": 2.020958662033081, + "learning_rate": 0.001, + "loss": 1.55, + "step": 127300 + }, + { + "epoch": 41.1764705882353, + "grad_norm": 1.6599868535995483, + "learning_rate": 0.001, + "loss": 1.5525, + "step": 127400 + }, + { + "epoch": 41.20879120879121, + "grad_norm": 2.1170713901519775, + "learning_rate": 0.001, + "loss": 1.5644, + "step": 127500 + }, + { + "epoch": 41.241111829347126, + "grad_norm": 2.277888298034668, + "learning_rate": 0.001, + "loss": 1.5721, + "step": 127600 + }, + { + "epoch": 41.27343244990304, + "grad_norm": 2.1243207454681396, + "learning_rate": 0.001, + "loss": 1.585, + "step": 127700 + }, + { + "epoch": 41.305753070458955, + "grad_norm": 1.9793413877487183, + "learning_rate": 0.001, + "loss": 1.5874, + "step": 127800 + }, + { + "epoch": 41.33807369101487, + "grad_norm": 2.173837661743164, + "learning_rate": 0.001, + "loss": 1.6025, + "step": 127900 + }, + { + "epoch": 41.370394311570784, + "grad_norm": 1.8958253860473633, + "learning_rate": 0.001, + "loss": 1.5792, + "step": 128000 + }, + { + "epoch": 41.4027149321267, + "grad_norm": 1.8988944292068481, + "learning_rate": 0.001, + "loss": 1.5951, + "step": 128100 + }, + { + "epoch": 41.43503555268261, + "grad_norm": 2.107556104660034, + "learning_rate": 0.001, + "loss": 1.6001, + "step": 128200 + }, + { + "epoch": 41.46735617323853, + "grad_norm": 1.8033684492111206, + "learning_rate": 0.001, + "loss": 1.5962, + "step": 128300 + }, + { + "epoch": 41.49967679379444, + "grad_norm": 2.004406213760376, + "learning_rate": 0.001, + "loss": 1.6003, + "step": 128400 + }, + { + "epoch": 41.53199741435036, + "grad_norm": 1.7807986736297607, + "learning_rate": 0.001, + "loss": 1.6142, + "step": 128500 + }, + { + "epoch": 41.56431803490627, + "grad_norm": 2.0727765560150146, + "learning_rate": 0.001, + "loss": 1.6103, + "step": 128600 + }, + { + "epoch": 41.596638655462186, + "grad_norm": 1.7568210363388062, + "learning_rate": 0.001, + "loss": 1.6313, + "step": 128700 + }, + { + "epoch": 41.6289592760181, + "grad_norm": 1.965510606765747, + "learning_rate": 0.001, + "loss": 1.6206, + "step": 128800 + }, + { + "epoch": 41.661279896574015, + "grad_norm": 1.6988037824630737, + "learning_rate": 0.001, + "loss": 1.6457, + "step": 128900 + }, + { + "epoch": 41.69360051712993, + "grad_norm": 2.4268720149993896, + "learning_rate": 0.001, + "loss": 1.6157, + "step": 129000 + }, + { + "epoch": 41.725921137685845, + "grad_norm": 2.0334534645080566, + "learning_rate": 0.001, + "loss": 1.6322, + "step": 129100 + }, + { + "epoch": 41.75824175824176, + "grad_norm": 2.085456371307373, + "learning_rate": 0.001, + "loss": 1.6448, + "step": 129200 + }, + { + "epoch": 41.790562378797674, + "grad_norm": 2.056472063064575, + "learning_rate": 0.001, + "loss": 1.6533, + "step": 129300 + }, + { + "epoch": 41.82288299935359, + "grad_norm": 2.280672073364258, + "learning_rate": 0.001, + "loss": 1.6466, + "step": 129400 + }, + { + "epoch": 41.8552036199095, + "grad_norm": 1.9739775657653809, + "learning_rate": 0.001, + "loss": 1.647, + "step": 129500 + }, + { + "epoch": 41.88752424046542, + "grad_norm": 2.5622267723083496, + "learning_rate": 0.001, + "loss": 1.6521, + "step": 129600 + }, + { + "epoch": 41.91984486102133, + "grad_norm": 1.893709421157837, + "learning_rate": 0.001, + "loss": 1.6407, + "step": 129700 + }, + { + "epoch": 41.95216548157725, + "grad_norm": 1.7965614795684814, + "learning_rate": 0.001, + "loss": 1.6568, + "step": 129800 + }, + { + "epoch": 41.98448610213316, + "grad_norm": 2.208484411239624, + "learning_rate": 0.001, + "loss": 1.6759, + "step": 129900 + }, + { + "epoch": 42.016806722689076, + "grad_norm": 2.078091859817505, + "learning_rate": 0.001, + "loss": 1.5652, + "step": 130000 + }, + { + "epoch": 42.04912734324499, + "grad_norm": 1.9855774641036987, + "learning_rate": 0.001, + "loss": 1.5017, + "step": 130100 + }, + { + "epoch": 42.081447963800905, + "grad_norm": 2.0214784145355225, + "learning_rate": 0.001, + "loss": 1.545, + "step": 130200 + }, + { + "epoch": 42.11376858435682, + "grad_norm": 2.2106988430023193, + "learning_rate": 0.001, + "loss": 1.5264, + "step": 130300 + }, + { + "epoch": 42.146089204912734, + "grad_norm": 2.208866834640503, + "learning_rate": 0.001, + "loss": 1.5276, + "step": 130400 + }, + { + "epoch": 42.17840982546865, + "grad_norm": 2.6902999877929688, + "learning_rate": 0.001, + "loss": 1.5549, + "step": 130500 + }, + { + "epoch": 42.21073044602456, + "grad_norm": 2.041752576828003, + "learning_rate": 0.001, + "loss": 1.5361, + "step": 130600 + }, + { + "epoch": 42.24305106658048, + "grad_norm": 1.764216423034668, + "learning_rate": 0.001, + "loss": 1.5404, + "step": 130700 + }, + { + "epoch": 42.27537168713639, + "grad_norm": 2.0946340560913086, + "learning_rate": 0.001, + "loss": 1.5454, + "step": 130800 + }, + { + "epoch": 42.30769230769231, + "grad_norm": 2.1806182861328125, + "learning_rate": 0.001, + "loss": 1.5488, + "step": 130900 + }, + { + "epoch": 42.34001292824822, + "grad_norm": 1.8357605934143066, + "learning_rate": 0.001, + "loss": 1.572, + "step": 131000 + }, + { + "epoch": 42.372333548804136, + "grad_norm": 2.311324119567871, + "learning_rate": 0.001, + "loss": 1.5771, + "step": 131100 + }, + { + "epoch": 42.40465416936005, + "grad_norm": 2.406909465789795, + "learning_rate": 0.001, + "loss": 1.588, + "step": 131200 + }, + { + "epoch": 42.436974789915965, + "grad_norm": 2.6602001190185547, + "learning_rate": 0.001, + "loss": 1.5705, + "step": 131300 + }, + { + "epoch": 42.46929541047188, + "grad_norm": 2.230372428894043, + "learning_rate": 0.001, + "loss": 1.5834, + "step": 131400 + }, + { + "epoch": 42.501616031027794, + "grad_norm": 2.017826557159424, + "learning_rate": 0.001, + "loss": 1.5873, + "step": 131500 + }, + { + "epoch": 42.53393665158371, + "grad_norm": 2.3046560287475586, + "learning_rate": 0.001, + "loss": 1.5883, + "step": 131600 + }, + { + "epoch": 42.56625727213962, + "grad_norm": 2.286146402359009, + "learning_rate": 0.001, + "loss": 1.6031, + "step": 131700 + }, + { + "epoch": 42.59857789269554, + "grad_norm": 2.2403616905212402, + "learning_rate": 0.001, + "loss": 1.6109, + "step": 131800 + }, + { + "epoch": 42.63089851325145, + "grad_norm": 2.069786787033081, + "learning_rate": 0.001, + "loss": 1.6011, + "step": 131900 + }, + { + "epoch": 42.66321913380737, + "grad_norm": 2.289213180541992, + "learning_rate": 0.001, + "loss": 1.6092, + "step": 132000 + }, + { + "epoch": 42.69553975436328, + "grad_norm": 2.493170738220215, + "learning_rate": 0.001, + "loss": 1.608, + "step": 132100 + }, + { + "epoch": 42.727860374919196, + "grad_norm": 2.157357931137085, + "learning_rate": 0.001, + "loss": 1.6024, + "step": 132200 + }, + { + "epoch": 42.76018099547511, + "grad_norm": 1.8655034303665161, + "learning_rate": 0.001, + "loss": 1.6296, + "step": 132300 + }, + { + "epoch": 42.792501616031025, + "grad_norm": 1.8031189441680908, + "learning_rate": 0.001, + "loss": 1.623, + "step": 132400 + }, + { + "epoch": 42.82482223658694, + "grad_norm": 1.9016975164413452, + "learning_rate": 0.001, + "loss": 1.6318, + "step": 132500 + }, + { + "epoch": 42.857142857142854, + "grad_norm": 2.0630972385406494, + "learning_rate": 0.001, + "loss": 1.6342, + "step": 132600 + }, + { + "epoch": 42.88946347769877, + "grad_norm": 2.427926778793335, + "learning_rate": 0.001, + "loss": 1.6399, + "step": 132700 + }, + { + "epoch": 42.92178409825468, + "grad_norm": 2.0053133964538574, + "learning_rate": 0.001, + "loss": 1.6561, + "step": 132800 + }, + { + "epoch": 42.9541047188106, + "grad_norm": 2.1016571521759033, + "learning_rate": 0.001, + "loss": 1.6391, + "step": 132900 + }, + { + "epoch": 42.98642533936652, + "grad_norm": 2.22855281829834, + "learning_rate": 0.001, + "loss": 1.6486, + "step": 133000 + }, + { + "epoch": 43.018745959922434, + "grad_norm": 1.6028803586959839, + "learning_rate": 0.001, + "loss": 1.5826, + "step": 133100 + }, + { + "epoch": 43.05106658047835, + "grad_norm": 2.0032293796539307, + "learning_rate": 0.001, + "loss": 1.4941, + "step": 133200 + }, + { + "epoch": 43.08338720103426, + "grad_norm": 1.550441861152649, + "learning_rate": 0.001, + "loss": 1.527, + "step": 133300 + }, + { + "epoch": 43.11570782159018, + "grad_norm": 1.429332971572876, + "learning_rate": 0.001, + "loss": 1.5175, + "step": 133400 + }, + { + "epoch": 43.14802844214609, + "grad_norm": 1.6169044971466064, + "learning_rate": 0.001, + "loss": 1.5182, + "step": 133500 + }, + { + "epoch": 43.18034906270201, + "grad_norm": 1.5943845510482788, + "learning_rate": 0.001, + "loss": 1.5155, + "step": 133600 + }, + { + "epoch": 43.21266968325792, + "grad_norm": 5.721950531005859, + "learning_rate": 0.001, + "loss": 1.5268, + "step": 133700 + }, + { + "epoch": 43.244990303813836, + "grad_norm": 1.472016453742981, + "learning_rate": 0.001, + "loss": 1.5302, + "step": 133800 + }, + { + "epoch": 43.27731092436975, + "grad_norm": 1.616565227508545, + "learning_rate": 0.001, + "loss": 1.5384, + "step": 133900 + }, + { + "epoch": 43.309631544925665, + "grad_norm": 1.4653263092041016, + "learning_rate": 0.001, + "loss": 1.5375, + "step": 134000 + }, + { + "epoch": 43.34195216548158, + "grad_norm": 1.5831880569458008, + "learning_rate": 0.001, + "loss": 1.5442, + "step": 134100 + }, + { + "epoch": 43.374272786037494, + "grad_norm": 2.1112189292907715, + "learning_rate": 0.001, + "loss": 1.5827, + "step": 134200 + }, + { + "epoch": 43.40659340659341, + "grad_norm": 1.6573975086212158, + "learning_rate": 0.001, + "loss": 1.5598, + "step": 134300 + }, + { + "epoch": 43.43891402714932, + "grad_norm": 1.8012890815734863, + "learning_rate": 0.001, + "loss": 1.5378, + "step": 134400 + }, + { + "epoch": 43.47123464770524, + "grad_norm": 1.6912004947662354, + "learning_rate": 0.001, + "loss": 1.5774, + "step": 134500 + }, + { + "epoch": 43.50355526826115, + "grad_norm": 1.7754735946655273, + "learning_rate": 0.001, + "loss": 1.5764, + "step": 134600 + }, + { + "epoch": 43.53587588881707, + "grad_norm": 2.1237239837646484, + "learning_rate": 0.001, + "loss": 1.576, + "step": 134700 + }, + { + "epoch": 43.56819650937298, + "grad_norm": 1.9989526271820068, + "learning_rate": 0.001, + "loss": 1.5701, + "step": 134800 + }, + { + "epoch": 43.600517129928896, + "grad_norm": 1.6959154605865479, + "learning_rate": 0.001, + "loss": 1.5828, + "step": 134900 + }, + { + "epoch": 43.63283775048481, + "grad_norm": 1.4177024364471436, + "learning_rate": 0.001, + "loss": 1.5929, + "step": 135000 + }, + { + "epoch": 43.665158371040725, + "grad_norm": 1.2994517087936401, + "learning_rate": 0.001, + "loss": 1.5947, + "step": 135100 + }, + { + "epoch": 43.69747899159664, + "grad_norm": 1.7819082736968994, + "learning_rate": 0.001, + "loss": 1.5897, + "step": 135200 + }, + { + "epoch": 43.729799612152554, + "grad_norm": 1.472542405128479, + "learning_rate": 0.001, + "loss": 1.6063, + "step": 135300 + }, + { + "epoch": 43.76212023270847, + "grad_norm": 1.7978127002716064, + "learning_rate": 0.001, + "loss": 1.5995, + "step": 135400 + }, + { + "epoch": 43.79444085326438, + "grad_norm": 1.496368169784546, + "learning_rate": 0.001, + "loss": 1.6055, + "step": 135500 + }, + { + "epoch": 43.8267614738203, + "grad_norm": 2.0534775257110596, + "learning_rate": 0.001, + "loss": 1.624, + "step": 135600 + }, + { + "epoch": 43.85908209437621, + "grad_norm": 1.7040880918502808, + "learning_rate": 0.001, + "loss": 1.6129, + "step": 135700 + }, + { + "epoch": 43.89140271493213, + "grad_norm": 1.5424871444702148, + "learning_rate": 0.001, + "loss": 1.6121, + "step": 135800 + }, + { + "epoch": 43.92372333548804, + "grad_norm": 1.6452441215515137, + "learning_rate": 0.001, + "loss": 1.6367, + "step": 135900 + }, + { + "epoch": 43.956043956043956, + "grad_norm": 1.9150134325027466, + "learning_rate": 0.001, + "loss": 1.6305, + "step": 136000 + }, + { + "epoch": 43.98836457659987, + "grad_norm": 1.769917607307434, + "learning_rate": 0.001, + "loss": 1.6333, + "step": 136100 + }, + { + "epoch": 44.020685197155785, + "grad_norm": 1.7169297933578491, + "learning_rate": 0.001, + "loss": 1.5414, + "step": 136200 + }, + { + "epoch": 44.0530058177117, + "grad_norm": 1.613829255104065, + "learning_rate": 0.001, + "loss": 1.494, + "step": 136300 + }, + { + "epoch": 44.085326438267614, + "grad_norm": 1.3439263105392456, + "learning_rate": 0.001, + "loss": 1.488, + "step": 136400 + }, + { + "epoch": 44.11764705882353, + "grad_norm": 1.3409062623977661, + "learning_rate": 0.001, + "loss": 1.5025, + "step": 136500 + }, + { + "epoch": 44.14996767937944, + "grad_norm": 1.8013584613800049, + "learning_rate": 0.001, + "loss": 1.4958, + "step": 136600 + }, + { + "epoch": 44.18228829993536, + "grad_norm": 1.5000190734863281, + "learning_rate": 0.001, + "loss": 1.5028, + "step": 136700 + }, + { + "epoch": 44.21460892049127, + "grad_norm": 1.672287940979004, + "learning_rate": 0.001, + "loss": 1.5285, + "step": 136800 + }, + { + "epoch": 44.24692954104719, + "grad_norm": 1.3260489702224731, + "learning_rate": 0.001, + "loss": 1.5245, + "step": 136900 + }, + { + "epoch": 44.2792501616031, + "grad_norm": 1.5546982288360596, + "learning_rate": 0.001, + "loss": 1.5327, + "step": 137000 + }, + { + "epoch": 44.311570782159016, + "grad_norm": 1.8142194747924805, + "learning_rate": 0.001, + "loss": 1.5384, + "step": 137100 + }, + { + "epoch": 44.34389140271493, + "grad_norm": 1.5028103590011597, + "learning_rate": 0.001, + "loss": 1.5345, + "step": 137200 + }, + { + "epoch": 44.376212023270845, + "grad_norm": 1.577204704284668, + "learning_rate": 0.001, + "loss": 1.5437, + "step": 137300 + }, + { + "epoch": 44.40853264382676, + "grad_norm": 1.8334197998046875, + "learning_rate": 0.001, + "loss": 1.5337, + "step": 137400 + }, + { + "epoch": 44.440853264382675, + "grad_norm": 1.5087043046951294, + "learning_rate": 0.001, + "loss": 1.5548, + "step": 137500 + }, + { + "epoch": 44.47317388493859, + "grad_norm": 1.5817244052886963, + "learning_rate": 0.001, + "loss": 1.5527, + "step": 137600 + }, + { + "epoch": 44.505494505494504, + "grad_norm": 1.6286990642547607, + "learning_rate": 0.001, + "loss": 1.5375, + "step": 137700 + }, + { + "epoch": 44.53781512605042, + "grad_norm": 1.6903146505355835, + "learning_rate": 0.001, + "loss": 1.555, + "step": 137800 + }, + { + "epoch": 44.57013574660633, + "grad_norm": 1.5768530368804932, + "learning_rate": 0.001, + "loss": 1.5537, + "step": 137900 + }, + { + "epoch": 44.60245636716225, + "grad_norm": 1.6440843343734741, + "learning_rate": 0.001, + "loss": 1.5767, + "step": 138000 + }, + { + "epoch": 44.63477698771816, + "grad_norm": 1.4411373138427734, + "learning_rate": 0.001, + "loss": 1.5582, + "step": 138100 + }, + { + "epoch": 44.66709760827408, + "grad_norm": 6.931090354919434, + "learning_rate": 0.001, + "loss": 1.5674, + "step": 138200 + }, + { + "epoch": 44.69941822882999, + "grad_norm": 1.7121952772140503, + "learning_rate": 0.001, + "loss": 1.5849, + "step": 138300 + }, + { + "epoch": 44.731738849385906, + "grad_norm": 1.3487331867218018, + "learning_rate": 0.001, + "loss": 1.581, + "step": 138400 + }, + { + "epoch": 44.76405946994182, + "grad_norm": 1.4686524868011475, + "learning_rate": 0.001, + "loss": 1.5855, + "step": 138500 + }, + { + "epoch": 44.796380090497735, + "grad_norm": 1.3534255027770996, + "learning_rate": 0.001, + "loss": 1.5878, + "step": 138600 + }, + { + "epoch": 44.82870071105365, + "grad_norm": 1.4645819664001465, + "learning_rate": 0.001, + "loss": 1.5831, + "step": 138700 + }, + { + "epoch": 44.861021331609564, + "grad_norm": 1.9700833559036255, + "learning_rate": 0.001, + "loss": 1.5986, + "step": 138800 + }, + { + "epoch": 44.89334195216548, + "grad_norm": 1.4428085088729858, + "learning_rate": 0.001, + "loss": 1.5975, + "step": 138900 + }, + { + "epoch": 44.92566257272139, + "grad_norm": 1.724913239479065, + "learning_rate": 0.001, + "loss": 1.601, + "step": 139000 + }, + { + "epoch": 44.95798319327731, + "grad_norm": 1.7658933401107788, + "learning_rate": 0.001, + "loss": 1.5877, + "step": 139100 + }, + { + "epoch": 44.99030381383322, + "grad_norm": 1.6144723892211914, + "learning_rate": 0.001, + "loss": 1.566, + "step": 139200 + }, + { + "epoch": 45.022624434389144, + "grad_norm": 1.3865910768508911, + "learning_rate": 0.001, + "loss": 1.5231, + "step": 139300 + }, + { + "epoch": 45.05494505494506, + "grad_norm": 1.631665825843811, + "learning_rate": 0.001, + "loss": 1.4477, + "step": 139400 + }, + { + "epoch": 45.08726567550097, + "grad_norm": 1.7368639707565308, + "learning_rate": 0.001, + "loss": 1.4724, + "step": 139500 + }, + { + "epoch": 45.11958629605689, + "grad_norm": 1.483976125717163, + "learning_rate": 0.001, + "loss": 1.4668, + "step": 139600 + }, + { + "epoch": 45.1519069166128, + "grad_norm": 1.597456932067871, + "learning_rate": 0.001, + "loss": 1.484, + "step": 139700 + }, + { + "epoch": 45.18422753716872, + "grad_norm": 1.684712529182434, + "learning_rate": 0.001, + "loss": 1.4857, + "step": 139800 + }, + { + "epoch": 45.21654815772463, + "grad_norm": 1.7697254419326782, + "learning_rate": 0.001, + "loss": 1.4854, + "step": 139900 + }, + { + "epoch": 45.248868778280546, + "grad_norm": 1.6394789218902588, + "learning_rate": 0.001, + "loss": 1.494, + "step": 140000 + }, + { + "epoch": 45.28118939883646, + "grad_norm": 1.8333781957626343, + "learning_rate": 0.001, + "loss": 1.4974, + "step": 140100 + }, + { + "epoch": 45.313510019392375, + "grad_norm": 1.4193600416183472, + "learning_rate": 0.001, + "loss": 1.5108, + "step": 140200 + }, + { + "epoch": 45.34583063994829, + "grad_norm": 1.4354008436203003, + "learning_rate": 0.001, + "loss": 1.4979, + "step": 140300 + }, + { + "epoch": 45.378151260504204, + "grad_norm": 1.3426471948623657, + "learning_rate": 0.001, + "loss": 1.5283, + "step": 140400 + }, + { + "epoch": 45.41047188106012, + "grad_norm": 1.817825198173523, + "learning_rate": 0.001, + "loss": 1.5295, + "step": 140500 + }, + { + "epoch": 45.44279250161603, + "grad_norm": 1.8174062967300415, + "learning_rate": 0.001, + "loss": 1.5311, + "step": 140600 + }, + { + "epoch": 45.47511312217195, + "grad_norm": 1.4800944328308105, + "learning_rate": 0.001, + "loss": 1.5262, + "step": 140700 + }, + { + "epoch": 45.50743374272786, + "grad_norm": 1.8175491094589233, + "learning_rate": 0.001, + "loss": 1.5133, + "step": 140800 + }, + { + "epoch": 45.53975436328378, + "grad_norm": 1.32296884059906, + "learning_rate": 0.001, + "loss": 1.5473, + "step": 140900 + }, + { + "epoch": 45.57207498383969, + "grad_norm": 1.3446507453918457, + "learning_rate": 0.001, + "loss": 1.5304, + "step": 141000 + }, + { + "epoch": 45.604395604395606, + "grad_norm": 1.742011547088623, + "learning_rate": 0.001, + "loss": 1.5464, + "step": 141100 + }, + { + "epoch": 45.63671622495152, + "grad_norm": 1.6908282041549683, + "learning_rate": 0.001, + "loss": 1.5599, + "step": 141200 + }, + { + "epoch": 45.669036845507435, + "grad_norm": 1.6784456968307495, + "learning_rate": 0.001, + "loss": 1.5543, + "step": 141300 + }, + { + "epoch": 45.70135746606335, + "grad_norm": 1.5689213275909424, + "learning_rate": 0.001, + "loss": 1.555, + "step": 141400 + }, + { + "epoch": 45.733678086619264, + "grad_norm": 1.5936824083328247, + "learning_rate": 0.001, + "loss": 1.5674, + "step": 141500 + }, + { + "epoch": 45.76599870717518, + "grad_norm": 1.7053321599960327, + "learning_rate": 0.001, + "loss": 1.5705, + "step": 141600 + }, + { + "epoch": 45.79831932773109, + "grad_norm": 2.1247267723083496, + "learning_rate": 0.001, + "loss": 1.5815, + "step": 141700 + }, + { + "epoch": 45.83063994828701, + "grad_norm": 1.4352737665176392, + "learning_rate": 0.001, + "loss": 1.573, + "step": 141800 + }, + { + "epoch": 45.86296056884292, + "grad_norm": 1.7753167152404785, + "learning_rate": 0.001, + "loss": 1.5797, + "step": 141900 + }, + { + "epoch": 45.89528118939884, + "grad_norm": 1.3698890209197998, + "learning_rate": 0.001, + "loss": 1.5852, + "step": 142000 + }, + { + "epoch": 45.92760180995475, + "grad_norm": 1.4331963062286377, + "learning_rate": 0.001, + "loss": 1.5806, + "step": 142100 + }, + { + "epoch": 45.959922430510666, + "grad_norm": 1.4742848873138428, + "learning_rate": 0.001, + "loss": 1.5736, + "step": 142200 + }, + { + "epoch": 45.99224305106658, + "grad_norm": 1.542399287223816, + "learning_rate": 0.001, + "loss": 1.6072, + "step": 142300 + }, + { + "epoch": 46.024563671622495, + "grad_norm": 1.7629003524780273, + "learning_rate": 0.001, + "loss": 1.4912, + "step": 142400 + }, + { + "epoch": 46.05688429217841, + "grad_norm": 1.528564691543579, + "learning_rate": 0.001, + "loss": 1.4412, + "step": 142500 + }, + { + "epoch": 46.089204912734324, + "grad_norm": 1.6338632106781006, + "learning_rate": 0.001, + "loss": 1.4508, + "step": 142600 + }, + { + "epoch": 46.12152553329024, + "grad_norm": 2.0615508556365967, + "learning_rate": 0.001, + "loss": 1.4567, + "step": 142700 + }, + { + "epoch": 46.15384615384615, + "grad_norm": 1.3304144144058228, + "learning_rate": 0.001, + "loss": 1.4729, + "step": 142800 + }, + { + "epoch": 46.18616677440207, + "grad_norm": 2.0170974731445312, + "learning_rate": 0.001, + "loss": 1.4745, + "step": 142900 + }, + { + "epoch": 46.21848739495798, + "grad_norm": 1.8077945709228516, + "learning_rate": 0.001, + "loss": 1.4868, + "step": 143000 + }, + { + "epoch": 46.2508080155139, + "grad_norm": 1.6119381189346313, + "learning_rate": 0.001, + "loss": 1.4822, + "step": 143100 + }, + { + "epoch": 46.28312863606981, + "grad_norm": 1.403429388999939, + "learning_rate": 0.001, + "loss": 1.4943, + "step": 143200 + }, + { + "epoch": 46.315449256625726, + "grad_norm": 1.3802709579467773, + "learning_rate": 0.001, + "loss": 1.485, + "step": 143300 + }, + { + "epoch": 46.34776987718164, + "grad_norm": 1.4897013902664185, + "learning_rate": 0.001, + "loss": 1.4872, + "step": 143400 + }, + { + "epoch": 46.380090497737555, + "grad_norm": 1.8272331953048706, + "learning_rate": 0.001, + "loss": 1.4959, + "step": 143500 + }, + { + "epoch": 46.41241111829347, + "grad_norm": 1.6205357313156128, + "learning_rate": 0.001, + "loss": 1.4995, + "step": 143600 + }, + { + "epoch": 46.444731738849384, + "grad_norm": 1.613732099533081, + "learning_rate": 0.001, + "loss": 1.5116, + "step": 143700 + }, + { + "epoch": 46.4770523594053, + "grad_norm": 1.474404215812683, + "learning_rate": 0.001, + "loss": 1.5179, + "step": 143800 + }, + { + "epoch": 46.50937297996121, + "grad_norm": 1.4578990936279297, + "learning_rate": 0.001, + "loss": 1.5057, + "step": 143900 + }, + { + "epoch": 46.54169360051713, + "grad_norm": 1.424071192741394, + "learning_rate": 0.001, + "loss": 1.5122, + "step": 144000 + }, + { + "epoch": 46.57401422107304, + "grad_norm": 1.4065488576889038, + "learning_rate": 0.001, + "loss": 1.5131, + "step": 144100 + }, + { + "epoch": 46.60633484162896, + "grad_norm": 1.8913878202438354, + "learning_rate": 0.001, + "loss": 1.5399, + "step": 144200 + }, + { + "epoch": 46.63865546218487, + "grad_norm": 1.6994715929031372, + "learning_rate": 0.001, + "loss": 1.5329, + "step": 144300 + }, + { + "epoch": 46.670976082740786, + "grad_norm": 2.1696887016296387, + "learning_rate": 0.001, + "loss": 1.5482, + "step": 144400 + }, + { + "epoch": 46.7032967032967, + "grad_norm": 1.8181955814361572, + "learning_rate": 0.001, + "loss": 1.5486, + "step": 144500 + }, + { + "epoch": 46.735617323852615, + "grad_norm": 1.3913260698318481, + "learning_rate": 0.001, + "loss": 1.5384, + "step": 144600 + }, + { + "epoch": 46.76793794440853, + "grad_norm": 1.6030380725860596, + "learning_rate": 0.001, + "loss": 1.5396, + "step": 144700 + }, + { + "epoch": 46.800258564964444, + "grad_norm": 2.035700559616089, + "learning_rate": 0.001, + "loss": 1.5648, + "step": 144800 + }, + { + "epoch": 46.83257918552036, + "grad_norm": 1.7987518310546875, + "learning_rate": 0.001, + "loss": 1.5471, + "step": 144900 + }, + { + "epoch": 46.864899806076274, + "grad_norm": 1.4945619106292725, + "learning_rate": 0.001, + "loss": 1.5659, + "step": 145000 + }, + { + "epoch": 46.89722042663219, + "grad_norm": 1.7767857313156128, + "learning_rate": 0.001, + "loss": 1.5632, + "step": 145100 + }, + { + "epoch": 46.9295410471881, + "grad_norm": 1.6083945035934448, + "learning_rate": 0.001, + "loss": 1.5382, + "step": 145200 + }, + { + "epoch": 46.96186166774402, + "grad_norm": 1.7108503580093384, + "learning_rate": 0.001, + "loss": 1.57, + "step": 145300 + }, + { + "epoch": 46.99418228829994, + "grad_norm": 1.4575233459472656, + "learning_rate": 0.001, + "loss": 1.566, + "step": 145400 + }, + { + "epoch": 47.02650290885585, + "grad_norm": 1.838165283203125, + "learning_rate": 0.001, + "loss": 1.4575, + "step": 145500 + }, + { + "epoch": 47.05882352941177, + "grad_norm": 1.6801042556762695, + "learning_rate": 0.001, + "loss": 1.4235, + "step": 145600 + }, + { + "epoch": 47.09114414996768, + "grad_norm": 1.862371802330017, + "learning_rate": 0.001, + "loss": 1.434, + "step": 145700 + }, + { + "epoch": 47.1234647705236, + "grad_norm": 2.157015800476074, + "learning_rate": 0.001, + "loss": 1.4335, + "step": 145800 + }, + { + "epoch": 47.15578539107951, + "grad_norm": 1.6546690464019775, + "learning_rate": 0.001, + "loss": 1.4435, + "step": 145900 + }, + { + "epoch": 47.188106011635426, + "grad_norm": 1.6954889297485352, + "learning_rate": 0.001, + "loss": 1.4419, + "step": 146000 + }, + { + "epoch": 47.22042663219134, + "grad_norm": 1.6528239250183105, + "learning_rate": 0.001, + "loss": 1.468, + "step": 146100 + }, + { + "epoch": 47.252747252747255, + "grad_norm": 1.9358820915222168, + "learning_rate": 0.001, + "loss": 1.4415, + "step": 146200 + }, + { + "epoch": 47.28506787330317, + "grad_norm": 1.525715708732605, + "learning_rate": 0.001, + "loss": 1.483, + "step": 146300 + }, + { + "epoch": 47.317388493859085, + "grad_norm": 1.8445520401000977, + "learning_rate": 0.001, + "loss": 1.4701, + "step": 146400 + }, + { + "epoch": 47.349709114415, + "grad_norm": 1.9748725891113281, + "learning_rate": 0.001, + "loss": 1.4815, + "step": 146500 + }, + { + "epoch": 47.382029734970914, + "grad_norm": 1.626009464263916, + "learning_rate": 0.001, + "loss": 1.4907, + "step": 146600 + }, + { + "epoch": 47.41435035552683, + "grad_norm": 1.9791555404663086, + "learning_rate": 0.001, + "loss": 1.4922, + "step": 146700 + }, + { + "epoch": 47.44667097608274, + "grad_norm": 2.1587910652160645, + "learning_rate": 0.001, + "loss": 1.4788, + "step": 146800 + }, + { + "epoch": 47.47899159663866, + "grad_norm": 1.6786390542984009, + "learning_rate": 0.001, + "loss": 1.483, + "step": 146900 + }, + { + "epoch": 47.51131221719457, + "grad_norm": 1.9314137697219849, + "learning_rate": 0.001, + "loss": 1.5109, + "step": 147000 + }, + { + "epoch": 47.543632837750486, + "grad_norm": 1.7206977605819702, + "learning_rate": 0.001, + "loss": 1.5194, + "step": 147100 + }, + { + "epoch": 47.5759534583064, + "grad_norm": 1.6161469221115112, + "learning_rate": 0.001, + "loss": 1.5116, + "step": 147200 + }, + { + "epoch": 47.608274078862316, + "grad_norm": 2.3246424198150635, + "learning_rate": 0.001, + "loss": 1.5192, + "step": 147300 + }, + { + "epoch": 47.64059469941823, + "grad_norm": 1.691521167755127, + "learning_rate": 0.001, + "loss": 1.5237, + "step": 147400 + }, + { + "epoch": 47.672915319974145, + "grad_norm": 2.0103893280029297, + "learning_rate": 0.001, + "loss": 1.5214, + "step": 147500 + }, + { + "epoch": 47.70523594053006, + "grad_norm": 1.7042127847671509, + "learning_rate": 0.001, + "loss": 1.5186, + "step": 147600 + }, + { + "epoch": 47.737556561085974, + "grad_norm": 1.4514890909194946, + "learning_rate": 0.001, + "loss": 1.5237, + "step": 147700 + }, + { + "epoch": 47.76987718164189, + "grad_norm": 1.584375023841858, + "learning_rate": 0.001, + "loss": 1.5369, + "step": 147800 + }, + { + "epoch": 47.8021978021978, + "grad_norm": 1.8270056247711182, + "learning_rate": 0.001, + "loss": 1.5443, + "step": 147900 + }, + { + "epoch": 47.83451842275372, + "grad_norm": 1.6497583389282227, + "learning_rate": 0.001, + "loss": 1.5389, + "step": 148000 + }, + { + "epoch": 47.86683904330963, + "grad_norm": 1.663865089416504, + "learning_rate": 0.001, + "loss": 1.5383, + "step": 148100 + }, + { + "epoch": 47.89915966386555, + "grad_norm": 1.5827676057815552, + "learning_rate": 0.001, + "loss": 1.5506, + "step": 148200 + }, + { + "epoch": 47.93148028442146, + "grad_norm": 1.687949776649475, + "learning_rate": 0.001, + "loss": 1.5282, + "step": 148300 + }, + { + "epoch": 47.963800904977376, + "grad_norm": 1.9395780563354492, + "learning_rate": 0.001, + "loss": 1.5506, + "step": 148400 + }, + { + "epoch": 47.99612152553329, + "grad_norm": 2.5985894203186035, + "learning_rate": 0.001, + "loss": 1.5313, + "step": 148500 + }, + { + "epoch": 48.028442146089205, + "grad_norm": 1.8825600147247314, + "learning_rate": 0.001, + "loss": 1.4213, + "step": 148600 + }, + { + "epoch": 48.06076276664512, + "grad_norm": 1.617006540298462, + "learning_rate": 0.001, + "loss": 1.4215, + "step": 148700 + }, + { + "epoch": 48.093083387201034, + "grad_norm": 1.5653728246688843, + "learning_rate": 0.001, + "loss": 1.4204, + "step": 148800 + }, + { + "epoch": 48.12540400775695, + "grad_norm": 1.995313286781311, + "learning_rate": 0.001, + "loss": 1.4367, + "step": 148900 + }, + { + "epoch": 48.15772462831286, + "grad_norm": 1.708569049835205, + "learning_rate": 0.001, + "loss": 1.4358, + "step": 149000 + }, + { + "epoch": 48.19004524886878, + "grad_norm": 1.6492958068847656, + "learning_rate": 0.001, + "loss": 1.4422, + "step": 149100 + }, + { + "epoch": 48.22236586942469, + "grad_norm": 1.8807870149612427, + "learning_rate": 0.001, + "loss": 1.4331, + "step": 149200 + }, + { + "epoch": 48.25468648998061, + "grad_norm": 1.88623046875, + "learning_rate": 0.001, + "loss": 1.4425, + "step": 149300 + }, + { + "epoch": 48.28700711053652, + "grad_norm": 2.2832539081573486, + "learning_rate": 0.001, + "loss": 1.4621, + "step": 149400 + }, + { + "epoch": 48.319327731092436, + "grad_norm": 1.6884719133377075, + "learning_rate": 0.001, + "loss": 1.4365, + "step": 149500 + }, + { + "epoch": 48.35164835164835, + "grad_norm": 1.904253602027893, + "learning_rate": 0.001, + "loss": 1.4675, + "step": 149600 + }, + { + "epoch": 48.383968972204265, + "grad_norm": 1.6859662532806396, + "learning_rate": 0.001, + "loss": 1.4725, + "step": 149700 + }, + { + "epoch": 48.41628959276018, + "grad_norm": 2.050351858139038, + "learning_rate": 0.001, + "loss": 1.4567, + "step": 149800 + }, + { + "epoch": 48.448610213316094, + "grad_norm": 1.3949456214904785, + "learning_rate": 0.001, + "loss": 1.489, + "step": 149900 + }, + { + "epoch": 48.48093083387201, + "grad_norm": 1.9798871278762817, + "learning_rate": 0.001, + "loss": 1.4699, + "step": 150000 + }, + { + "epoch": 48.51325145442792, + "grad_norm": 1.7036690711975098, + "learning_rate": 0.001, + "loss": 1.4783, + "step": 150100 + }, + { + "epoch": 48.54557207498384, + "grad_norm": 2.0233912467956543, + "learning_rate": 0.001, + "loss": 1.4945, + "step": 150200 + }, + { + "epoch": 48.57789269553975, + "grad_norm": 1.6232671737670898, + "learning_rate": 0.001, + "loss": 1.4934, + "step": 150300 + }, + { + "epoch": 48.61021331609567, + "grad_norm": 1.8260281085968018, + "learning_rate": 0.001, + "loss": 1.5012, + "step": 150400 + }, + { + "epoch": 48.64253393665158, + "grad_norm": 2.079585552215576, + "learning_rate": 0.001, + "loss": 1.5057, + "step": 150500 + }, + { + "epoch": 48.674854557207496, + "grad_norm": 1.9923733472824097, + "learning_rate": 0.001, + "loss": 1.4999, + "step": 150600 + }, + { + "epoch": 48.70717517776341, + "grad_norm": 2.004462480545044, + "learning_rate": 0.001, + "loss": 1.5087, + "step": 150700 + }, + { + "epoch": 48.739495798319325, + "grad_norm": 1.9867020845413208, + "learning_rate": 0.001, + "loss": 1.5161, + "step": 150800 + }, + { + "epoch": 48.77181641887524, + "grad_norm": 1.81278657913208, + "learning_rate": 0.001, + "loss": 1.5151, + "step": 150900 + }, + { + "epoch": 48.804137039431154, + "grad_norm": 1.93266761302948, + "learning_rate": 0.001, + "loss": 1.5136, + "step": 151000 + }, + { + "epoch": 48.83645765998707, + "grad_norm": 2.0835654735565186, + "learning_rate": 0.001, + "loss": 1.5254, + "step": 151100 + }, + { + "epoch": 48.86877828054298, + "grad_norm": 1.9827555418014526, + "learning_rate": 0.001, + "loss": 1.5334, + "step": 151200 + }, + { + "epoch": 48.9010989010989, + "grad_norm": 1.724138855934143, + "learning_rate": 0.001, + "loss": 1.5281, + "step": 151300 + }, + { + "epoch": 48.93341952165481, + "grad_norm": 1.6550005674362183, + "learning_rate": 0.001, + "loss": 1.5372, + "step": 151400 + }, + { + "epoch": 48.96574014221073, + "grad_norm": 2.217447519302368, + "learning_rate": 0.001, + "loss": 1.5312, + "step": 151500 + }, + { + "epoch": 48.99806076276664, + "grad_norm": 2.1639389991760254, + "learning_rate": 0.001, + "loss": 1.5187, + "step": 151600 + }, + { + "epoch": 49.03038138332256, + "grad_norm": 1.84763503074646, + "learning_rate": 0.001, + "loss": 1.3963, + "step": 151700 + }, + { + "epoch": 49.06270200387848, + "grad_norm": 1.755767583847046, + "learning_rate": 0.001, + "loss": 1.4093, + "step": 151800 + }, + { + "epoch": 49.09502262443439, + "grad_norm": 2.258392333984375, + "learning_rate": 0.001, + "loss": 1.4131, + "step": 151900 + }, + { + "epoch": 49.12734324499031, + "grad_norm": 2.0753369331359863, + "learning_rate": 0.001, + "loss": 1.4137, + "step": 152000 + }, + { + "epoch": 49.15966386554622, + "grad_norm": 1.6378613710403442, + "learning_rate": 0.001, + "loss": 1.424, + "step": 152100 + }, + { + "epoch": 49.191984486102136, + "grad_norm": 7.034304618835449, + "learning_rate": 0.001, + "loss": 1.4069, + "step": 152200 + }, + { + "epoch": 49.22430510665805, + "grad_norm": 2.0201122760772705, + "learning_rate": 0.001, + "loss": 1.426, + "step": 152300 + }, + { + "epoch": 49.256625727213965, + "grad_norm": 1.9883705377578735, + "learning_rate": 0.001, + "loss": 1.4445, + "step": 152400 + }, + { + "epoch": 49.28894634776988, + "grad_norm": 2.0591297149658203, + "learning_rate": 0.001, + "loss": 1.43, + "step": 152500 + }, + { + "epoch": 49.321266968325794, + "grad_norm": 1.975419521331787, + "learning_rate": 0.001, + "loss": 1.4497, + "step": 152600 + }, + { + "epoch": 49.35358758888171, + "grad_norm": 2.1251235008239746, + "learning_rate": 0.001, + "loss": 1.465, + "step": 152700 + }, + { + "epoch": 49.38590820943762, + "grad_norm": 1.797031283378601, + "learning_rate": 0.001, + "loss": 1.441, + "step": 152800 + }, + { + "epoch": 49.41822882999354, + "grad_norm": 2.097074270248413, + "learning_rate": 0.001, + "loss": 1.4631, + "step": 152900 + }, + { + "epoch": 49.45054945054945, + "grad_norm": 1.9780620336532593, + "learning_rate": 0.001, + "loss": 1.4636, + "step": 153000 + }, + { + "epoch": 49.48287007110537, + "grad_norm": 1.8536884784698486, + "learning_rate": 0.001, + "loss": 1.4603, + "step": 153100 + }, + { + "epoch": 49.51519069166128, + "grad_norm": 2.2152280807495117, + "learning_rate": 0.001, + "loss": 1.4746, + "step": 153200 + }, + { + "epoch": 49.547511312217196, + "grad_norm": 2.028168201446533, + "learning_rate": 0.001, + "loss": 1.4809, + "step": 153300 + }, + { + "epoch": 49.57983193277311, + "grad_norm": 2.026210308074951, + "learning_rate": 0.001, + "loss": 1.4934, + "step": 153400 + }, + { + "epoch": 49.612152553329025, + "grad_norm": 1.8274730443954468, + "learning_rate": 0.001, + "loss": 1.4745, + "step": 153500 + }, + { + "epoch": 49.64447317388494, + "grad_norm": 1.7476829290390015, + "learning_rate": 0.001, + "loss": 1.4877, + "step": 153600 + }, + { + "epoch": 49.676793794440854, + "grad_norm": 1.8586665391921997, + "learning_rate": 0.001, + "loss": 1.4779, + "step": 153700 + }, + { + "epoch": 49.70911441499677, + "grad_norm": 2.142073154449463, + "learning_rate": 0.001, + "loss": 1.4904, + "step": 153800 + }, + { + "epoch": 49.74143503555268, + "grad_norm": 2.4163622856140137, + "learning_rate": 0.001, + "loss": 1.4913, + "step": 153900 + }, + { + "epoch": 49.7737556561086, + "grad_norm": 1.5314381122589111, + "learning_rate": 0.001, + "loss": 1.4888, + "step": 154000 + }, + { + "epoch": 49.80607627666451, + "grad_norm": 2.1272311210632324, + "learning_rate": 0.001, + "loss": 1.4829, + "step": 154100 + }, + { + "epoch": 49.83839689722043, + "grad_norm": 1.8081731796264648, + "learning_rate": 0.001, + "loss": 1.4969, + "step": 154200 + }, + { + "epoch": 49.87071751777634, + "grad_norm": 2.0165419578552246, + "learning_rate": 0.001, + "loss": 1.5123, + "step": 154300 + }, + { + "epoch": 49.903038138332256, + "grad_norm": 1.895053744316101, + "learning_rate": 0.001, + "loss": 1.5118, + "step": 154400 + }, + { + "epoch": 49.93535875888817, + "grad_norm": 1.836590051651001, + "learning_rate": 0.001, + "loss": 1.5071, + "step": 154500 + }, + { + "epoch": 49.967679379444085, + "grad_norm": 2.259945869445801, + "learning_rate": 0.001, + "loss": 1.5239, + "step": 154600 + }, + { + "epoch": 50.0, + "grad_norm": 2.8617236614227295, + "learning_rate": 0.001, + "loss": 1.4828, + "step": 154700 + }, + { + "epoch": 50.032320620555915, + "grad_norm": 2.7500596046447754, + "learning_rate": 0.001, + "loss": 1.3622, + "step": 154800 + }, + { + "epoch": 50.06464124111183, + "grad_norm": 2.843418598175049, + "learning_rate": 0.001, + "loss": 1.389, + "step": 154900 + }, + { + "epoch": 50.096961861667744, + "grad_norm": 1.911349892616272, + "learning_rate": 0.001, + "loss": 1.4045, + "step": 155000 + }, + { + "epoch": 50.12928248222366, + "grad_norm": 2.789196491241455, + "learning_rate": 0.001, + "loss": 1.4044, + "step": 155100 + }, + { + "epoch": 50.16160310277957, + "grad_norm": 2.3924665451049805, + "learning_rate": 0.001, + "loss": 1.4078, + "step": 155200 + }, + { + "epoch": 50.19392372333549, + "grad_norm": 2.2527916431427, + "learning_rate": 0.001, + "loss": 1.4179, + "step": 155300 + }, + { + "epoch": 50.2262443438914, + "grad_norm": 2.869682788848877, + "learning_rate": 0.001, + "loss": 1.4157, + "step": 155400 + }, + { + "epoch": 50.25856496444732, + "grad_norm": 3.076284646987915, + "learning_rate": 0.001, + "loss": 1.4194, + "step": 155500 + }, + { + "epoch": 50.29088558500323, + "grad_norm": 2.405968427658081, + "learning_rate": 0.001, + "loss": 1.4324, + "step": 155600 + }, + { + "epoch": 50.323206205559146, + "grad_norm": 2.4868760108947754, + "learning_rate": 0.001, + "loss": 1.4284, + "step": 155700 + }, + { + "epoch": 50.35552682611506, + "grad_norm": 2.351515769958496, + "learning_rate": 0.001, + "loss": 1.4396, + "step": 155800 + }, + { + "epoch": 50.387847446670975, + "grad_norm": 2.545591354370117, + "learning_rate": 0.001, + "loss": 1.4433, + "step": 155900 + }, + { + "epoch": 50.42016806722689, + "grad_norm": 2.52632212638855, + "learning_rate": 0.001, + "loss": 1.4548, + "step": 156000 + }, + { + "epoch": 50.452488687782804, + "grad_norm": 2.8453922271728516, + "learning_rate": 0.001, + "loss": 1.4365, + "step": 156100 + }, + { + "epoch": 50.48480930833872, + "grad_norm": 2.562619209289551, + "learning_rate": 0.001, + "loss": 1.4401, + "step": 156200 + }, + { + "epoch": 50.51712992889463, + "grad_norm": 2.5165834426879883, + "learning_rate": 0.001, + "loss": 1.4542, + "step": 156300 + }, + { + "epoch": 50.54945054945055, + "grad_norm": 2.4320249557495117, + "learning_rate": 0.001, + "loss": 1.4499, + "step": 156400 + }, + { + "epoch": 50.58177117000646, + "grad_norm": 2.7280805110931396, + "learning_rate": 0.001, + "loss": 1.4534, + "step": 156500 + }, + { + "epoch": 50.61409179056238, + "grad_norm": 2.658902645111084, + "learning_rate": 0.001, + "loss": 1.4678, + "step": 156600 + }, + { + "epoch": 50.64641241111829, + "grad_norm": 2.3166139125823975, + "learning_rate": 0.001, + "loss": 1.4596, + "step": 156700 + }, + { + "epoch": 50.678733031674206, + "grad_norm": 3.5382041931152344, + "learning_rate": 0.001, + "loss": 1.4593, + "step": 156800 + }, + { + "epoch": 50.71105365223012, + "grad_norm": 3.2525131702423096, + "learning_rate": 0.001, + "loss": 1.462, + "step": 156900 + }, + { + "epoch": 50.743374272786035, + "grad_norm": 2.6875829696655273, + "learning_rate": 0.001, + "loss": 1.5004, + "step": 157000 + }, + { + "epoch": 50.77569489334195, + "grad_norm": 2.6666595935821533, + "learning_rate": 0.001, + "loss": 1.473, + "step": 157100 + }, + { + "epoch": 50.808015513897864, + "grad_norm": 2.2740638256073, + "learning_rate": 0.001, + "loss": 1.4764, + "step": 157200 + }, + { + "epoch": 50.84033613445378, + "grad_norm": 2.457540273666382, + "learning_rate": 0.001, + "loss": 1.4869, + "step": 157300 + }, + { + "epoch": 50.87265675500969, + "grad_norm": 7.748457431793213, + "learning_rate": 0.001, + "loss": 1.4969, + "step": 157400 + }, + { + "epoch": 50.90497737556561, + "grad_norm": 2.187288284301758, + "learning_rate": 0.001, + "loss": 1.4913, + "step": 157500 + }, + { + "epoch": 50.93729799612152, + "grad_norm": 7.933531761169434, + "learning_rate": 0.001, + "loss": 1.5105, + "step": 157600 + }, + { + "epoch": 50.96961861667744, + "grad_norm": 2.370905637741089, + "learning_rate": 0.001, + "loss": 1.4957, + "step": 157700 + }, + { + "epoch": 51.00193923723336, + "grad_norm": 1.6068298816680908, + "learning_rate": 0.001, + "loss": 1.5156, + "step": 157800 + }, + { + "epoch": 51.03425985778927, + "grad_norm": 1.8159526586532593, + "learning_rate": 0.001, + "loss": 1.358, + "step": 157900 + }, + { + "epoch": 51.06658047834519, + "grad_norm": 1.58469820022583, + "learning_rate": 0.001, + "loss": 1.3614, + "step": 158000 + }, + { + "epoch": 51.0989010989011, + "grad_norm": 1.645398736000061, + "learning_rate": 0.001, + "loss": 1.3755, + "step": 158100 + }, + { + "epoch": 51.13122171945702, + "grad_norm": 2.198871612548828, + "learning_rate": 0.001, + "loss": 1.3954, + "step": 158200 + }, + { + "epoch": 51.16354234001293, + "grad_norm": 1.8951984643936157, + "learning_rate": 0.001, + "loss": 1.3857, + "step": 158300 + }, + { + "epoch": 51.195862960568846, + "grad_norm": 1.9388495683670044, + "learning_rate": 0.001, + "loss": 1.3751, + "step": 158400 + }, + { + "epoch": 51.22818358112476, + "grad_norm": 1.894666075706482, + "learning_rate": 0.001, + "loss": 1.414, + "step": 158500 + }, + { + "epoch": 51.260504201680675, + "grad_norm": 2.324024200439453, + "learning_rate": 0.001, + "loss": 1.4045, + "step": 158600 + }, + { + "epoch": 51.29282482223659, + "grad_norm": 2.4311294555664062, + "learning_rate": 0.001, + "loss": 1.4152, + "step": 158700 + }, + { + "epoch": 51.325145442792504, + "grad_norm": 1.6439367532730103, + "learning_rate": 0.001, + "loss": 1.4066, + "step": 158800 + }, + { + "epoch": 51.35746606334842, + "grad_norm": 1.458225131034851, + "learning_rate": 0.001, + "loss": 1.4157, + "step": 158900 + }, + { + "epoch": 51.38978668390433, + "grad_norm": 1.7206692695617676, + "learning_rate": 0.001, + "loss": 1.4376, + "step": 159000 + }, + { + "epoch": 51.42210730446025, + "grad_norm": 2.1803271770477295, + "learning_rate": 0.001, + "loss": 1.4167, + "step": 159100 + }, + { + "epoch": 51.45442792501616, + "grad_norm": 1.9700300693511963, + "learning_rate": 0.001, + "loss": 1.4532, + "step": 159200 + }, + { + "epoch": 51.48674854557208, + "grad_norm": 1.7611888647079468, + "learning_rate": 0.001, + "loss": 1.4344, + "step": 159300 + }, + { + "epoch": 51.51906916612799, + "grad_norm": 1.614646553993225, + "learning_rate": 0.001, + "loss": 1.4172, + "step": 159400 + }, + { + "epoch": 51.551389786683906, + "grad_norm": 1.9561604261398315, + "learning_rate": 0.001, + "loss": 1.4543, + "step": 159500 + }, + { + "epoch": 51.58371040723982, + "grad_norm": 2.4098892211914062, + "learning_rate": 0.001, + "loss": 1.4481, + "step": 159600 + }, + { + "epoch": 51.616031027795735, + "grad_norm": 1.6354035139083862, + "learning_rate": 0.001, + "loss": 1.4402, + "step": 159700 + }, + { + "epoch": 51.64835164835165, + "grad_norm": 1.5231209993362427, + "learning_rate": 0.001, + "loss": 1.4591, + "step": 159800 + }, + { + "epoch": 51.680672268907564, + "grad_norm": 1.4801390171051025, + "learning_rate": 0.001, + "loss": 1.4647, + "step": 159900 + }, + { + "epoch": 51.71299288946348, + "grad_norm": 1.897993803024292, + "learning_rate": 0.001, + "loss": 1.4731, + "step": 160000 + }, + { + "epoch": 51.74531351001939, + "grad_norm": 1.9183740615844727, + "learning_rate": 0.001, + "loss": 1.4652, + "step": 160100 + }, + { + "epoch": 51.77763413057531, + "grad_norm": 1.6743416786193848, + "learning_rate": 0.001, + "loss": 1.468, + "step": 160200 + }, + { + "epoch": 51.80995475113122, + "grad_norm": 1.5103219747543335, + "learning_rate": 0.001, + "loss": 1.4753, + "step": 160300 + }, + { + "epoch": 51.84227537168714, + "grad_norm": 1.8089489936828613, + "learning_rate": 0.001, + "loss": 1.4642, + "step": 160400 + }, + { + "epoch": 51.87459599224305, + "grad_norm": 1.889352560043335, + "learning_rate": 0.001, + "loss": 1.4753, + "step": 160500 + }, + { + "epoch": 51.906916612798966, + "grad_norm": 1.892152190208435, + "learning_rate": 0.001, + "loss": 1.4855, + "step": 160600 + }, + { + "epoch": 51.93923723335488, + "grad_norm": 2.1310675144195557, + "learning_rate": 0.001, + "loss": 1.4741, + "step": 160700 + }, + { + "epoch": 51.971557853910795, + "grad_norm": 1.6801360845565796, + "learning_rate": 0.001, + "loss": 1.4823, + "step": 160800 + }, + { + "epoch": 52.00387847446671, + "grad_norm": 1.5871704816818237, + "learning_rate": 0.001, + "loss": 1.4805, + "step": 160900 + }, + { + "epoch": 52.036199095022624, + "grad_norm": 1.765568494796753, + "learning_rate": 0.001, + "loss": 1.3575, + "step": 161000 + }, + { + "epoch": 52.06851971557854, + "grad_norm": 1.6732524633407593, + "learning_rate": 0.001, + "loss": 1.3625, + "step": 161100 + }, + { + "epoch": 52.10084033613445, + "grad_norm": 2.0522379875183105, + "learning_rate": 0.001, + "loss": 1.3777, + "step": 161200 + }, + { + "epoch": 52.13316095669037, + "grad_norm": 1.4788262844085693, + "learning_rate": 0.001, + "loss": 1.3684, + "step": 161300 + }, + { + "epoch": 52.16548157724628, + "grad_norm": 1.94754159450531, + "learning_rate": 0.001, + "loss": 1.364, + "step": 161400 + }, + { + "epoch": 52.1978021978022, + "grad_norm": 1.6368705034255981, + "learning_rate": 0.001, + "loss": 1.3813, + "step": 161500 + }, + { + "epoch": 52.23012281835811, + "grad_norm": 1.5317388772964478, + "learning_rate": 0.001, + "loss": 1.392, + "step": 161600 + }, + { + "epoch": 52.262443438914026, + "grad_norm": 1.593577265739441, + "learning_rate": 0.001, + "loss": 1.3962, + "step": 161700 + }, + { + "epoch": 52.29476405946994, + "grad_norm": 1.7241520881652832, + "learning_rate": 0.001, + "loss": 1.3977, + "step": 161800 + }, + { + "epoch": 52.327084680025855, + "grad_norm": 1.9638442993164062, + "learning_rate": 0.001, + "loss": 1.3956, + "step": 161900 + }, + { + "epoch": 52.35940530058177, + "grad_norm": 1.7215887308120728, + "learning_rate": 0.001, + "loss": 1.4052, + "step": 162000 + }, + { + "epoch": 52.391725921137684, + "grad_norm": 1.7366405725479126, + "learning_rate": 0.001, + "loss": 1.4169, + "step": 162100 + }, + { + "epoch": 52.4240465416936, + "grad_norm": 2.66810941696167, + "learning_rate": 0.001, + "loss": 1.4099, + "step": 162200 + }, + { + "epoch": 52.456367162249514, + "grad_norm": 1.9656466245651245, + "learning_rate": 0.001, + "loss": 1.4154, + "step": 162300 + }, + { + "epoch": 52.48868778280543, + "grad_norm": 1.7060424089431763, + "learning_rate": 0.001, + "loss": 1.4269, + "step": 162400 + }, + { + "epoch": 52.52100840336134, + "grad_norm": 1.6961543560028076, + "learning_rate": 0.001, + "loss": 1.4199, + "step": 162500 + }, + { + "epoch": 52.55332902391726, + "grad_norm": 1.670259952545166, + "learning_rate": 0.001, + "loss": 1.4225, + "step": 162600 + }, + { + "epoch": 52.58564964447317, + "grad_norm": 1.4814908504486084, + "learning_rate": 0.001, + "loss": 1.4246, + "step": 162700 + }, + { + "epoch": 52.617970265029086, + "grad_norm": 1.7170677185058594, + "learning_rate": 0.001, + "loss": 1.4303, + "step": 162800 + }, + { + "epoch": 52.650290885585, + "grad_norm": 1.9022291898727417, + "learning_rate": 0.001, + "loss": 1.4298, + "step": 162900 + }, + { + "epoch": 52.682611506140915, + "grad_norm": 1.8609496355056763, + "learning_rate": 0.001, + "loss": 1.4399, + "step": 163000 + }, + { + "epoch": 52.71493212669683, + "grad_norm": 1.6735694408416748, + "learning_rate": 0.001, + "loss": 1.4488, + "step": 163100 + }, + { + "epoch": 52.747252747252745, + "grad_norm": 1.5999531745910645, + "learning_rate": 0.001, + "loss": 1.4326, + "step": 163200 + }, + { + "epoch": 52.77957336780866, + "grad_norm": 1.8553581237792969, + "learning_rate": 0.001, + "loss": 1.4322, + "step": 163300 + }, + { + "epoch": 52.811893988364574, + "grad_norm": 1.971063256263733, + "learning_rate": 0.001, + "loss": 1.4361, + "step": 163400 + }, + { + "epoch": 52.84421460892049, + "grad_norm": 1.682065725326538, + "learning_rate": 0.001, + "loss": 1.4516, + "step": 163500 + }, + { + "epoch": 52.8765352294764, + "grad_norm": 1.498920202255249, + "learning_rate": 0.001, + "loss": 1.4783, + "step": 163600 + }, + { + "epoch": 52.90885585003232, + "grad_norm": 2.0326061248779297, + "learning_rate": 0.001, + "loss": 1.4666, + "step": 163700 + }, + { + "epoch": 52.94117647058823, + "grad_norm": 1.505751609802246, + "learning_rate": 0.001, + "loss": 1.4689, + "step": 163800 + }, + { + "epoch": 52.97349709114415, + "grad_norm": 1.652345895767212, + "learning_rate": 0.001, + "loss": 1.4698, + "step": 163900 + }, + { + "epoch": 53.00581771170007, + "grad_norm": 1.6074447631835938, + "learning_rate": 0.001, + "loss": 1.4543, + "step": 164000 + }, + { + "epoch": 53.03813833225598, + "grad_norm": 1.6570724248886108, + "learning_rate": 0.001, + "loss": 1.3409, + "step": 164100 + }, + { + "epoch": 53.0704589528119, + "grad_norm": 1.9990196228027344, + "learning_rate": 0.001, + "loss": 1.3491, + "step": 164200 + }, + { + "epoch": 53.10277957336781, + "grad_norm": 1.4702783823013306, + "learning_rate": 0.001, + "loss": 1.3252, + "step": 164300 + }, + { + "epoch": 53.135100193923726, + "grad_norm": 2.1267101764678955, + "learning_rate": 0.001, + "loss": 1.3418, + "step": 164400 + }, + { + "epoch": 53.16742081447964, + "grad_norm": 1.68787682056427, + "learning_rate": 0.001, + "loss": 1.3588, + "step": 164500 + }, + { + "epoch": 53.199741435035556, + "grad_norm": 1.6803096532821655, + "learning_rate": 0.001, + "loss": 1.3685, + "step": 164600 + }, + { + "epoch": 53.23206205559147, + "grad_norm": 1.3000693321228027, + "learning_rate": 0.001, + "loss": 1.3675, + "step": 164700 + }, + { + "epoch": 53.264382676147385, + "grad_norm": 1.6280704736709595, + "learning_rate": 0.001, + "loss": 1.3659, + "step": 164800 + }, + { + "epoch": 53.2967032967033, + "grad_norm": 1.7576541900634766, + "learning_rate": 0.001, + "loss": 1.383, + "step": 164900 + }, + { + "epoch": 53.329023917259214, + "grad_norm": 1.8576921224594116, + "learning_rate": 0.001, + "loss": 1.373, + "step": 165000 + }, + { + "epoch": 53.36134453781513, + "grad_norm": 1.5620722770690918, + "learning_rate": 0.001, + "loss": 1.3778, + "step": 165100 + }, + { + "epoch": 53.39366515837104, + "grad_norm": 1.6440199613571167, + "learning_rate": 0.001, + "loss": 1.413, + "step": 165200 + }, + { + "epoch": 53.42598577892696, + "grad_norm": 2.071763038635254, + "learning_rate": 0.001, + "loss": 1.3942, + "step": 165300 + }, + { + "epoch": 53.45830639948287, + "grad_norm": 1.619279384613037, + "learning_rate": 0.001, + "loss": 1.4029, + "step": 165400 + }, + { + "epoch": 53.49062702003879, + "grad_norm": 1.8446378707885742, + "learning_rate": 0.001, + "loss": 1.398, + "step": 165500 + }, + { + "epoch": 53.5229476405947, + "grad_norm": 6.851656913757324, + "learning_rate": 0.001, + "loss": 1.406, + "step": 165600 + }, + { + "epoch": 53.555268261150616, + "grad_norm": 1.4749475717544556, + "learning_rate": 0.001, + "loss": 1.4049, + "step": 165700 + }, + { + "epoch": 53.58758888170653, + "grad_norm": 1.752159595489502, + "learning_rate": 0.001, + "loss": 1.3914, + "step": 165800 + }, + { + "epoch": 53.619909502262445, + "grad_norm": 1.5363788604736328, + "learning_rate": 0.001, + "loss": 1.4229, + "step": 165900 + }, + { + "epoch": 53.65223012281836, + "grad_norm": 1.6976779699325562, + "learning_rate": 0.001, + "loss": 1.4316, + "step": 166000 + }, + { + "epoch": 53.684550743374274, + "grad_norm": 1.7210675477981567, + "learning_rate": 0.001, + "loss": 1.4234, + "step": 166100 + }, + { + "epoch": 53.71687136393019, + "grad_norm": 1.6245074272155762, + "learning_rate": 0.001, + "loss": 1.4233, + "step": 166200 + }, + { + "epoch": 53.7491919844861, + "grad_norm": 1.5693413019180298, + "learning_rate": 0.001, + "loss": 1.4393, + "step": 166300 + }, + { + "epoch": 53.78151260504202, + "grad_norm": 1.9812067747116089, + "learning_rate": 0.001, + "loss": 1.4486, + "step": 166400 + }, + { + "epoch": 53.81383322559793, + "grad_norm": 1.4747971296310425, + "learning_rate": 0.001, + "loss": 1.4399, + "step": 166500 + }, + { + "epoch": 53.84615384615385, + "grad_norm": 1.851163625717163, + "learning_rate": 0.001, + "loss": 1.4454, + "step": 166600 + }, + { + "epoch": 53.87847446670976, + "grad_norm": 2.2305819988250732, + "learning_rate": 0.001, + "loss": 1.4636, + "step": 166700 + }, + { + "epoch": 53.910795087265676, + "grad_norm": 1.541678547859192, + "learning_rate": 0.001, + "loss": 1.4269, + "step": 166800 + }, + { + "epoch": 53.94311570782159, + "grad_norm": 1.715518593788147, + "learning_rate": 0.001, + "loss": 1.4537, + "step": 166900 + }, + { + "epoch": 53.975436328377505, + "grad_norm": 1.4480615854263306, + "learning_rate": 0.001, + "loss": 1.4655, + "step": 167000 + }, + { + "epoch": 54.00775694893342, + "grad_norm": 1.692636251449585, + "learning_rate": 0.001, + "loss": 1.4344, + "step": 167100 + }, + { + "epoch": 54.040077569489334, + "grad_norm": 1.7430903911590576, + "learning_rate": 0.001, + "loss": 1.3191, + "step": 167200 + }, + { + "epoch": 54.07239819004525, + "grad_norm": 1.308061957359314, + "learning_rate": 0.001, + "loss": 1.3301, + "step": 167300 + }, + { + "epoch": 54.10471881060116, + "grad_norm": 1.7454525232315063, + "learning_rate": 0.001, + "loss": 1.3403, + "step": 167400 + }, + { + "epoch": 54.13703943115708, + "grad_norm": 1.5327422618865967, + "learning_rate": 0.001, + "loss": 1.3231, + "step": 167500 + }, + { + "epoch": 54.16936005171299, + "grad_norm": 1.5026171207427979, + "learning_rate": 0.001, + "loss": 1.3433, + "step": 167600 + }, + { + "epoch": 54.20168067226891, + "grad_norm": 1.3935140371322632, + "learning_rate": 0.001, + "loss": 1.3591, + "step": 167700 + }, + { + "epoch": 54.23400129282482, + "grad_norm": 1.617540717124939, + "learning_rate": 0.001, + "loss": 1.3346, + "step": 167800 + }, + { + "epoch": 54.266321913380736, + "grad_norm": 2.285799741744995, + "learning_rate": 0.001, + "loss": 1.3603, + "step": 167900 + }, + { + "epoch": 54.29864253393665, + "grad_norm": 1.4892338514328003, + "learning_rate": 0.001, + "loss": 1.3933, + "step": 168000 + }, + { + "epoch": 54.330963154492565, + "grad_norm": 1.4886034727096558, + "learning_rate": 0.001, + "loss": 1.3704, + "step": 168100 + }, + { + "epoch": 54.36328377504848, + "grad_norm": 1.4371678829193115, + "learning_rate": 0.001, + "loss": 1.3819, + "step": 168200 + }, + { + "epoch": 54.395604395604394, + "grad_norm": 2.0230281352996826, + "learning_rate": 0.001, + "loss": 1.3826, + "step": 168300 + }, + { + "epoch": 54.42792501616031, + "grad_norm": 1.5713683366775513, + "learning_rate": 0.001, + "loss": 1.3671, + "step": 168400 + }, + { + "epoch": 54.46024563671622, + "grad_norm": 1.58280348777771, + "learning_rate": 0.001, + "loss": 1.3767, + "step": 168500 + }, + { + "epoch": 54.49256625727214, + "grad_norm": 1.4634507894515991, + "learning_rate": 0.001, + "loss": 1.3916, + "step": 168600 + }, + { + "epoch": 54.52488687782805, + "grad_norm": 1.4438565969467163, + "learning_rate": 0.001, + "loss": 1.4001, + "step": 168700 + }, + { + "epoch": 54.55720749838397, + "grad_norm": 1.6048426628112793, + "learning_rate": 0.001, + "loss": 1.3831, + "step": 168800 + }, + { + "epoch": 54.58952811893988, + "grad_norm": 1.7846475839614868, + "learning_rate": 0.001, + "loss": 1.3927, + "step": 168900 + }, + { + "epoch": 54.621848739495796, + "grad_norm": 1.8997995853424072, + "learning_rate": 0.001, + "loss": 1.4055, + "step": 169000 + }, + { + "epoch": 54.65416936005171, + "grad_norm": 1.6765133142471313, + "learning_rate": 0.001, + "loss": 1.4073, + "step": 169100 + }, + { + "epoch": 54.686489980607625, + "grad_norm": 1.3951281309127808, + "learning_rate": 0.001, + "loss": 1.4121, + "step": 169200 + }, + { + "epoch": 54.71881060116354, + "grad_norm": 1.5928879976272583, + "learning_rate": 0.001, + "loss": 1.425, + "step": 169300 + }, + { + "epoch": 54.751131221719454, + "grad_norm": 1.5925168991088867, + "learning_rate": 0.001, + "loss": 1.4258, + "step": 169400 + }, + { + "epoch": 54.78345184227537, + "grad_norm": 1.8010461330413818, + "learning_rate": 0.001, + "loss": 1.4312, + "step": 169500 + }, + { + "epoch": 54.81577246283128, + "grad_norm": 2.023576259613037, + "learning_rate": 0.001, + "loss": 1.4175, + "step": 169600 + }, + { + "epoch": 54.8480930833872, + "grad_norm": 1.5003081560134888, + "learning_rate": 0.001, + "loss": 1.445, + "step": 169700 + }, + { + "epoch": 54.88041370394311, + "grad_norm": 1.3284757137298584, + "learning_rate": 0.001, + "loss": 1.4262, + "step": 169800 + }, + { + "epoch": 54.91273432449903, + "grad_norm": 1.90644371509552, + "learning_rate": 0.001, + "loss": 1.42, + "step": 169900 + }, + { + "epoch": 54.94505494505494, + "grad_norm": 2.487614393234253, + "learning_rate": 0.001, + "loss": 1.4205, + "step": 170000 + }, + { + "epoch": 54.977375565610856, + "grad_norm": 2.273512601852417, + "learning_rate": 0.001, + "loss": 1.4401, + "step": 170100 + }, + { + "epoch": 55.00969618616678, + "grad_norm": 1.6120694875717163, + "learning_rate": 0.001, + "loss": 1.3946, + "step": 170200 + }, + { + "epoch": 55.04201680672269, + "grad_norm": 1.6807186603546143, + "learning_rate": 0.001, + "loss": 1.3055, + "step": 170300 + }, + { + "epoch": 55.07433742727861, + "grad_norm": 1.9705253839492798, + "learning_rate": 0.001, + "loss": 1.318, + "step": 170400 + }, + { + "epoch": 55.10665804783452, + "grad_norm": 1.604475498199463, + "learning_rate": 0.001, + "loss": 1.3274, + "step": 170500 + }, + { + "epoch": 55.138978668390436, + "grad_norm": 6.541220664978027, + "learning_rate": 0.001, + "loss": 1.332, + "step": 170600 + }, + { + "epoch": 55.17129928894635, + "grad_norm": 2.1754775047302246, + "learning_rate": 0.001, + "loss": 1.3366, + "step": 170700 + }, + { + "epoch": 55.203619909502265, + "grad_norm": 1.566157341003418, + "learning_rate": 0.001, + "loss": 1.3424, + "step": 170800 + }, + { + "epoch": 55.23594053005818, + "grad_norm": 1.5534968376159668, + "learning_rate": 0.001, + "loss": 1.3364, + "step": 170900 + }, + { + "epoch": 55.268261150614094, + "grad_norm": 2.1709282398223877, + "learning_rate": 0.001, + "loss": 1.3536, + "step": 171000 + }, + { + "epoch": 55.30058177117001, + "grad_norm": 1.6569381952285767, + "learning_rate": 0.001, + "loss": 1.3321, + "step": 171100 + }, + { + "epoch": 55.33290239172592, + "grad_norm": 1.758289098739624, + "learning_rate": 0.001, + "loss": 1.3522, + "step": 171200 + }, + { + "epoch": 55.36522301228184, + "grad_norm": 1.7878657579421997, + "learning_rate": 0.001, + "loss": 1.3564, + "step": 171300 + }, + { + "epoch": 55.39754363283775, + "grad_norm": 1.8341219425201416, + "learning_rate": 0.001, + "loss": 1.3519, + "step": 171400 + }, + { + "epoch": 55.42986425339367, + "grad_norm": 1.522491216659546, + "learning_rate": 0.001, + "loss": 1.3586, + "step": 171500 + }, + { + "epoch": 55.46218487394958, + "grad_norm": 2.06044864654541, + "learning_rate": 0.001, + "loss": 1.3746, + "step": 171600 + }, + { + "epoch": 55.494505494505496, + "grad_norm": 1.9041239023208618, + "learning_rate": 0.001, + "loss": 1.3811, + "step": 171700 + }, + { + "epoch": 55.52682611506141, + "grad_norm": 1.7315250635147095, + "learning_rate": 0.001, + "loss": 1.3812, + "step": 171800 + }, + { + "epoch": 55.559146735617325, + "grad_norm": 1.8228223323822021, + "learning_rate": 0.001, + "loss": 1.3803, + "step": 171900 + }, + { + "epoch": 55.59146735617324, + "grad_norm": 1.6775150299072266, + "learning_rate": 0.001, + "loss": 1.395, + "step": 172000 + }, + { + "epoch": 55.623787976729155, + "grad_norm": 1.7854257822036743, + "learning_rate": 0.001, + "loss": 1.3913, + "step": 172100 + }, + { + "epoch": 55.65610859728507, + "grad_norm": 1.8913499116897583, + "learning_rate": 0.001, + "loss": 1.3857, + "step": 172200 + }, + { + "epoch": 55.688429217840984, + "grad_norm": 1.7007423639297485, + "learning_rate": 0.001, + "loss": 1.411, + "step": 172300 + }, + { + "epoch": 55.7207498383969, + "grad_norm": 1.6376309394836426, + "learning_rate": 0.001, + "loss": 1.3868, + "step": 172400 + }, + { + "epoch": 55.75307045895281, + "grad_norm": 1.9880905151367188, + "learning_rate": 0.001, + "loss": 1.4165, + "step": 172500 + }, + { + "epoch": 55.78539107950873, + "grad_norm": 1.8876042366027832, + "learning_rate": 0.001, + "loss": 1.3944, + "step": 172600 + }, + { + "epoch": 55.81771170006464, + "grad_norm": 1.67818284034729, + "learning_rate": 0.001, + "loss": 1.4349, + "step": 172700 + }, + { + "epoch": 55.85003232062056, + "grad_norm": 1.440558671951294, + "learning_rate": 0.001, + "loss": 1.4127, + "step": 172800 + }, + { + "epoch": 55.88235294117647, + "grad_norm": 1.8764231204986572, + "learning_rate": 0.001, + "loss": 1.4169, + "step": 172900 + }, + { + "epoch": 55.914673561732386, + "grad_norm": 2.102301597595215, + "learning_rate": 0.001, + "loss": 1.4181, + "step": 173000 + }, + { + "epoch": 55.9469941822883, + "grad_norm": 1.709457278251648, + "learning_rate": 0.001, + "loss": 1.4222, + "step": 173100 + }, + { + "epoch": 55.979314802844215, + "grad_norm": 1.5915135145187378, + "learning_rate": 0.001, + "loss": 1.4292, + "step": 173200 + }, + { + "epoch": 56.01163542340013, + "grad_norm": 2.3316972255706787, + "learning_rate": 0.001, + "loss": 1.378, + "step": 173300 + }, + { + "epoch": 56.043956043956044, + "grad_norm": 2.285443067550659, + "learning_rate": 0.001, + "loss": 1.2946, + "step": 173400 + }, + { + "epoch": 56.07627666451196, + "grad_norm": 1.96236252784729, + "learning_rate": 0.001, + "loss": 1.302, + "step": 173500 + }, + { + "epoch": 56.10859728506787, + "grad_norm": 2.4459619522094727, + "learning_rate": 0.001, + "loss": 1.304, + "step": 173600 + }, + { + "epoch": 56.14091790562379, + "grad_norm": 2.265803813934326, + "learning_rate": 0.001, + "loss": 1.3218, + "step": 173700 + }, + { + "epoch": 56.1732385261797, + "grad_norm": 1.7974573373794556, + "learning_rate": 0.001, + "loss": 1.3045, + "step": 173800 + }, + { + "epoch": 56.20555914673562, + "grad_norm": 1.8836841583251953, + "learning_rate": 0.001, + "loss": 1.3102, + "step": 173900 + }, + { + "epoch": 56.23787976729153, + "grad_norm": 1.8796806335449219, + "learning_rate": 0.001, + "loss": 1.3573, + "step": 174000 + }, + { + "epoch": 56.270200387847446, + "grad_norm": 1.8522303104400635, + "learning_rate": 0.001, + "loss": 1.3435, + "step": 174100 + }, + { + "epoch": 56.30252100840336, + "grad_norm": 1.5742342472076416, + "learning_rate": 0.001, + "loss": 1.3479, + "step": 174200 + }, + { + "epoch": 56.334841628959275, + "grad_norm": 1.655555248260498, + "learning_rate": 0.001, + "loss": 1.3261, + "step": 174300 + }, + { + "epoch": 56.36716224951519, + "grad_norm": 1.8719924688339233, + "learning_rate": 0.001, + "loss": 1.3298, + "step": 174400 + }, + { + "epoch": 56.399482870071104, + "grad_norm": 1.8895421028137207, + "learning_rate": 0.001, + "loss": 1.3586, + "step": 174500 + }, + { + "epoch": 56.43180349062702, + "grad_norm": 1.98106849193573, + "learning_rate": 0.001, + "loss": 1.3609, + "step": 174600 + }, + { + "epoch": 56.46412411118293, + "grad_norm": 1.809706687927246, + "learning_rate": 0.001, + "loss": 1.3688, + "step": 174700 + }, + { + "epoch": 56.49644473173885, + "grad_norm": 1.962716817855835, + "learning_rate": 0.001, + "loss": 1.3798, + "step": 174800 + }, + { + "epoch": 56.52876535229476, + "grad_norm": 2.347630500793457, + "learning_rate": 0.001, + "loss": 1.3793, + "step": 174900 + }, + { + "epoch": 56.56108597285068, + "grad_norm": 1.7523319721221924, + "learning_rate": 0.001, + "loss": 1.3641, + "step": 175000 + }, + { + "epoch": 56.59340659340659, + "grad_norm": 2.0193288326263428, + "learning_rate": 0.001, + "loss": 1.3682, + "step": 175100 + }, + { + "epoch": 56.625727213962506, + "grad_norm": 1.7714121341705322, + "learning_rate": 0.001, + "loss": 1.3583, + "step": 175200 + }, + { + "epoch": 56.65804783451842, + "grad_norm": 1.6798348426818848, + "learning_rate": 0.001, + "loss": 1.371, + "step": 175300 + }, + { + "epoch": 56.690368455074335, + "grad_norm": 1.8950109481811523, + "learning_rate": 0.001, + "loss": 1.3803, + "step": 175400 + }, + { + "epoch": 56.72268907563025, + "grad_norm": 2.079096555709839, + "learning_rate": 0.001, + "loss": 1.383, + "step": 175500 + }, + { + "epoch": 56.755009696186164, + "grad_norm": 1.8894920349121094, + "learning_rate": 0.001, + "loss": 1.3906, + "step": 175600 + }, + { + "epoch": 56.78733031674208, + "grad_norm": 1.8664498329162598, + "learning_rate": 0.001, + "loss": 1.3867, + "step": 175700 + }, + { + "epoch": 56.81965093729799, + "grad_norm": 1.9517700672149658, + "learning_rate": 0.001, + "loss": 1.4043, + "step": 175800 + }, + { + "epoch": 56.85197155785391, + "grad_norm": 1.8154116868972778, + "learning_rate": 0.001, + "loss": 1.4082, + "step": 175900 + }, + { + "epoch": 56.88429217840982, + "grad_norm": 1.7526671886444092, + "learning_rate": 0.001, + "loss": 1.4217, + "step": 176000 + }, + { + "epoch": 56.91661279896574, + "grad_norm": 1.6760896444320679, + "learning_rate": 0.001, + "loss": 1.411, + "step": 176100 + }, + { + "epoch": 56.94893341952165, + "grad_norm": 1.8976603746414185, + "learning_rate": 0.001, + "loss": 1.4064, + "step": 176200 + }, + { + "epoch": 56.981254040077566, + "grad_norm": 1.5558828115463257, + "learning_rate": 0.001, + "loss": 1.4082, + "step": 176300 + }, + { + "epoch": 57.01357466063349, + "grad_norm": 1.8569399118423462, + "learning_rate": 0.001, + "loss": 1.3491, + "step": 176400 + }, + { + "epoch": 57.0458952811894, + "grad_norm": 2.1090731620788574, + "learning_rate": 0.001, + "loss": 1.2905, + "step": 176500 + }, + { + "epoch": 57.07821590174532, + "grad_norm": 1.87632417678833, + "learning_rate": 0.001, + "loss": 1.2937, + "step": 176600 + }, + { + "epoch": 57.11053652230123, + "grad_norm": 2.033785820007324, + "learning_rate": 0.001, + "loss": 1.2854, + "step": 176700 + }, + { + "epoch": 57.142857142857146, + "grad_norm": 3.1014299392700195, + "learning_rate": 0.001, + "loss": 1.3014, + "step": 176800 + }, + { + "epoch": 57.17517776341306, + "grad_norm": 1.5487799644470215, + "learning_rate": 0.001, + "loss": 1.2949, + "step": 176900 + }, + { + "epoch": 57.207498383968975, + "grad_norm": 1.8203353881835938, + "learning_rate": 0.001, + "loss": 1.3163, + "step": 177000 + }, + { + "epoch": 57.23981900452489, + "grad_norm": 2.1298274993896484, + "learning_rate": 0.001, + "loss": 1.3148, + "step": 177100 + }, + { + "epoch": 57.272139625080804, + "grad_norm": 2.3572335243225098, + "learning_rate": 0.001, + "loss": 1.3131, + "step": 177200 + }, + { + "epoch": 57.30446024563672, + "grad_norm": 1.9870686531066895, + "learning_rate": 0.001, + "loss": 1.3295, + "step": 177300 + }, + { + "epoch": 57.33678086619263, + "grad_norm": 1.9002041816711426, + "learning_rate": 0.001, + "loss": 1.3113, + "step": 177400 + }, + { + "epoch": 57.36910148674855, + "grad_norm": 2.0429224967956543, + "learning_rate": 0.001, + "loss": 1.3369, + "step": 177500 + }, + { + "epoch": 57.40142210730446, + "grad_norm": 2.353167772293091, + "learning_rate": 0.001, + "loss": 1.3369, + "step": 177600 + }, + { + "epoch": 57.43374272786038, + "grad_norm": 1.6245406866073608, + "learning_rate": 0.001, + "loss": 1.3467, + "step": 177700 + }, + { + "epoch": 57.46606334841629, + "grad_norm": 1.77859628200531, + "learning_rate": 0.001, + "loss": 1.3399, + "step": 177800 + }, + { + "epoch": 57.498383968972206, + "grad_norm": 1.8216441869735718, + "learning_rate": 0.001, + "loss": 1.3448, + "step": 177900 + }, + { + "epoch": 57.53070458952812, + "grad_norm": 1.8462305068969727, + "learning_rate": 0.001, + "loss": 1.3505, + "step": 178000 + }, + { + "epoch": 57.563025210084035, + "grad_norm": 2.081976890563965, + "learning_rate": 0.001, + "loss": 1.3565, + "step": 178100 + }, + { + "epoch": 57.59534583063995, + "grad_norm": 1.996206521987915, + "learning_rate": 0.001, + "loss": 1.3618, + "step": 178200 + }, + { + "epoch": 57.627666451195864, + "grad_norm": 2.0252761840820312, + "learning_rate": 0.001, + "loss": 1.3762, + "step": 178300 + }, + { + "epoch": 57.65998707175178, + "grad_norm": 1.6856194734573364, + "learning_rate": 0.001, + "loss": 1.3561, + "step": 178400 + }, + { + "epoch": 57.69230769230769, + "grad_norm": 1.955251693725586, + "learning_rate": 0.001, + "loss": 1.3857, + "step": 178500 + }, + { + "epoch": 57.72462831286361, + "grad_norm": 1.9201545715332031, + "learning_rate": 0.001, + "loss": 1.3897, + "step": 178600 + }, + { + "epoch": 57.75694893341952, + "grad_norm": 2.1210641860961914, + "learning_rate": 0.001, + "loss": 1.3676, + "step": 178700 + }, + { + "epoch": 57.78926955397544, + "grad_norm": 1.891424536705017, + "learning_rate": 0.001, + "loss": 1.3912, + "step": 178800 + }, + { + "epoch": 57.82159017453135, + "grad_norm": 1.859992504119873, + "learning_rate": 0.001, + "loss": 1.3864, + "step": 178900 + }, + { + "epoch": 57.853910795087266, + "grad_norm": 2.3439786434173584, + "learning_rate": 0.001, + "loss": 1.3738, + "step": 179000 + }, + { + "epoch": 57.88623141564318, + "grad_norm": 1.835242748260498, + "learning_rate": 0.001, + "loss": 1.3923, + "step": 179100 + }, + { + "epoch": 57.918552036199095, + "grad_norm": 2.018841028213501, + "learning_rate": 0.001, + "loss": 1.3933, + "step": 179200 + }, + { + "epoch": 57.95087265675501, + "grad_norm": 2.056886672973633, + "learning_rate": 0.001, + "loss": 1.3979, + "step": 179300 + }, + { + "epoch": 57.983193277310924, + "grad_norm": 2.031996965408325, + "learning_rate": 0.001, + "loss": 1.3989, + "step": 179400 + }, + { + "epoch": 58.01551389786684, + "grad_norm": 1.9441972970962524, + "learning_rate": 0.001, + "loss": 1.3036, + "step": 179500 + }, + { + "epoch": 58.04783451842275, + "grad_norm": 2.2586889266967773, + "learning_rate": 0.001, + "loss": 1.2679, + "step": 179600 + }, + { + "epoch": 58.08015513897867, + "grad_norm": 2.025006055831909, + "learning_rate": 0.001, + "loss": 1.2601, + "step": 179700 + }, + { + "epoch": 58.11247575953458, + "grad_norm": 4.384105682373047, + "learning_rate": 0.001, + "loss": 1.2748, + "step": 179800 + }, + { + "epoch": 58.1447963800905, + "grad_norm": 1.9416863918304443, + "learning_rate": 0.001, + "loss": 1.2984, + "step": 179900 + }, + { + "epoch": 58.17711700064641, + "grad_norm": 2.0471954345703125, + "learning_rate": 0.001, + "loss": 1.2953, + "step": 180000 + }, + { + "epoch": 58.209437621202326, + "grad_norm": 1.9570256471633911, + "learning_rate": 0.001, + "loss": 1.2878, + "step": 180100 + }, + { + "epoch": 58.24175824175824, + "grad_norm": 2.0297162532806396, + "learning_rate": 0.001, + "loss": 1.3079, + "step": 180200 + }, + { + "epoch": 58.274078862314155, + "grad_norm": 1.9571456909179688, + "learning_rate": 0.001, + "loss": 1.3165, + "step": 180300 + }, + { + "epoch": 58.30639948287007, + "grad_norm": 3.118157148361206, + "learning_rate": 0.001, + "loss": 1.3078, + "step": 180400 + }, + { + "epoch": 58.338720103425985, + "grad_norm": 2.2558462619781494, + "learning_rate": 0.001, + "loss": 1.3159, + "step": 180500 + }, + { + "epoch": 58.3710407239819, + "grad_norm": 2.5575222969055176, + "learning_rate": 0.001, + "loss": 1.3179, + "step": 180600 + }, + { + "epoch": 58.403361344537814, + "grad_norm": 2.0723485946655273, + "learning_rate": 0.001, + "loss": 1.3242, + "step": 180700 + }, + { + "epoch": 58.43568196509373, + "grad_norm": 2.4277594089508057, + "learning_rate": 0.001, + "loss": 1.3392, + "step": 180800 + }, + { + "epoch": 58.46800258564964, + "grad_norm": 2.787843942642212, + "learning_rate": 0.001, + "loss": 1.3318, + "step": 180900 + }, + { + "epoch": 58.50032320620556, + "grad_norm": 2.410322666168213, + "learning_rate": 0.001, + "loss": 1.3433, + "step": 181000 + }, + { + "epoch": 58.53264382676147, + "grad_norm": 2.142733335494995, + "learning_rate": 0.001, + "loss": 1.3394, + "step": 181100 + }, + { + "epoch": 58.56496444731739, + "grad_norm": 2.8335678577423096, + "learning_rate": 0.001, + "loss": 1.3411, + "step": 181200 + }, + { + "epoch": 58.5972850678733, + "grad_norm": 2.7325358390808105, + "learning_rate": 0.001, + "loss": 1.3377, + "step": 181300 + }, + { + "epoch": 58.629605688429216, + "grad_norm": 2.1823666095733643, + "learning_rate": 0.001, + "loss": 1.3473, + "step": 181400 + }, + { + "epoch": 58.66192630898513, + "grad_norm": 2.302861213684082, + "learning_rate": 0.001, + "loss": 1.36, + "step": 181500 + }, + { + "epoch": 58.694246929541045, + "grad_norm": 2.170161485671997, + "learning_rate": 0.001, + "loss": 1.3743, + "step": 181600 + }, + { + "epoch": 58.72656755009696, + "grad_norm": 2.141266345977783, + "learning_rate": 0.001, + "loss": 1.3702, + "step": 181700 + }, + { + "epoch": 58.758888170652874, + "grad_norm": 2.1460530757904053, + "learning_rate": 0.001, + "loss": 1.3566, + "step": 181800 + }, + { + "epoch": 58.79120879120879, + "grad_norm": 2.7301716804504395, + "learning_rate": 0.001, + "loss": 1.3815, + "step": 181900 + }, + { + "epoch": 58.8235294117647, + "grad_norm": 2.333367109298706, + "learning_rate": 0.001, + "loss": 1.372, + "step": 182000 + }, + { + "epoch": 58.85585003232062, + "grad_norm": 2.421165943145752, + "learning_rate": 0.001, + "loss": 1.3698, + "step": 182100 + }, + { + "epoch": 58.88817065287653, + "grad_norm": 2.190744638442993, + "learning_rate": 0.001, + "loss": 1.3695, + "step": 182200 + }, + { + "epoch": 58.92049127343245, + "grad_norm": 2.4283175468444824, + "learning_rate": 0.001, + "loss": 1.3902, + "step": 182300 + }, + { + "epoch": 58.95281189398836, + "grad_norm": 2.749220132827759, + "learning_rate": 0.001, + "loss": 1.3813, + "step": 182400 + }, + { + "epoch": 58.985132514544276, + "grad_norm": 1.7633317708969116, + "learning_rate": 0.001, + "loss": 1.3811, + "step": 182500 + }, + { + "epoch": 59.0174531351002, + "grad_norm": 2.3028452396392822, + "learning_rate": 0.001, + "loss": 1.3115, + "step": 182600 + }, + { + "epoch": 59.04977375565611, + "grad_norm": 1.91004478931427, + "learning_rate": 0.001, + "loss": 1.2523, + "step": 182700 + }, + { + "epoch": 59.08209437621203, + "grad_norm": 1.9824846982955933, + "learning_rate": 0.001, + "loss": 1.2691, + "step": 182800 + }, + { + "epoch": 59.11441499676794, + "grad_norm": 1.9162917137145996, + "learning_rate": 0.001, + "loss": 1.2654, + "step": 182900 + }, + { + "epoch": 59.146735617323856, + "grad_norm": 2.174314022064209, + "learning_rate": 0.001, + "loss": 1.2729, + "step": 183000 + }, + { + "epoch": 59.17905623787977, + "grad_norm": 1.950962781906128, + "learning_rate": 0.001, + "loss": 1.2842, + "step": 183100 + }, + { + "epoch": 59.211376858435685, + "grad_norm": 2.099749803543091, + "learning_rate": 0.001, + "loss": 1.2827, + "step": 183200 + }, + { + "epoch": 59.2436974789916, + "grad_norm": 1.7778706550598145, + "learning_rate": 0.001, + "loss": 1.2878, + "step": 183300 + }, + { + "epoch": 59.276018099547514, + "grad_norm": 2.519252300262451, + "learning_rate": 0.001, + "loss": 1.2948, + "step": 183400 + }, + { + "epoch": 59.30833872010343, + "grad_norm": 2.304509162902832, + "learning_rate": 0.001, + "loss": 1.2958, + "step": 183500 + }, + { + "epoch": 59.34065934065934, + "grad_norm": 2.029158353805542, + "learning_rate": 0.001, + "loss": 1.3077, + "step": 183600 + }, + { + "epoch": 59.37297996121526, + "grad_norm": 2.7044732570648193, + "learning_rate": 0.001, + "loss": 1.3067, + "step": 183700 + }, + { + "epoch": 59.40530058177117, + "grad_norm": 2.5257177352905273, + "learning_rate": 0.001, + "loss": 1.3115, + "step": 183800 + }, + { + "epoch": 59.43762120232709, + "grad_norm": 2.422498941421509, + "learning_rate": 0.001, + "loss": 1.3333, + "step": 183900 + }, + { + "epoch": 59.469941822883, + "grad_norm": 2.1336445808410645, + "learning_rate": 0.001, + "loss": 1.3216, + "step": 184000 + }, + { + "epoch": 59.502262443438916, + "grad_norm": 1.7418371438980103, + "learning_rate": 0.001, + "loss": 1.3296, + "step": 184100 + }, + { + "epoch": 59.53458306399483, + "grad_norm": 1.9461792707443237, + "learning_rate": 0.001, + "loss": 1.3301, + "step": 184200 + }, + { + "epoch": 59.566903684550745, + "grad_norm": 1.9766813516616821, + "learning_rate": 0.001, + "loss": 1.3293, + "step": 184300 + }, + { + "epoch": 59.59922430510666, + "grad_norm": 1.7881988286972046, + "learning_rate": 0.001, + "loss": 1.3434, + "step": 184400 + }, + { + "epoch": 59.631544925662574, + "grad_norm": 2.1620521545410156, + "learning_rate": 0.001, + "loss": 1.3423, + "step": 184500 + }, + { + "epoch": 59.66386554621849, + "grad_norm": 1.9429798126220703, + "learning_rate": 0.001, + "loss": 1.3373, + "step": 184600 + }, + { + "epoch": 59.6961861667744, + "grad_norm": 1.764739990234375, + "learning_rate": 0.001, + "loss": 1.3384, + "step": 184700 + }, + { + "epoch": 59.72850678733032, + "grad_norm": 1.7464152574539185, + "learning_rate": 0.001, + "loss": 1.3531, + "step": 184800 + }, + { + "epoch": 59.76082740788623, + "grad_norm": 2.2472636699676514, + "learning_rate": 0.001, + "loss": 1.359, + "step": 184900 + }, + { + "epoch": 59.79314802844215, + "grad_norm": 1.8305083513259888, + "learning_rate": 0.001, + "loss": 1.353, + "step": 185000 + }, + { + "epoch": 59.82546864899806, + "grad_norm": 1.6903504133224487, + "learning_rate": 0.001, + "loss": 1.3556, + "step": 185100 + }, + { + "epoch": 59.857789269553976, + "grad_norm": 2.093635082244873, + "learning_rate": 0.001, + "loss": 1.3483, + "step": 185200 + }, + { + "epoch": 59.89010989010989, + "grad_norm": 2.056464910507202, + "learning_rate": 0.001, + "loss": 1.3502, + "step": 185300 + }, + { + "epoch": 59.922430510665805, + "grad_norm": 1.7775726318359375, + "learning_rate": 0.001, + "loss": 1.3602, + "step": 185400 + }, + { + "epoch": 59.95475113122172, + "grad_norm": 1.6163804531097412, + "learning_rate": 0.001, + "loss": 1.3618, + "step": 185500 + }, + { + "epoch": 59.987071751777634, + "grad_norm": 1.7184257507324219, + "learning_rate": 0.001, + "loss": 1.3704, + "step": 185600 + }, + { + "epoch": 60.01939237233355, + "grad_norm": 1.7940794229507446, + "learning_rate": 0.001, + "loss": 1.2951, + "step": 185700 + }, + { + "epoch": 60.05171299288946, + "grad_norm": 1.667309284210205, + "learning_rate": 0.001, + "loss": 1.248, + "step": 185800 + }, + { + "epoch": 60.08403361344538, + "grad_norm": 1.5899906158447266, + "learning_rate": 0.001, + "loss": 1.2604, + "step": 185900 + }, + { + "epoch": 60.11635423400129, + "grad_norm": 1.9902898073196411, + "learning_rate": 0.001, + "loss": 1.2485, + "step": 186000 + }, + { + "epoch": 60.14867485455721, + "grad_norm": 1.522161841392517, + "learning_rate": 0.001, + "loss": 1.2438, + "step": 186100 + }, + { + "epoch": 60.18099547511312, + "grad_norm": 1.5227113962173462, + "learning_rate": 0.001, + "loss": 1.2651, + "step": 186200 + }, + { + "epoch": 60.213316095669036, + "grad_norm": 1.6835823059082031, + "learning_rate": 0.001, + "loss": 1.2652, + "step": 186300 + }, + { + "epoch": 60.24563671622495, + "grad_norm": 1.817858338356018, + "learning_rate": 0.001, + "loss": 1.2638, + "step": 186400 + }, + { + "epoch": 60.277957336780865, + "grad_norm": 1.9655687808990479, + "learning_rate": 0.001, + "loss": 1.2967, + "step": 186500 + }, + { + "epoch": 60.31027795733678, + "grad_norm": 2.3373231887817383, + "learning_rate": 0.001, + "loss": 1.2971, + "step": 186600 + }, + { + "epoch": 60.342598577892694, + "grad_norm": 2.2942612171173096, + "learning_rate": 0.001, + "loss": 1.2899, + "step": 186700 + }, + { + "epoch": 60.37491919844861, + "grad_norm": 2.092560291290283, + "learning_rate": 0.001, + "loss": 1.2844, + "step": 186800 + }, + { + "epoch": 60.40723981900452, + "grad_norm": 1.5219141244888306, + "learning_rate": 0.001, + "loss": 1.2982, + "step": 186900 + }, + { + "epoch": 60.43956043956044, + "grad_norm": 1.906146764755249, + "learning_rate": 0.001, + "loss": 1.2941, + "step": 187000 + }, + { + "epoch": 60.47188106011635, + "grad_norm": 1.6990609169006348, + "learning_rate": 0.001, + "loss": 1.2908, + "step": 187100 + }, + { + "epoch": 60.50420168067227, + "grad_norm": 1.9348620176315308, + "learning_rate": 0.001, + "loss": 1.3104, + "step": 187200 + }, + { + "epoch": 60.53652230122818, + "grad_norm": 1.878623366355896, + "learning_rate": 0.001, + "loss": 1.3012, + "step": 187300 + }, + { + "epoch": 60.568842921784096, + "grad_norm": 1.4890978336334229, + "learning_rate": 0.001, + "loss": 1.342, + "step": 187400 + }, + { + "epoch": 60.60116354234001, + "grad_norm": 3.4084646701812744, + "learning_rate": 0.001, + "loss": 1.3242, + "step": 187500 + }, + { + "epoch": 60.633484162895925, + "grad_norm": 1.784811019897461, + "learning_rate": 0.001, + "loss": 1.3028, + "step": 187600 + }, + { + "epoch": 60.66580478345184, + "grad_norm": 1.9564650058746338, + "learning_rate": 0.001, + "loss": 1.3375, + "step": 187700 + }, + { + "epoch": 60.698125404007754, + "grad_norm": 1.9819107055664062, + "learning_rate": 0.001, + "loss": 1.3309, + "step": 187800 + }, + { + "epoch": 60.73044602456367, + "grad_norm": 1.938475489616394, + "learning_rate": 0.001, + "loss": 1.3285, + "step": 187900 + }, + { + "epoch": 60.762766645119584, + "grad_norm": 1.7497060298919678, + "learning_rate": 0.001, + "loss": 1.3384, + "step": 188000 + }, + { + "epoch": 60.7950872656755, + "grad_norm": 1.5079044103622437, + "learning_rate": 0.001, + "loss": 1.342, + "step": 188100 + }, + { + "epoch": 60.82740788623141, + "grad_norm": 1.7045460939407349, + "learning_rate": 0.001, + "loss": 1.3398, + "step": 188200 + }, + { + "epoch": 60.85972850678733, + "grad_norm": 1.809365153312683, + "learning_rate": 0.001, + "loss": 1.3565, + "step": 188300 + }, + { + "epoch": 60.89204912734324, + "grad_norm": 1.9220020771026611, + "learning_rate": 0.001, + "loss": 1.3592, + "step": 188400 + }, + { + "epoch": 60.924369747899156, + "grad_norm": 1.7121220827102661, + "learning_rate": 0.001, + "loss": 1.3586, + "step": 188500 + }, + { + "epoch": 60.95669036845507, + "grad_norm": 1.5285897254943848, + "learning_rate": 0.001, + "loss": 1.3651, + "step": 188600 + }, + { + "epoch": 60.98901098901099, + "grad_norm": 1.6838990449905396, + "learning_rate": 0.001, + "loss": 1.3529, + "step": 188700 + }, + { + "epoch": 61.02133160956691, + "grad_norm": 1.7248560190200806, + "learning_rate": 0.001, + "loss": 1.2819, + "step": 188800 + }, + { + "epoch": 61.05365223012282, + "grad_norm": 1.699639916419983, + "learning_rate": 0.001, + "loss": 1.2323, + "step": 188900 + }, + { + "epoch": 61.085972850678736, + "grad_norm": 1.7705409526824951, + "learning_rate": 0.001, + "loss": 1.2365, + "step": 189000 + }, + { + "epoch": 61.11829347123465, + "grad_norm": 1.709913969039917, + "learning_rate": 0.001, + "loss": 1.257, + "step": 189100 + }, + { + "epoch": 61.150614091790565, + "grad_norm": 1.7027641534805298, + "learning_rate": 0.001, + "loss": 1.2317, + "step": 189200 + }, + { + "epoch": 61.18293471234648, + "grad_norm": 1.5179117918014526, + "learning_rate": 0.001, + "loss": 1.2539, + "step": 189300 + }, + { + "epoch": 61.215255332902395, + "grad_norm": 3.146793842315674, + "learning_rate": 0.001, + "loss": 1.2494, + "step": 189400 + }, + { + "epoch": 61.24757595345831, + "grad_norm": 1.4820927381515503, + "learning_rate": 0.001, + "loss": 1.26, + "step": 189500 + }, + { + "epoch": 61.279896574014224, + "grad_norm": 2.3077597618103027, + "learning_rate": 0.001, + "loss": 1.2502, + "step": 189600 + }, + { + "epoch": 61.31221719457014, + "grad_norm": 1.8483630418777466, + "learning_rate": 0.001, + "loss": 1.2922, + "step": 189700 + }, + { + "epoch": 61.34453781512605, + "grad_norm": 1.5808696746826172, + "learning_rate": 0.001, + "loss": 1.2773, + "step": 189800 + }, + { + "epoch": 61.37685843568197, + "grad_norm": 1.8577014207839966, + "learning_rate": 0.001, + "loss": 1.2712, + "step": 189900 + }, + { + "epoch": 61.40917905623788, + "grad_norm": 1.837640643119812, + "learning_rate": 0.001, + "loss": 1.2924, + "step": 190000 + }, + { + "epoch": 61.441499676793796, + "grad_norm": 1.4489262104034424, + "learning_rate": 0.001, + "loss": 1.2856, + "step": 190100 + }, + { + "epoch": 61.47382029734971, + "grad_norm": 1.756155014038086, + "learning_rate": 0.001, + "loss": 1.2945, + "step": 190200 + }, + { + "epoch": 61.506140917905626, + "grad_norm": 1.7968300580978394, + "learning_rate": 0.001, + "loss": 1.3083, + "step": 190300 + }, + { + "epoch": 61.53846153846154, + "grad_norm": 1.3242460489273071, + "learning_rate": 0.001, + "loss": 1.3004, + "step": 190400 + }, + { + "epoch": 61.570782159017455, + "grad_norm": 1.5521363019943237, + "learning_rate": 0.001, + "loss": 1.3086, + "step": 190500 + }, + { + "epoch": 61.60310277957337, + "grad_norm": 1.6019891500473022, + "learning_rate": 0.001, + "loss": 1.3037, + "step": 190600 + }, + { + "epoch": 61.635423400129284, + "grad_norm": 2.068342685699463, + "learning_rate": 0.001, + "loss": 1.3043, + "step": 190700 + }, + { + "epoch": 61.6677440206852, + "grad_norm": 1.7331029176712036, + "learning_rate": 0.001, + "loss": 1.3196, + "step": 190800 + }, + { + "epoch": 61.70006464124111, + "grad_norm": 2.1270153522491455, + "learning_rate": 0.001, + "loss": 1.3156, + "step": 190900 + }, + { + "epoch": 61.73238526179703, + "grad_norm": 1.7159593105316162, + "learning_rate": 0.001, + "loss": 1.3161, + "step": 191000 + }, + { + "epoch": 61.76470588235294, + "grad_norm": 1.6671262979507446, + "learning_rate": 0.001, + "loss": 1.3241, + "step": 191100 + }, + { + "epoch": 61.79702650290886, + "grad_norm": 1.7520173788070679, + "learning_rate": 0.001, + "loss": 1.3196, + "step": 191200 + }, + { + "epoch": 61.82934712346477, + "grad_norm": 1.407358169555664, + "learning_rate": 0.001, + "loss": 1.3465, + "step": 191300 + }, + { + "epoch": 61.861667744020686, + "grad_norm": 1.7164902687072754, + "learning_rate": 0.001, + "loss": 1.3407, + "step": 191400 + }, + { + "epoch": 61.8939883645766, + "grad_norm": 1.714900016784668, + "learning_rate": 0.001, + "loss": 1.3496, + "step": 191500 + }, + { + "epoch": 61.926308985132515, + "grad_norm": 2.160857677459717, + "learning_rate": 0.001, + "loss": 1.3361, + "step": 191600 + }, + { + "epoch": 61.95862960568843, + "grad_norm": 1.8158776760101318, + "learning_rate": 0.001, + "loss": 1.3433, + "step": 191700 + }, + { + "epoch": 61.990950226244344, + "grad_norm": 1.847709059715271, + "learning_rate": 0.001, + "loss": 1.3598, + "step": 191800 + }, + { + "epoch": 62.02327084680026, + "grad_norm": 1.7720732688903809, + "learning_rate": 0.001, + "loss": 1.2744, + "step": 191900 + }, + { + "epoch": 62.05559146735617, + "grad_norm": 1.5437538623809814, + "learning_rate": 0.001, + "loss": 1.2139, + "step": 192000 + }, + { + "epoch": 62.08791208791209, + "grad_norm": 1.6394867897033691, + "learning_rate": 0.001, + "loss": 1.2301, + "step": 192100 + }, + { + "epoch": 62.120232708468, + "grad_norm": 1.5179033279418945, + "learning_rate": 0.001, + "loss": 1.2352, + "step": 192200 + }, + { + "epoch": 62.15255332902392, + "grad_norm": 2.0513007640838623, + "learning_rate": 0.001, + "loss": 1.2292, + "step": 192300 + }, + { + "epoch": 62.18487394957983, + "grad_norm": 1.76735258102417, + "learning_rate": 0.001, + "loss": 1.2531, + "step": 192400 + }, + { + "epoch": 62.217194570135746, + "grad_norm": 1.9428764581680298, + "learning_rate": 0.001, + "loss": 1.2507, + "step": 192500 + }, + { + "epoch": 62.24951519069166, + "grad_norm": 1.3896160125732422, + "learning_rate": 0.001, + "loss": 1.2556, + "step": 192600 + }, + { + "epoch": 62.281835811247575, + "grad_norm": 1.634589433670044, + "learning_rate": 0.001, + "loss": 1.2598, + "step": 192700 + }, + { + "epoch": 62.31415643180349, + "grad_norm": 1.906327486038208, + "learning_rate": 0.001, + "loss": 1.2587, + "step": 192800 + }, + { + "epoch": 62.346477052359404, + "grad_norm": 1.7237509489059448, + "learning_rate": 0.001, + "loss": 1.2544, + "step": 192900 + }, + { + "epoch": 62.37879767291532, + "grad_norm": 1.5861009359359741, + "learning_rate": 0.001, + "loss": 1.2693, + "step": 193000 + }, + { + "epoch": 62.41111829347123, + "grad_norm": 1.6999458074569702, + "learning_rate": 0.001, + "loss": 1.2815, + "step": 193100 + }, + { + "epoch": 62.44343891402715, + "grad_norm": 1.9093917608261108, + "learning_rate": 0.001, + "loss": 1.268, + "step": 193200 + }, + { + "epoch": 62.47575953458306, + "grad_norm": 1.64690363407135, + "learning_rate": 0.001, + "loss": 1.2734, + "step": 193300 + }, + { + "epoch": 62.50808015513898, + "grad_norm": 1.8700169324874878, + "learning_rate": 0.001, + "loss": 1.2684, + "step": 193400 + }, + { + "epoch": 62.54040077569489, + "grad_norm": 1.862876534461975, + "learning_rate": 0.001, + "loss": 1.3038, + "step": 193500 + }, + { + "epoch": 62.572721396250806, + "grad_norm": 1.5933880805969238, + "learning_rate": 0.001, + "loss": 1.3032, + "step": 193600 + }, + { + "epoch": 62.60504201680672, + "grad_norm": 1.569177269935608, + "learning_rate": 0.001, + "loss": 1.2934, + "step": 193700 + }, + { + "epoch": 62.637362637362635, + "grad_norm": 1.7045800685882568, + "learning_rate": 0.001, + "loss": 1.3054, + "step": 193800 + }, + { + "epoch": 62.66968325791855, + "grad_norm": 1.502102017402649, + "learning_rate": 0.001, + "loss": 1.2973, + "step": 193900 + }, + { + "epoch": 62.702003878474464, + "grad_norm": 1.3741238117218018, + "learning_rate": 0.001, + "loss": 1.3095, + "step": 194000 + }, + { + "epoch": 62.73432449903038, + "grad_norm": 1.508573293685913, + "learning_rate": 0.001, + "loss": 1.3103, + "step": 194100 + }, + { + "epoch": 62.76664511958629, + "grad_norm": 1.6713041067123413, + "learning_rate": 0.001, + "loss": 1.3066, + "step": 194200 + }, + { + "epoch": 62.79896574014221, + "grad_norm": 2.0911645889282227, + "learning_rate": 0.001, + "loss": 1.3418, + "step": 194300 + }, + { + "epoch": 62.83128636069812, + "grad_norm": 1.6197428703308105, + "learning_rate": 0.001, + "loss": 1.3179, + "step": 194400 + }, + { + "epoch": 62.86360698125404, + "grad_norm": 1.4563323259353638, + "learning_rate": 0.001, + "loss": 1.3078, + "step": 194500 + }, + { + "epoch": 62.89592760180995, + "grad_norm": 1.447568655014038, + "learning_rate": 0.001, + "loss": 1.3128, + "step": 194600 + }, + { + "epoch": 62.928248222365866, + "grad_norm": 1.577903151512146, + "learning_rate": 0.001, + "loss": 1.3518, + "step": 194700 + }, + { + "epoch": 62.96056884292178, + "grad_norm": 1.6062977313995361, + "learning_rate": 0.001, + "loss": 1.3364, + "step": 194800 + }, + { + "epoch": 62.992889463477695, + "grad_norm": 1.6443058252334595, + "learning_rate": 0.001, + "loss": 1.3298, + "step": 194900 + }, + { + "epoch": 63.02521008403362, + "grad_norm": 1.7947728633880615, + "learning_rate": 0.001, + "loss": 1.2272, + "step": 195000 + }, + { + "epoch": 63.05753070458953, + "grad_norm": 1.7414778470993042, + "learning_rate": 0.001, + "loss": 1.2157, + "step": 195100 + }, + { + "epoch": 63.089851325145446, + "grad_norm": 1.7546195983886719, + "learning_rate": 0.001, + "loss": 1.2209, + "step": 195200 + }, + { + "epoch": 63.12217194570136, + "grad_norm": 1.7215334177017212, + "learning_rate": 0.001, + "loss": 1.2283, + "step": 195300 + }, + { + "epoch": 63.154492566257275, + "grad_norm": 1.6781865358352661, + "learning_rate": 0.001, + "loss": 1.2115, + "step": 195400 + }, + { + "epoch": 63.18681318681319, + "grad_norm": 2.100222110748291, + "learning_rate": 0.001, + "loss": 1.2348, + "step": 195500 + }, + { + "epoch": 63.219133807369104, + "grad_norm": 1.6543059349060059, + "learning_rate": 0.001, + "loss": 1.2364, + "step": 195600 + }, + { + "epoch": 63.25145442792502, + "grad_norm": 1.8639600276947021, + "learning_rate": 0.001, + "loss": 1.2418, + "step": 195700 + }, + { + "epoch": 63.28377504848093, + "grad_norm": 1.4900529384613037, + "learning_rate": 0.001, + "loss": 1.2321, + "step": 195800 + }, + { + "epoch": 63.31609566903685, + "grad_norm": 1.7212891578674316, + "learning_rate": 0.001, + "loss": 1.2556, + "step": 195900 + }, + { + "epoch": 63.34841628959276, + "grad_norm": 1.827954649925232, + "learning_rate": 0.001, + "loss": 1.2661, + "step": 196000 + }, + { + "epoch": 63.38073691014868, + "grad_norm": 2.1511712074279785, + "learning_rate": 0.001, + "loss": 1.2529, + "step": 196100 + }, + { + "epoch": 63.41305753070459, + "grad_norm": 1.4950047731399536, + "learning_rate": 0.001, + "loss": 1.2808, + "step": 196200 + }, + { + "epoch": 63.445378151260506, + "grad_norm": 1.7102982997894287, + "learning_rate": 0.001, + "loss": 1.2631, + "step": 196300 + }, + { + "epoch": 63.47769877181642, + "grad_norm": 1.6856714487075806, + "learning_rate": 0.001, + "loss": 1.2602, + "step": 196400 + }, + { + "epoch": 63.510019392372335, + "grad_norm": 1.7770227193832397, + "learning_rate": 0.001, + "loss": 1.2767, + "step": 196500 + }, + { + "epoch": 63.54234001292825, + "grad_norm": 2.618396520614624, + "learning_rate": 0.001, + "loss": 1.2671, + "step": 196600 + }, + { + "epoch": 63.574660633484164, + "grad_norm": 2.238501787185669, + "learning_rate": 0.001, + "loss": 1.2707, + "step": 196700 + }, + { + "epoch": 63.60698125404008, + "grad_norm": 1.4879484176635742, + "learning_rate": 0.001, + "loss": 1.2657, + "step": 196800 + }, + { + "epoch": 63.63930187459599, + "grad_norm": 1.6655293703079224, + "learning_rate": 0.001, + "loss": 1.2964, + "step": 196900 + }, + { + "epoch": 63.67162249515191, + "grad_norm": 1.5491960048675537, + "learning_rate": 0.001, + "loss": 1.2974, + "step": 197000 + }, + { + "epoch": 63.70394311570782, + "grad_norm": 2.067685842514038, + "learning_rate": 0.001, + "loss": 1.2997, + "step": 197100 + }, + { + "epoch": 63.73626373626374, + "grad_norm": 1.9535744190216064, + "learning_rate": 0.001, + "loss": 1.298, + "step": 197200 + }, + { + "epoch": 63.76858435681965, + "grad_norm": 2.4942731857299805, + "learning_rate": 0.001, + "loss": 1.2957, + "step": 197300 + }, + { + "epoch": 63.800904977375566, + "grad_norm": 1.6687902212142944, + "learning_rate": 0.001, + "loss": 1.2998, + "step": 197400 + }, + { + "epoch": 63.83322559793148, + "grad_norm": 1.4590330123901367, + "learning_rate": 0.001, + "loss": 1.2974, + "step": 197500 + }, + { + "epoch": 63.865546218487395, + "grad_norm": 1.4519684314727783, + "learning_rate": 0.001, + "loss": 1.3264, + "step": 197600 + }, + { + "epoch": 63.89786683904331, + "grad_norm": 1.676956057548523, + "learning_rate": 0.001, + "loss": 1.3151, + "step": 197700 + }, + { + "epoch": 63.930187459599225, + "grad_norm": 1.7003228664398193, + "learning_rate": 0.001, + "loss": 1.314, + "step": 197800 + }, + { + "epoch": 63.96250808015514, + "grad_norm": 1.785687804222107, + "learning_rate": 0.001, + "loss": 1.3328, + "step": 197900 + }, + { + "epoch": 63.994828700711054, + "grad_norm": 1.8566793203353882, + "learning_rate": 0.001, + "loss": 1.3199, + "step": 198000 + }, + { + "epoch": 64.02714932126698, + "grad_norm": 2.3256516456604004, + "learning_rate": 0.001, + "loss": 1.2133, + "step": 198100 + }, + { + "epoch": 64.05946994182288, + "grad_norm": 1.8626536130905151, + "learning_rate": 0.001, + "loss": 1.1985, + "step": 198200 + }, + { + "epoch": 64.0917905623788, + "grad_norm": 1.5357648134231567, + "learning_rate": 0.001, + "loss": 1.2016, + "step": 198300 + }, + { + "epoch": 64.12411118293471, + "grad_norm": 2.2160654067993164, + "learning_rate": 0.001, + "loss": 1.2191, + "step": 198400 + }, + { + "epoch": 64.15643180349063, + "grad_norm": 1.763607144355774, + "learning_rate": 0.001, + "loss": 1.2095, + "step": 198500 + }, + { + "epoch": 64.18875242404654, + "grad_norm": 5.209096431732178, + "learning_rate": 0.001, + "loss": 1.2131, + "step": 198600 + }, + { + "epoch": 64.22107304460246, + "grad_norm": 1.854581356048584, + "learning_rate": 0.001, + "loss": 1.2368, + "step": 198700 + }, + { + "epoch": 64.25339366515837, + "grad_norm": 1.6673282384872437, + "learning_rate": 0.001, + "loss": 1.2243, + "step": 198800 + }, + { + "epoch": 64.28571428571429, + "grad_norm": 81.45235443115234, + "learning_rate": 0.001, + "loss": 1.2377, + "step": 198900 + }, + { + "epoch": 64.3180349062702, + "grad_norm": 2.5092577934265137, + "learning_rate": 0.001, + "loss": 1.2312, + "step": 199000 + }, + { + "epoch": 64.35035552682612, + "grad_norm": 1.8041945695877075, + "learning_rate": 0.001, + "loss": 1.2396, + "step": 199100 + }, + { + "epoch": 64.38267614738203, + "grad_norm": 1.6541348695755005, + "learning_rate": 0.001, + "loss": 1.2544, + "step": 199200 + }, + { + "epoch": 64.41499676793795, + "grad_norm": 2.0701494216918945, + "learning_rate": 0.001, + "loss": 1.2392, + "step": 199300 + }, + { + "epoch": 64.44731738849386, + "grad_norm": 1.6884028911590576, + "learning_rate": 0.001, + "loss": 1.2514, + "step": 199400 + }, + { + "epoch": 64.47963800904978, + "grad_norm": 1.775291085243225, + "learning_rate": 0.001, + "loss": 1.2639, + "step": 199500 + }, + { + "epoch": 64.51195862960569, + "grad_norm": 2.3357295989990234, + "learning_rate": 0.001, + "loss": 1.2685, + "step": 199600 + }, + { + "epoch": 64.54427925016161, + "grad_norm": 2.315829277038574, + "learning_rate": 0.001, + "loss": 1.2687, + "step": 199700 + }, + { + "epoch": 64.57659987071752, + "grad_norm": 1.7337130308151245, + "learning_rate": 0.001, + "loss": 1.2687, + "step": 199800 + }, + { + "epoch": 64.60892049127344, + "grad_norm": 1.704189658164978, + "learning_rate": 0.001, + "loss": 1.2801, + "step": 199900 + }, + { + "epoch": 64.64124111182934, + "grad_norm": 1.7531574964523315, + "learning_rate": 0.001, + "loss": 1.2692, + "step": 200000 + }, + { + "epoch": 64.67356173238527, + "grad_norm": 2.4136340618133545, + "learning_rate": 0.001, + "loss": 1.2724, + "step": 200100 + }, + { + "epoch": 64.70588235294117, + "grad_norm": 1.4422646760940552, + "learning_rate": 0.001, + "loss": 1.2875, + "step": 200200 + }, + { + "epoch": 64.7382029734971, + "grad_norm": 1.8993738889694214, + "learning_rate": 0.001, + "loss": 1.2896, + "step": 200300 + }, + { + "epoch": 64.770523594053, + "grad_norm": 1.7030922174453735, + "learning_rate": 0.001, + "loss": 1.2888, + "step": 200400 + }, + { + "epoch": 64.80284421460892, + "grad_norm": 1.6060155630111694, + "learning_rate": 0.001, + "loss": 1.3016, + "step": 200500 + }, + { + "epoch": 64.83516483516483, + "grad_norm": 4.930392265319824, + "learning_rate": 0.001, + "loss": 1.299, + "step": 200600 + }, + { + "epoch": 64.86748545572075, + "grad_norm": 1.7060980796813965, + "learning_rate": 0.001, + "loss": 1.3025, + "step": 200700 + }, + { + "epoch": 64.89980607627666, + "grad_norm": 1.754961609840393, + "learning_rate": 0.001, + "loss": 1.2893, + "step": 200800 + }, + { + "epoch": 64.93212669683258, + "grad_norm": 1.7960935831069946, + "learning_rate": 0.001, + "loss": 1.2953, + "step": 200900 + }, + { + "epoch": 64.96444731738849, + "grad_norm": 2.0267605781555176, + "learning_rate": 0.001, + "loss": 1.3143, + "step": 201000 + }, + { + "epoch": 64.99676793794441, + "grad_norm": 2.2149808406829834, + "learning_rate": 0.001, + "loss": 1.2921, + "step": 201100 + }, + { + "epoch": 65.02908855850032, + "grad_norm": 2.4537580013275146, + "learning_rate": 0.001, + "loss": 1.1977, + "step": 201200 + }, + { + "epoch": 65.06140917905624, + "grad_norm": 1.6021087169647217, + "learning_rate": 0.001, + "loss": 1.1736, + "step": 201300 + }, + { + "epoch": 65.09372979961215, + "grad_norm": 1.9216489791870117, + "learning_rate": 0.001, + "loss": 1.1835, + "step": 201400 + }, + { + "epoch": 65.12605042016807, + "grad_norm": 2.218745470046997, + "learning_rate": 0.001, + "loss": 1.2057, + "step": 201500 + }, + { + "epoch": 65.15837104072398, + "grad_norm": 1.8575019836425781, + "learning_rate": 0.001, + "loss": 1.2034, + "step": 201600 + }, + { + "epoch": 65.1906916612799, + "grad_norm": 2.2216298580169678, + "learning_rate": 0.001, + "loss": 1.2068, + "step": 201700 + }, + { + "epoch": 65.2230122818358, + "grad_norm": 1.9984310865402222, + "learning_rate": 0.001, + "loss": 1.1996, + "step": 201800 + }, + { + "epoch": 65.25533290239173, + "grad_norm": 1.926300048828125, + "learning_rate": 0.001, + "loss": 1.2163, + "step": 201900 + }, + { + "epoch": 65.28765352294764, + "grad_norm": 2.752197504043579, + "learning_rate": 0.001, + "loss": 1.2257, + "step": 202000 + }, + { + "epoch": 65.31997414350356, + "grad_norm": 1.9841582775115967, + "learning_rate": 0.001, + "loss": 1.2055, + "step": 202100 + }, + { + "epoch": 65.35229476405947, + "grad_norm": 2.0355589389801025, + "learning_rate": 0.001, + "loss": 1.2373, + "step": 202200 + }, + { + "epoch": 65.38461538461539, + "grad_norm": 1.867073893547058, + "learning_rate": 0.001, + "loss": 1.227, + "step": 202300 + }, + { + "epoch": 65.4169360051713, + "grad_norm": 1.8634486198425293, + "learning_rate": 0.001, + "loss": 1.2322, + "step": 202400 + }, + { + "epoch": 65.44925662572722, + "grad_norm": 1.804103136062622, + "learning_rate": 0.001, + "loss": 1.2494, + "step": 202500 + }, + { + "epoch": 65.48157724628312, + "grad_norm": 1.7396485805511475, + "learning_rate": 0.001, + "loss": 1.2445, + "step": 202600 + }, + { + "epoch": 65.51389786683905, + "grad_norm": 2.412324905395508, + "learning_rate": 0.001, + "loss": 1.2476, + "step": 202700 + }, + { + "epoch": 65.54621848739495, + "grad_norm": 1.8536796569824219, + "learning_rate": 0.001, + "loss": 1.259, + "step": 202800 + }, + { + "epoch": 65.57853910795087, + "grad_norm": 1.7683600187301636, + "learning_rate": 0.001, + "loss": 1.2411, + "step": 202900 + }, + { + "epoch": 65.61085972850678, + "grad_norm": 1.793859839439392, + "learning_rate": 0.001, + "loss": 1.2516, + "step": 203000 + }, + { + "epoch": 65.6431803490627, + "grad_norm": 1.6775269508361816, + "learning_rate": 0.001, + "loss": 1.2822, + "step": 203100 + }, + { + "epoch": 65.67550096961861, + "grad_norm": 1.8493618965148926, + "learning_rate": 0.001, + "loss": 1.2761, + "step": 203200 + }, + { + "epoch": 65.70782159017453, + "grad_norm": 1.6558525562286377, + "learning_rate": 0.001, + "loss": 1.2712, + "step": 203300 + }, + { + "epoch": 65.74014221073044, + "grad_norm": 2.0000391006469727, + "learning_rate": 0.001, + "loss": 1.2709, + "step": 203400 + }, + { + "epoch": 65.77246283128636, + "grad_norm": 2.3382959365844727, + "learning_rate": 0.001, + "loss": 1.2822, + "step": 203500 + }, + { + "epoch": 65.80478345184227, + "grad_norm": 1.79212486743927, + "learning_rate": 0.001, + "loss": 1.2702, + "step": 203600 + }, + { + "epoch": 65.83710407239819, + "grad_norm": 1.6742587089538574, + "learning_rate": 0.001, + "loss": 1.2835, + "step": 203700 + }, + { + "epoch": 65.8694246929541, + "grad_norm": 2.517657995223999, + "learning_rate": 0.001, + "loss": 1.2946, + "step": 203800 + }, + { + "epoch": 65.90174531351002, + "grad_norm": 1.6825872659683228, + "learning_rate": 0.001, + "loss": 1.2763, + "step": 203900 + }, + { + "epoch": 65.93406593406593, + "grad_norm": 2.0730693340301514, + "learning_rate": 0.001, + "loss": 1.3014, + "step": 204000 + }, + { + "epoch": 65.96638655462185, + "grad_norm": 2.2603330612182617, + "learning_rate": 0.001, + "loss": 1.3025, + "step": 204100 + }, + { + "epoch": 65.99870717517777, + "grad_norm": 2.1212308406829834, + "learning_rate": 0.001, + "loss": 1.2994, + "step": 204200 + }, + { + "epoch": 66.03102779573368, + "grad_norm": 2.014443874359131, + "learning_rate": 0.001, + "loss": 1.1643, + "step": 204300 + }, + { + "epoch": 66.0633484162896, + "grad_norm": 1.8272758722305298, + "learning_rate": 0.001, + "loss": 1.1669, + "step": 204400 + }, + { + "epoch": 66.0956690368455, + "grad_norm": 1.8921802043914795, + "learning_rate": 0.001, + "loss": 1.1608, + "step": 204500 + }, + { + "epoch": 66.12798965740143, + "grad_norm": 2.260369062423706, + "learning_rate": 0.001, + "loss": 1.2049, + "step": 204600 + }, + { + "epoch": 66.16031027795734, + "grad_norm": 2.3378474712371826, + "learning_rate": 0.001, + "loss": 1.175, + "step": 204700 + }, + { + "epoch": 66.19263089851326, + "grad_norm": 1.8673335313796997, + "learning_rate": 0.001, + "loss": 1.1971, + "step": 204800 + }, + { + "epoch": 66.22495151906917, + "grad_norm": 2.035820960998535, + "learning_rate": 0.001, + "loss": 1.2128, + "step": 204900 + }, + { + "epoch": 66.25727213962509, + "grad_norm": 1.6351029872894287, + "learning_rate": 0.001, + "loss": 1.2055, + "step": 205000 + }, + { + "epoch": 66.289592760181, + "grad_norm": 2.6478259563446045, + "learning_rate": 0.001, + "loss": 1.1864, + "step": 205100 + }, + { + "epoch": 66.32191338073692, + "grad_norm": 2.325328826904297, + "learning_rate": 0.001, + "loss": 1.2158, + "step": 205200 + }, + { + "epoch": 66.35423400129282, + "grad_norm": 1.886999487876892, + "learning_rate": 0.001, + "loss": 1.2038, + "step": 205300 + }, + { + "epoch": 66.38655462184875, + "grad_norm": 2.104828119277954, + "learning_rate": 0.001, + "loss": 1.2265, + "step": 205400 + }, + { + "epoch": 66.41887524240465, + "grad_norm": 1.7025809288024902, + "learning_rate": 0.001, + "loss": 1.2375, + "step": 205500 + }, + { + "epoch": 66.45119586296057, + "grad_norm": 2.26194429397583, + "learning_rate": 0.001, + "loss": 1.2465, + "step": 205600 + }, + { + "epoch": 66.48351648351648, + "grad_norm": 2.2679295539855957, + "learning_rate": 0.001, + "loss": 1.2504, + "step": 205700 + }, + { + "epoch": 66.5158371040724, + "grad_norm": 2.020747184753418, + "learning_rate": 0.001, + "loss": 1.2355, + "step": 205800 + }, + { + "epoch": 66.54815772462831, + "grad_norm": 2.4566147327423096, + "learning_rate": 0.001, + "loss": 1.2323, + "step": 205900 + }, + { + "epoch": 66.58047834518423, + "grad_norm": 1.9596573114395142, + "learning_rate": 0.001, + "loss": 1.2406, + "step": 206000 + }, + { + "epoch": 66.61279896574014, + "grad_norm": 2.363355875015259, + "learning_rate": 0.001, + "loss": 1.2559, + "step": 206100 + }, + { + "epoch": 66.64511958629606, + "grad_norm": 2.451207160949707, + "learning_rate": 0.001, + "loss": 1.2636, + "step": 206200 + }, + { + "epoch": 66.67744020685197, + "grad_norm": 1.8735805749893188, + "learning_rate": 0.001, + "loss": 1.2496, + "step": 206300 + }, + { + "epoch": 66.70976082740789, + "grad_norm": 2.4483344554901123, + "learning_rate": 0.001, + "loss": 1.2541, + "step": 206400 + }, + { + "epoch": 66.7420814479638, + "grad_norm": 2.172673463821411, + "learning_rate": 0.001, + "loss": 1.2746, + "step": 206500 + }, + { + "epoch": 66.77440206851972, + "grad_norm": 1.755159616470337, + "learning_rate": 0.001, + "loss": 1.2705, + "step": 206600 + }, + { + "epoch": 66.80672268907563, + "grad_norm": 2.2386600971221924, + "learning_rate": 0.001, + "loss": 1.2813, + "step": 206700 + }, + { + "epoch": 66.83904330963155, + "grad_norm": 2.0152530670166016, + "learning_rate": 0.001, + "loss": 1.2673, + "step": 206800 + }, + { + "epoch": 66.87136393018746, + "grad_norm": 2.021780014038086, + "learning_rate": 0.001, + "loss": 1.2761, + "step": 206900 + }, + { + "epoch": 66.90368455074338, + "grad_norm": 1.9706413745880127, + "learning_rate": 0.001, + "loss": 1.2577, + "step": 207000 + }, + { + "epoch": 66.93600517129929, + "grad_norm": 1.9766478538513184, + "learning_rate": 0.001, + "loss": 1.2831, + "step": 207100 + }, + { + "epoch": 66.96832579185521, + "grad_norm": 2.4492130279541016, + "learning_rate": 0.001, + "loss": 1.2767, + "step": 207200 + }, + { + "epoch": 67.00064641241111, + "grad_norm": 3.763784885406494, + "learning_rate": 0.001, + "loss": 1.2547, + "step": 207300 + }, + { + "epoch": 67.03296703296704, + "grad_norm": 1.9353259801864624, + "learning_rate": 0.001, + "loss": 1.1491, + "step": 207400 + }, + { + "epoch": 67.06528765352294, + "grad_norm": 2.1818177700042725, + "learning_rate": 0.001, + "loss": 1.1611, + "step": 207500 + }, + { + "epoch": 67.09760827407887, + "grad_norm": 2.305077314376831, + "learning_rate": 0.001, + "loss": 1.166, + "step": 207600 + }, + { + "epoch": 67.12992889463477, + "grad_norm": 3.8952369689941406, + "learning_rate": 0.001, + "loss": 1.1686, + "step": 207700 + }, + { + "epoch": 67.1622495151907, + "grad_norm": 2.74645733833313, + "learning_rate": 0.001, + "loss": 1.1822, + "step": 207800 + }, + { + "epoch": 67.1945701357466, + "grad_norm": 2.66611909866333, + "learning_rate": 0.001, + "loss": 1.1769, + "step": 207900 + }, + { + "epoch": 67.22689075630252, + "grad_norm": 4.485386848449707, + "learning_rate": 0.001, + "loss": 1.1983, + "step": 208000 + }, + { + "epoch": 67.25921137685843, + "grad_norm": 2.7096071243286133, + "learning_rate": 0.001, + "loss": 1.1863, + "step": 208100 + }, + { + "epoch": 67.29153199741435, + "grad_norm": 2.0292038917541504, + "learning_rate": 0.001, + "loss": 1.1882, + "step": 208200 + }, + { + "epoch": 67.32385261797026, + "grad_norm": 2.7626845836639404, + "learning_rate": 0.001, + "loss": 1.2086, + "step": 208300 + }, + { + "epoch": 67.35617323852618, + "grad_norm": 2.0532233715057373, + "learning_rate": 0.001, + "loss": 1.2166, + "step": 208400 + }, + { + "epoch": 67.38849385908209, + "grad_norm": 1.7796385288238525, + "learning_rate": 0.001, + "loss": 1.2025, + "step": 208500 + }, + { + "epoch": 67.42081447963801, + "grad_norm": 2.683412790298462, + "learning_rate": 0.001, + "loss": 1.2139, + "step": 208600 + }, + { + "epoch": 67.45313510019392, + "grad_norm": 2.9881527423858643, + "learning_rate": 0.001, + "loss": 1.2179, + "step": 208700 + }, + { + "epoch": 67.48545572074984, + "grad_norm": 2.409613609313965, + "learning_rate": 0.001, + "loss": 1.2216, + "step": 208800 + }, + { + "epoch": 67.51777634130575, + "grad_norm": 2.4920945167541504, + "learning_rate": 0.001, + "loss": 1.2171, + "step": 208900 + }, + { + "epoch": 67.55009696186167, + "grad_norm": 2.262345790863037, + "learning_rate": 0.001, + "loss": 1.2262, + "step": 209000 + }, + { + "epoch": 67.58241758241758, + "grad_norm": 2.0987603664398193, + "learning_rate": 0.001, + "loss": 1.243, + "step": 209100 + }, + { + "epoch": 67.6147382029735, + "grad_norm": 2.0718164443969727, + "learning_rate": 0.001, + "loss": 1.2397, + "step": 209200 + }, + { + "epoch": 67.6470588235294, + "grad_norm": 3.155087947845459, + "learning_rate": 0.001, + "loss": 1.2412, + "step": 209300 + }, + { + "epoch": 67.67937944408533, + "grad_norm": 2.312764883041382, + "learning_rate": 0.001, + "loss": 1.2456, + "step": 209400 + }, + { + "epoch": 67.71170006464124, + "grad_norm": 2.4429471492767334, + "learning_rate": 0.001, + "loss": 1.2403, + "step": 209500 + }, + { + "epoch": 67.74402068519716, + "grad_norm": 2.445016384124756, + "learning_rate": 0.001, + "loss": 1.2438, + "step": 209600 + }, + { + "epoch": 67.77634130575306, + "grad_norm": 2.587137460708618, + "learning_rate": 0.001, + "loss": 1.269, + "step": 209700 + }, + { + "epoch": 67.80866192630899, + "grad_norm": 3.0269641876220703, + "learning_rate": 0.001, + "loss": 1.2673, + "step": 209800 + }, + { + "epoch": 67.8409825468649, + "grad_norm": 3.1837799549102783, + "learning_rate": 0.001, + "loss": 1.2538, + "step": 209900 + }, + { + "epoch": 67.87330316742081, + "grad_norm": 3.0893819332122803, + "learning_rate": 0.001, + "loss": 1.2495, + "step": 210000 + }, + { + "epoch": 67.90562378797672, + "grad_norm": 2.351608991622925, + "learning_rate": 0.001, + "loss": 1.2596, + "step": 210100 + }, + { + "epoch": 67.93794440853264, + "grad_norm": 2.225374221801758, + "learning_rate": 0.001, + "loss": 1.2566, + "step": 210200 + }, + { + "epoch": 67.97026502908855, + "grad_norm": 2.7132184505462646, + "learning_rate": 0.001, + "loss": 1.2556, + "step": 210300 + }, + { + "epoch": 68.00258564964447, + "grad_norm": 1.5622951984405518, + "learning_rate": 0.001, + "loss": 1.2869, + "step": 210400 + }, + { + "epoch": 68.0349062702004, + "grad_norm": 2.333732843399048, + "learning_rate": 0.001, + "loss": 1.1528, + "step": 210500 + }, + { + "epoch": 68.0672268907563, + "grad_norm": 1.970744252204895, + "learning_rate": 0.001, + "loss": 1.1497, + "step": 210600 + }, + { + "epoch": 68.09954751131222, + "grad_norm": 2.23043155670166, + "learning_rate": 0.001, + "loss": 1.1657, + "step": 210700 + }, + { + "epoch": 68.13186813186813, + "grad_norm": 1.8723633289337158, + "learning_rate": 0.001, + "loss": 1.1477, + "step": 210800 + }, + { + "epoch": 68.16418875242405, + "grad_norm": 2.50980544090271, + "learning_rate": 0.001, + "loss": 1.1604, + "step": 210900 + }, + { + "epoch": 68.19650937297996, + "grad_norm": 2.1375374794006348, + "learning_rate": 0.001, + "loss": 1.1707, + "step": 211000 + }, + { + "epoch": 68.22882999353588, + "grad_norm": 1.930857539176941, + "learning_rate": 0.001, + "loss": 1.1758, + "step": 211100 + }, + { + "epoch": 68.26115061409179, + "grad_norm": 2.3334290981292725, + "learning_rate": 0.001, + "loss": 1.1827, + "step": 211200 + }, + { + "epoch": 68.29347123464771, + "grad_norm": 2.729995012283325, + "learning_rate": 0.001, + "loss": 1.1833, + "step": 211300 + }, + { + "epoch": 68.32579185520362, + "grad_norm": 3.255042552947998, + "learning_rate": 0.001, + "loss": 1.1921, + "step": 211400 + }, + { + "epoch": 68.35811247575954, + "grad_norm": 1.533887505531311, + "learning_rate": 0.001, + "loss": 1.1909, + "step": 211500 + }, + { + "epoch": 68.39043309631545, + "grad_norm": 1.8926416635513306, + "learning_rate": 0.001, + "loss": 1.2026, + "step": 211600 + }, + { + "epoch": 68.42275371687137, + "grad_norm": 2.0479161739349365, + "learning_rate": 0.001, + "loss": 1.2002, + "step": 211700 + }, + { + "epoch": 68.45507433742728, + "grad_norm": 1.674736499786377, + "learning_rate": 0.001, + "loss": 1.2203, + "step": 211800 + }, + { + "epoch": 68.4873949579832, + "grad_norm": 2.498920440673828, + "learning_rate": 0.001, + "loss": 1.2082, + "step": 211900 + }, + { + "epoch": 68.5197155785391, + "grad_norm": 2.435779571533203, + "learning_rate": 0.001, + "loss": 1.226, + "step": 212000 + }, + { + "epoch": 68.55203619909503, + "grad_norm": 1.8328531980514526, + "learning_rate": 0.001, + "loss": 1.2292, + "step": 212100 + }, + { + "epoch": 68.58435681965094, + "grad_norm": 2.3320796489715576, + "learning_rate": 0.001, + "loss": 1.2184, + "step": 212200 + }, + { + "epoch": 68.61667744020686, + "grad_norm": 1.9430440664291382, + "learning_rate": 0.001, + "loss": 1.2313, + "step": 212300 + }, + { + "epoch": 68.64899806076276, + "grad_norm": 2.1987974643707275, + "learning_rate": 0.001, + "loss": 1.2309, + "step": 212400 + }, + { + "epoch": 68.68131868131869, + "grad_norm": 1.669899344444275, + "learning_rate": 0.001, + "loss": 1.2107, + "step": 212500 + }, + { + "epoch": 68.7136393018746, + "grad_norm": 2.1277084350585938, + "learning_rate": 0.001, + "loss": 1.2357, + "step": 212600 + }, + { + "epoch": 68.74595992243052, + "grad_norm": 1.9171146154403687, + "learning_rate": 0.001, + "loss": 1.2358, + "step": 212700 + }, + { + "epoch": 68.77828054298642, + "grad_norm": 1.9358433485031128, + "learning_rate": 0.001, + "loss": 1.2294, + "step": 212800 + }, + { + "epoch": 68.81060116354234, + "grad_norm": 1.9799690246582031, + "learning_rate": 0.001, + "loss": 1.242, + "step": 212900 + }, + { + "epoch": 68.84292178409825, + "grad_norm": 1.7972420454025269, + "learning_rate": 0.001, + "loss": 1.2474, + "step": 213000 + }, + { + "epoch": 68.87524240465417, + "grad_norm": 1.9665274620056152, + "learning_rate": 0.001, + "loss": 1.2567, + "step": 213100 + }, + { + "epoch": 68.90756302521008, + "grad_norm": 2.131694793701172, + "learning_rate": 0.001, + "loss": 1.2475, + "step": 213200 + }, + { + "epoch": 68.939883645766, + "grad_norm": 1.7594505548477173, + "learning_rate": 0.001, + "loss": 1.2593, + "step": 213300 + }, + { + "epoch": 68.97220426632191, + "grad_norm": 5.345921993255615, + "learning_rate": 0.001, + "loss": 1.2423, + "step": 213400 + }, + { + "epoch": 69.00452488687783, + "grad_norm": 1.549072504043579, + "learning_rate": 0.001, + "loss": 1.2657, + "step": 213500 + }, + { + "epoch": 69.03684550743374, + "grad_norm": 1.8662244081497192, + "learning_rate": 0.001, + "loss": 1.1174, + "step": 213600 + }, + { + "epoch": 69.06916612798966, + "grad_norm": 1.5122578144073486, + "learning_rate": 0.001, + "loss": 1.1488, + "step": 213700 + }, + { + "epoch": 69.10148674854557, + "grad_norm": 2.3468809127807617, + "learning_rate": 0.001, + "loss": 1.1498, + "step": 213800 + }, + { + "epoch": 69.13380736910149, + "grad_norm": 1.8483396768569946, + "learning_rate": 0.001, + "loss": 1.1517, + "step": 213900 + }, + { + "epoch": 69.1661279896574, + "grad_norm": 1.6163078546524048, + "learning_rate": 0.001, + "loss": 1.1765, + "step": 214000 + }, + { + "epoch": 69.19844861021332, + "grad_norm": 1.5859019756317139, + "learning_rate": 0.001, + "loss": 1.1647, + "step": 214100 + }, + { + "epoch": 69.23076923076923, + "grad_norm": 1.5599806308746338, + "learning_rate": 0.001, + "loss": 1.1526, + "step": 214200 + }, + { + "epoch": 69.26308985132515, + "grad_norm": 2.253723382949829, + "learning_rate": 0.001, + "loss": 1.1671, + "step": 214300 + }, + { + "epoch": 69.29541047188106, + "grad_norm": 1.7934225797653198, + "learning_rate": 0.001, + "loss": 1.1669, + "step": 214400 + }, + { + "epoch": 69.32773109243698, + "grad_norm": 1.9382452964782715, + "learning_rate": 0.001, + "loss": 1.165, + "step": 214500 + }, + { + "epoch": 69.36005171299288, + "grad_norm": 2.22196626663208, + "learning_rate": 0.001, + "loss": 1.1777, + "step": 214600 + }, + { + "epoch": 69.3923723335488, + "grad_norm": 1.5601698160171509, + "learning_rate": 0.001, + "loss": 1.1841, + "step": 214700 + }, + { + "epoch": 69.42469295410471, + "grad_norm": 2.0012974739074707, + "learning_rate": 0.001, + "loss": 1.1786, + "step": 214800 + }, + { + "epoch": 69.45701357466064, + "grad_norm": 1.9047846794128418, + "learning_rate": 0.001, + "loss": 1.2062, + "step": 214900 + }, + { + "epoch": 69.48933419521654, + "grad_norm": 1.562074899673462, + "learning_rate": 0.001, + "loss": 1.2138, + "step": 215000 + }, + { + "epoch": 69.52165481577246, + "grad_norm": 1.7316213846206665, + "learning_rate": 0.001, + "loss": 1.2113, + "step": 215100 + }, + { + "epoch": 69.55397543632837, + "grad_norm": 1.642343282699585, + "learning_rate": 0.001, + "loss": 1.2102, + "step": 215200 + }, + { + "epoch": 69.5862960568843, + "grad_norm": 1.579736590385437, + "learning_rate": 0.001, + "loss": 1.2164, + "step": 215300 + }, + { + "epoch": 69.6186166774402, + "grad_norm": 1.445095181465149, + "learning_rate": 0.001, + "loss": 1.2076, + "step": 215400 + }, + { + "epoch": 69.65093729799612, + "grad_norm": 1.5851157903671265, + "learning_rate": 0.001, + "loss": 1.2212, + "step": 215500 + }, + { + "epoch": 69.68325791855203, + "grad_norm": 1.7522008419036865, + "learning_rate": 0.001, + "loss": 1.2242, + "step": 215600 + }, + { + "epoch": 69.71557853910795, + "grad_norm": 1.8869768381118774, + "learning_rate": 0.001, + "loss": 1.2357, + "step": 215700 + }, + { + "epoch": 69.74789915966386, + "grad_norm": 1.9056206941604614, + "learning_rate": 0.001, + "loss": 1.2266, + "step": 215800 + }, + { + "epoch": 69.78021978021978, + "grad_norm": 1.9747616052627563, + "learning_rate": 0.001, + "loss": 1.2128, + "step": 215900 + }, + { + "epoch": 69.81254040077569, + "grad_norm": 1.853060245513916, + "learning_rate": 0.001, + "loss": 1.2285, + "step": 216000 + }, + { + "epoch": 69.84486102133161, + "grad_norm": 1.8886538743972778, + "learning_rate": 0.001, + "loss": 1.2295, + "step": 216100 + }, + { + "epoch": 69.87718164188752, + "grad_norm": 2.0137014389038086, + "learning_rate": 0.001, + "loss": 1.2251, + "step": 216200 + }, + { + "epoch": 69.90950226244344, + "grad_norm": 1.9366350173950195, + "learning_rate": 0.001, + "loss": 1.2532, + "step": 216300 + }, + { + "epoch": 69.94182288299935, + "grad_norm": 2.439671277999878, + "learning_rate": 0.001, + "loss": 1.2342, + "step": 216400 + }, + { + "epoch": 69.97414350355527, + "grad_norm": 1.625091552734375, + "learning_rate": 0.001, + "loss": 1.2523, + "step": 216500 + }, + { + "epoch": 70.00646412411119, + "grad_norm": 1.5262469053268433, + "learning_rate": 0.001, + "loss": 1.2416, + "step": 216600 + }, + { + "epoch": 70.0387847446671, + "grad_norm": 1.6262037754058838, + "learning_rate": 0.001, + "loss": 1.1363, + "step": 216700 + }, + { + "epoch": 70.07110536522302, + "grad_norm": 1.6918765306472778, + "learning_rate": 0.001, + "loss": 1.1288, + "step": 216800 + }, + { + "epoch": 70.10342598577893, + "grad_norm": 1.7063210010528564, + "learning_rate": 0.001, + "loss": 1.1313, + "step": 216900 + }, + { + "epoch": 70.13574660633485, + "grad_norm": 2.047490358352661, + "learning_rate": 0.001, + "loss": 1.1365, + "step": 217000 + }, + { + "epoch": 70.16806722689076, + "grad_norm": 1.7240195274353027, + "learning_rate": 0.001, + "loss": 1.1556, + "step": 217100 + }, + { + "epoch": 70.20038784744668, + "grad_norm": 2.204714775085449, + "learning_rate": 0.001, + "loss": 1.1435, + "step": 217200 + }, + { + "epoch": 70.23270846800258, + "grad_norm": 1.780590295791626, + "learning_rate": 0.001, + "loss": 1.1467, + "step": 217300 + }, + { + "epoch": 70.2650290885585, + "grad_norm": 1.6181354522705078, + "learning_rate": 0.001, + "loss": 1.1562, + "step": 217400 + }, + { + "epoch": 70.29734970911441, + "grad_norm": 1.7674238681793213, + "learning_rate": 0.001, + "loss": 1.1726, + "step": 217500 + }, + { + "epoch": 70.32967032967034, + "grad_norm": 1.9929686784744263, + "learning_rate": 0.001, + "loss": 1.1684, + "step": 217600 + }, + { + "epoch": 70.36199095022624, + "grad_norm": 1.562532901763916, + "learning_rate": 0.001, + "loss": 1.1832, + "step": 217700 + }, + { + "epoch": 70.39431157078216, + "grad_norm": 1.6133959293365479, + "learning_rate": 0.001, + "loss": 1.174, + "step": 217800 + }, + { + "epoch": 70.42663219133807, + "grad_norm": 1.9063694477081299, + "learning_rate": 0.001, + "loss": 1.1721, + "step": 217900 + }, + { + "epoch": 70.458952811894, + "grad_norm": 1.6396887302398682, + "learning_rate": 0.001, + "loss": 1.1818, + "step": 218000 + }, + { + "epoch": 70.4912734324499, + "grad_norm": 5.434491157531738, + "learning_rate": 0.001, + "loss": 1.1929, + "step": 218100 + }, + { + "epoch": 70.52359405300582, + "grad_norm": 2.1285393238067627, + "learning_rate": 0.001, + "loss": 1.2068, + "step": 218200 + }, + { + "epoch": 70.55591467356173, + "grad_norm": 2.146338701248169, + "learning_rate": 0.001, + "loss": 1.1927, + "step": 218300 + }, + { + "epoch": 70.58823529411765, + "grad_norm": 1.5270682573318481, + "learning_rate": 0.001, + "loss": 1.196, + "step": 218400 + }, + { + "epoch": 70.62055591467356, + "grad_norm": 1.8791449069976807, + "learning_rate": 0.001, + "loss": 1.2008, + "step": 218500 + }, + { + "epoch": 70.65287653522948, + "grad_norm": 1.5155069828033447, + "learning_rate": 0.001, + "loss": 1.202, + "step": 218600 + }, + { + "epoch": 70.68519715578539, + "grad_norm": 1.610196828842163, + "learning_rate": 0.001, + "loss": 1.2087, + "step": 218700 + }, + { + "epoch": 70.71751777634131, + "grad_norm": 1.5798680782318115, + "learning_rate": 0.001, + "loss": 1.1986, + "step": 218800 + }, + { + "epoch": 70.74983839689722, + "grad_norm": 1.494688868522644, + "learning_rate": 0.001, + "loss": 1.2055, + "step": 218900 + }, + { + "epoch": 70.78215901745314, + "grad_norm": 1.8557690382003784, + "learning_rate": 0.001, + "loss": 1.2105, + "step": 219000 + }, + { + "epoch": 70.81447963800905, + "grad_norm": 1.6135377883911133, + "learning_rate": 0.001, + "loss": 1.2151, + "step": 219100 + }, + { + "epoch": 70.84680025856497, + "grad_norm": 1.7288098335266113, + "learning_rate": 0.001, + "loss": 1.2289, + "step": 219200 + }, + { + "epoch": 70.87912087912088, + "grad_norm": 1.838810920715332, + "learning_rate": 0.001, + "loss": 1.2268, + "step": 219300 + }, + { + "epoch": 70.9114414996768, + "grad_norm": 2.151858329772949, + "learning_rate": 0.001, + "loss": 1.2403, + "step": 219400 + }, + { + "epoch": 70.9437621202327, + "grad_norm": 1.791812539100647, + "learning_rate": 0.001, + "loss": 1.2395, + "step": 219500 + }, + { + "epoch": 70.97608274078863, + "grad_norm": 1.4549312591552734, + "learning_rate": 0.001, + "loss": 1.2375, + "step": 219600 + }, + { + "epoch": 71.00840336134453, + "grad_norm": 1.5893354415893555, + "learning_rate": 0.001, + "loss": 1.1986, + "step": 219700 + }, + { + "epoch": 71.04072398190046, + "grad_norm": 1.598484992980957, + "learning_rate": 0.001, + "loss": 1.1016, + "step": 219800 + }, + { + "epoch": 71.07304460245636, + "grad_norm": 1.74235200881958, + "learning_rate": 0.001, + "loss": 1.1297, + "step": 219900 + }, + { + "epoch": 71.10536522301229, + "grad_norm": 2.0175647735595703, + "learning_rate": 0.001, + "loss": 1.1254, + "step": 220000 + }, + { + "epoch": 71.13768584356819, + "grad_norm": 1.877590537071228, + "learning_rate": 0.001, + "loss": 1.1301, + "step": 220100 + }, + { + "epoch": 71.17000646412411, + "grad_norm": 1.6253721714019775, + "learning_rate": 0.001, + "loss": 1.127, + "step": 220200 + }, + { + "epoch": 71.20232708468002, + "grad_norm": 1.6335793733596802, + "learning_rate": 0.001, + "loss": 1.1491, + "step": 220300 + }, + { + "epoch": 71.23464770523594, + "grad_norm": 1.8717635869979858, + "learning_rate": 0.001, + "loss": 1.129, + "step": 220400 + }, + { + "epoch": 71.26696832579185, + "grad_norm": 1.9798500537872314, + "learning_rate": 0.001, + "loss": 1.1498, + "step": 220500 + }, + { + "epoch": 71.29928894634777, + "grad_norm": 1.6062580347061157, + "learning_rate": 0.001, + "loss": 1.149, + "step": 220600 + }, + { + "epoch": 71.33160956690368, + "grad_norm": 1.7187769412994385, + "learning_rate": 0.001, + "loss": 1.1589, + "step": 220700 + }, + { + "epoch": 71.3639301874596, + "grad_norm": 1.6825973987579346, + "learning_rate": 0.001, + "loss": 1.1705, + "step": 220800 + }, + { + "epoch": 71.39625080801551, + "grad_norm": 1.5847103595733643, + "learning_rate": 0.001, + "loss": 1.1657, + "step": 220900 + }, + { + "epoch": 71.42857142857143, + "grad_norm": 1.477260947227478, + "learning_rate": 0.001, + "loss": 1.1714, + "step": 221000 + }, + { + "epoch": 71.46089204912734, + "grad_norm": 2.0396647453308105, + "learning_rate": 0.001, + "loss": 1.1745, + "step": 221100 + }, + { + "epoch": 71.49321266968326, + "grad_norm": 1.5809693336486816, + "learning_rate": 0.001, + "loss": 1.169, + "step": 221200 + }, + { + "epoch": 71.52553329023917, + "grad_norm": 1.5389111042022705, + "learning_rate": 0.001, + "loss": 1.184, + "step": 221300 + }, + { + "epoch": 71.55785391079509, + "grad_norm": 1.5018457174301147, + "learning_rate": 0.001, + "loss": 1.1905, + "step": 221400 + }, + { + "epoch": 71.590174531351, + "grad_norm": 1.8730908632278442, + "learning_rate": 0.001, + "loss": 1.1808, + "step": 221500 + }, + { + "epoch": 71.62249515190692, + "grad_norm": 1.9678943157196045, + "learning_rate": 0.001, + "loss": 1.1696, + "step": 221600 + }, + { + "epoch": 71.65481577246283, + "grad_norm": 1.6994378566741943, + "learning_rate": 0.001, + "loss": 1.1976, + "step": 221700 + }, + { + "epoch": 71.68713639301875, + "grad_norm": 1.820391058921814, + "learning_rate": 0.001, + "loss": 1.2076, + "step": 221800 + }, + { + "epoch": 71.71945701357465, + "grad_norm": 1.6418462991714478, + "learning_rate": 0.001, + "loss": 1.1973, + "step": 221900 + }, + { + "epoch": 71.75177763413058, + "grad_norm": 1.805640459060669, + "learning_rate": 0.001, + "loss": 1.2126, + "step": 222000 + }, + { + "epoch": 71.78409825468648, + "grad_norm": 1.5427037477493286, + "learning_rate": 0.001, + "loss": 1.2058, + "step": 222100 + }, + { + "epoch": 71.8164188752424, + "grad_norm": 1.6487582921981812, + "learning_rate": 0.001, + "loss": 1.2155, + "step": 222200 + }, + { + "epoch": 71.84873949579831, + "grad_norm": 2.0085551738739014, + "learning_rate": 0.001, + "loss": 1.2167, + "step": 222300 + }, + { + "epoch": 71.88106011635423, + "grad_norm": 1.4483178853988647, + "learning_rate": 0.001, + "loss": 1.2155, + "step": 222400 + }, + { + "epoch": 71.91338073691014, + "grad_norm": 1.6381824016571045, + "learning_rate": 0.001, + "loss": 1.2192, + "step": 222500 + }, + { + "epoch": 71.94570135746606, + "grad_norm": 2.2828452587127686, + "learning_rate": 0.001, + "loss": 1.227, + "step": 222600 + }, + { + "epoch": 71.97802197802197, + "grad_norm": 2.1275863647460938, + "learning_rate": 0.001, + "loss": 1.2355, + "step": 222700 + }, + { + "epoch": 72.01034259857789, + "grad_norm": 1.8210406303405762, + "learning_rate": 0.001, + "loss": 1.1774, + "step": 222800 + }, + { + "epoch": 72.04266321913381, + "grad_norm": 1.9121840000152588, + "learning_rate": 0.001, + "loss": 1.102, + "step": 222900 + }, + { + "epoch": 72.07498383968972, + "grad_norm": 1.948397159576416, + "learning_rate": 0.001, + "loss": 1.1163, + "step": 223000 + }, + { + "epoch": 72.10730446024564, + "grad_norm": 2.113853693008423, + "learning_rate": 0.001, + "loss": 1.1221, + "step": 223100 + }, + { + "epoch": 72.13962508080155, + "grad_norm": 1.9064280986785889, + "learning_rate": 0.001, + "loss": 1.1297, + "step": 223200 + }, + { + "epoch": 72.17194570135747, + "grad_norm": 1.8943758010864258, + "learning_rate": 0.001, + "loss": 1.1214, + "step": 223300 + }, + { + "epoch": 72.20426632191338, + "grad_norm": 2.0406997203826904, + "learning_rate": 0.001, + "loss": 1.134, + "step": 223400 + }, + { + "epoch": 72.2365869424693, + "grad_norm": 1.837704062461853, + "learning_rate": 0.001, + "loss": 1.1316, + "step": 223500 + }, + { + "epoch": 72.26890756302521, + "grad_norm": 1.7503560781478882, + "learning_rate": 0.001, + "loss": 1.1263, + "step": 223600 + }, + { + "epoch": 72.30122818358113, + "grad_norm": 1.602188229560852, + "learning_rate": 0.001, + "loss": 1.1543, + "step": 223700 + }, + { + "epoch": 72.33354880413704, + "grad_norm": 1.8570600748062134, + "learning_rate": 0.001, + "loss": 1.159, + "step": 223800 + }, + { + "epoch": 72.36586942469296, + "grad_norm": 1.7517465353012085, + "learning_rate": 0.001, + "loss": 1.1539, + "step": 223900 + }, + { + "epoch": 72.39819004524887, + "grad_norm": 1.8670244216918945, + "learning_rate": 0.001, + "loss": 1.1495, + "step": 224000 + }, + { + "epoch": 72.43051066580479, + "grad_norm": 1.831101655960083, + "learning_rate": 0.001, + "loss": 1.1613, + "step": 224100 + }, + { + "epoch": 72.4628312863607, + "grad_norm": 1.6500862836837769, + "learning_rate": 0.001, + "loss": 1.1413, + "step": 224200 + }, + { + "epoch": 72.49515190691662, + "grad_norm": 1.9786334037780762, + "learning_rate": 0.001, + "loss": 1.1655, + "step": 224300 + }, + { + "epoch": 72.52747252747253, + "grad_norm": 1.7236011028289795, + "learning_rate": 0.001, + "loss": 1.1654, + "step": 224400 + }, + { + "epoch": 72.55979314802845, + "grad_norm": 2.018129587173462, + "learning_rate": 0.001, + "loss": 1.1609, + "step": 224500 + }, + { + "epoch": 72.59211376858435, + "grad_norm": 1.9249852895736694, + "learning_rate": 0.001, + "loss": 1.1889, + "step": 224600 + }, + { + "epoch": 72.62443438914028, + "grad_norm": 1.736116886138916, + "learning_rate": 0.001, + "loss": 1.1828, + "step": 224700 + }, + { + "epoch": 72.65675500969618, + "grad_norm": 1.6699976921081543, + "learning_rate": 0.001, + "loss": 1.173, + "step": 224800 + }, + { + "epoch": 72.6890756302521, + "grad_norm": 2.20131778717041, + "learning_rate": 0.001, + "loss": 1.1921, + "step": 224900 + }, + { + "epoch": 72.72139625080801, + "grad_norm": 1.9852443933486938, + "learning_rate": 0.001, + "loss": 1.1904, + "step": 225000 + }, + { + "epoch": 72.75371687136393, + "grad_norm": 1.9196783304214478, + "learning_rate": 0.001, + "loss": 1.1867, + "step": 225100 + }, + { + "epoch": 72.78603749191984, + "grad_norm": 1.7682510614395142, + "learning_rate": 0.001, + "loss": 1.193, + "step": 225200 + }, + { + "epoch": 72.81835811247576, + "grad_norm": 2.267354726791382, + "learning_rate": 0.001, + "loss": 1.1856, + "step": 225300 + }, + { + "epoch": 72.85067873303167, + "grad_norm": 1.7886394262313843, + "learning_rate": 0.001, + "loss": 1.2008, + "step": 225400 + }, + { + "epoch": 72.88299935358759, + "grad_norm": 1.6785390377044678, + "learning_rate": 0.001, + "loss": 1.2051, + "step": 225500 + }, + { + "epoch": 72.9153199741435, + "grad_norm": 1.8354374170303345, + "learning_rate": 0.001, + "loss": 1.1972, + "step": 225600 + }, + { + "epoch": 72.94764059469942, + "grad_norm": 1.6264238357543945, + "learning_rate": 0.001, + "loss": 1.2162, + "step": 225700 + }, + { + "epoch": 72.97996121525533, + "grad_norm": 2.1394410133361816, + "learning_rate": 0.001, + "loss": 1.223, + "step": 225800 + }, + { + "epoch": 73.01228183581125, + "grad_norm": 1.795905351638794, + "learning_rate": 0.001, + "loss": 1.1582, + "step": 225900 + }, + { + "epoch": 73.04460245636716, + "grad_norm": 2.1484296321868896, + "learning_rate": 0.001, + "loss": 1.0819, + "step": 226000 + }, + { + "epoch": 73.07692307692308, + "grad_norm": 1.72504460811615, + "learning_rate": 0.001, + "loss": 1.0955, + "step": 226100 + }, + { + "epoch": 73.10924369747899, + "grad_norm": 1.5780655145645142, + "learning_rate": 0.001, + "loss": 1.0913, + "step": 226200 + }, + { + "epoch": 73.14156431803491, + "grad_norm": 1.83950674533844, + "learning_rate": 0.001, + "loss": 1.1112, + "step": 226300 + }, + { + "epoch": 73.17388493859082, + "grad_norm": 2.016282320022583, + "learning_rate": 0.001, + "loss": 1.1083, + "step": 226400 + }, + { + "epoch": 73.20620555914674, + "grad_norm": 2.395186185836792, + "learning_rate": 0.001, + "loss": 1.111, + "step": 226500 + }, + { + "epoch": 73.23852617970265, + "grad_norm": 2.154127836227417, + "learning_rate": 0.001, + "loss": 1.1132, + "step": 226600 + }, + { + "epoch": 73.27084680025857, + "grad_norm": 1.6973828077316284, + "learning_rate": 0.001, + "loss": 1.1365, + "step": 226700 + }, + { + "epoch": 73.30316742081448, + "grad_norm": 1.861863613128662, + "learning_rate": 0.001, + "loss": 1.1306, + "step": 226800 + }, + { + "epoch": 73.3354880413704, + "grad_norm": 2.0509049892425537, + "learning_rate": 0.001, + "loss": 1.1463, + "step": 226900 + }, + { + "epoch": 73.3678086619263, + "grad_norm": 1.933553695678711, + "learning_rate": 0.001, + "loss": 1.161, + "step": 227000 + }, + { + "epoch": 73.40012928248223, + "grad_norm": 2.0474693775177, + "learning_rate": 0.001, + "loss": 1.1489, + "step": 227100 + }, + { + "epoch": 73.43244990303813, + "grad_norm": 2.182870388031006, + "learning_rate": 0.001, + "loss": 1.1481, + "step": 227200 + }, + { + "epoch": 73.46477052359405, + "grad_norm": 1.6766440868377686, + "learning_rate": 0.001, + "loss": 1.1566, + "step": 227300 + }, + { + "epoch": 73.49709114414996, + "grad_norm": 2.327120542526245, + "learning_rate": 0.001, + "loss": 1.1424, + "step": 227400 + }, + { + "epoch": 73.52941176470588, + "grad_norm": 1.8558249473571777, + "learning_rate": 0.001, + "loss": 1.1572, + "step": 227500 + }, + { + "epoch": 73.56173238526179, + "grad_norm": 2.0666706562042236, + "learning_rate": 0.001, + "loss": 1.1534, + "step": 227600 + }, + { + "epoch": 73.59405300581771, + "grad_norm": 2.563960313796997, + "learning_rate": 0.001, + "loss": 1.1588, + "step": 227700 + }, + { + "epoch": 73.62637362637362, + "grad_norm": 1.841058373451233, + "learning_rate": 0.001, + "loss": 1.174, + "step": 227800 + }, + { + "epoch": 73.65869424692954, + "grad_norm": 1.7862833738327026, + "learning_rate": 0.001, + "loss": 1.1698, + "step": 227900 + }, + { + "epoch": 73.69101486748545, + "grad_norm": 1.935362458229065, + "learning_rate": 0.001, + "loss": 1.1793, + "step": 228000 + }, + { + "epoch": 73.72333548804137, + "grad_norm": 1.609352707862854, + "learning_rate": 0.001, + "loss": 1.1831, + "step": 228100 + }, + { + "epoch": 73.75565610859728, + "grad_norm": 2.0712058544158936, + "learning_rate": 0.001, + "loss": 1.18, + "step": 228200 + }, + { + "epoch": 73.7879767291532, + "grad_norm": 1.8805397748947144, + "learning_rate": 0.001, + "loss": 1.1897, + "step": 228300 + }, + { + "epoch": 73.82029734970911, + "grad_norm": 1.8375552892684937, + "learning_rate": 0.001, + "loss": 1.1886, + "step": 228400 + }, + { + "epoch": 73.85261797026503, + "grad_norm": 2.0276403427124023, + "learning_rate": 0.001, + "loss": 1.1811, + "step": 228500 + }, + { + "epoch": 73.88493859082094, + "grad_norm": 2.1257636547088623, + "learning_rate": 0.001, + "loss": 1.1942, + "step": 228600 + }, + { + "epoch": 73.91725921137686, + "grad_norm": 1.6965306997299194, + "learning_rate": 0.001, + "loss": 1.2039, + "step": 228700 + }, + { + "epoch": 73.94957983193277, + "grad_norm": 2.1263046264648438, + "learning_rate": 0.001, + "loss": 1.2031, + "step": 228800 + }, + { + "epoch": 73.98190045248869, + "grad_norm": 1.8232370615005493, + "learning_rate": 0.001, + "loss": 1.2104, + "step": 228900 + }, + { + "epoch": 74.01422107304461, + "grad_norm": 2.033893346786499, + "learning_rate": 0.001, + "loss": 1.1334, + "step": 229000 + }, + { + "epoch": 74.04654169360052, + "grad_norm": 1.9348260164260864, + "learning_rate": 0.001, + "loss": 1.0812, + "step": 229100 + }, + { + "epoch": 74.07886231415644, + "grad_norm": 2.0578017234802246, + "learning_rate": 0.001, + "loss": 1.0994, + "step": 229200 + }, + { + "epoch": 74.11118293471235, + "grad_norm": 2.131514072418213, + "learning_rate": 0.001, + "loss": 1.1066, + "step": 229300 + }, + { + "epoch": 74.14350355526827, + "grad_norm": 2.09892201423645, + "learning_rate": 0.001, + "loss": 1.0978, + "step": 229400 + }, + { + "epoch": 74.17582417582418, + "grad_norm": 2.2361555099487305, + "learning_rate": 0.001, + "loss": 1.0948, + "step": 229500 + }, + { + "epoch": 74.2081447963801, + "grad_norm": 2.1429715156555176, + "learning_rate": 0.001, + "loss": 1.12, + "step": 229600 + }, + { + "epoch": 74.240465416936, + "grad_norm": 1.7811630964279175, + "learning_rate": 0.001, + "loss": 1.1277, + "step": 229700 + }, + { + "epoch": 74.27278603749193, + "grad_norm": 2.2443344593048096, + "learning_rate": 0.001, + "loss": 1.1082, + "step": 229800 + }, + { + "epoch": 74.30510665804783, + "grad_norm": 2.121556520462036, + "learning_rate": 0.001, + "loss": 1.1229, + "step": 229900 + }, + { + "epoch": 74.33742727860376, + "grad_norm": 2.5482897758483887, + "learning_rate": 0.001, + "loss": 1.1368, + "step": 230000 + }, + { + "epoch": 74.36974789915966, + "grad_norm": 1.9960894584655762, + "learning_rate": 0.001, + "loss": 1.1245, + "step": 230100 + }, + { + "epoch": 74.40206851971558, + "grad_norm": 2.089287757873535, + "learning_rate": 0.001, + "loss": 1.1405, + "step": 230200 + }, + { + "epoch": 74.43438914027149, + "grad_norm": 1.8604276180267334, + "learning_rate": 0.001, + "loss": 1.1538, + "step": 230300 + }, + { + "epoch": 74.46670976082741, + "grad_norm": 1.9729381799697876, + "learning_rate": 0.001, + "loss": 1.1571, + "step": 230400 + }, + { + "epoch": 74.49903038138332, + "grad_norm": 1.8836462497711182, + "learning_rate": 0.001, + "loss": 1.1498, + "step": 230500 + }, + { + "epoch": 74.53135100193924, + "grad_norm": 1.781795859336853, + "learning_rate": 0.001, + "loss": 1.1463, + "step": 230600 + }, + { + "epoch": 74.56367162249515, + "grad_norm": 2.167222023010254, + "learning_rate": 0.001, + "loss": 1.1387, + "step": 230700 + }, + { + "epoch": 74.59599224305107, + "grad_norm": 2.2089223861694336, + "learning_rate": 0.001, + "loss": 1.1499, + "step": 230800 + }, + { + "epoch": 74.62831286360698, + "grad_norm": 2.309671401977539, + "learning_rate": 0.001, + "loss": 1.1569, + "step": 230900 + }, + { + "epoch": 74.6606334841629, + "grad_norm": 1.9005663394927979, + "learning_rate": 0.001, + "loss": 1.1723, + "step": 231000 + }, + { + "epoch": 74.69295410471881, + "grad_norm": 1.8752561807632446, + "learning_rate": 0.001, + "loss": 1.1675, + "step": 231100 + }, + { + "epoch": 74.72527472527473, + "grad_norm": 2.7028305530548096, + "learning_rate": 0.001, + "loss": 1.1656, + "step": 231200 + }, + { + "epoch": 74.75759534583064, + "grad_norm": 2.19155216217041, + "learning_rate": 0.001, + "loss": 1.1671, + "step": 231300 + }, + { + "epoch": 74.78991596638656, + "grad_norm": 2.1153290271759033, + "learning_rate": 0.001, + "loss": 1.1774, + "step": 231400 + }, + { + "epoch": 74.82223658694247, + "grad_norm": 1.7875677347183228, + "learning_rate": 0.001, + "loss": 1.1752, + "step": 231500 + }, + { + "epoch": 74.85455720749839, + "grad_norm": 2.1596906185150146, + "learning_rate": 0.001, + "loss": 1.1678, + "step": 231600 + }, + { + "epoch": 74.8868778280543, + "grad_norm": 2.106273889541626, + "learning_rate": 0.001, + "loss": 1.1814, + "step": 231700 + }, + { + "epoch": 74.91919844861022, + "grad_norm": 2.4843180179595947, + "learning_rate": 0.001, + "loss": 1.1823, + "step": 231800 + }, + { + "epoch": 74.95151906916612, + "grad_norm": 2.45287823677063, + "learning_rate": 0.001, + "loss": 1.1806, + "step": 231900 + }, + { + "epoch": 74.98383968972205, + "grad_norm": 2.348428249359131, + "learning_rate": 0.001, + "loss": 1.1847, + "step": 232000 + }, + { + "epoch": 75.01616031027795, + "grad_norm": 2.1072657108306885, + "learning_rate": 0.001, + "loss": 1.1007, + "step": 232100 + }, + { + "epoch": 75.04848093083388, + "grad_norm": 2.227376699447632, + "learning_rate": 0.001, + "loss": 1.0656, + "step": 232200 + }, + { + "epoch": 75.08080155138978, + "grad_norm": 2.6325557231903076, + "learning_rate": 0.001, + "loss": 1.0806, + "step": 232300 + }, + { + "epoch": 75.1131221719457, + "grad_norm": 2.4202475547790527, + "learning_rate": 0.001, + "loss": 1.0943, + "step": 232400 + }, + { + "epoch": 75.14544279250161, + "grad_norm": 2.6263668537139893, + "learning_rate": 0.001, + "loss": 1.0991, + "step": 232500 + }, + { + "epoch": 75.17776341305753, + "grad_norm": 2.652233600616455, + "learning_rate": 0.001, + "loss": 1.0959, + "step": 232600 + }, + { + "epoch": 75.21008403361344, + "grad_norm": 2.31381893157959, + "learning_rate": 0.001, + "loss": 1.1064, + "step": 232700 + }, + { + "epoch": 75.24240465416936, + "grad_norm": 2.8877060413360596, + "learning_rate": 0.001, + "loss": 1.101, + "step": 232800 + }, + { + "epoch": 75.27472527472527, + "grad_norm": 3.5789542198181152, + "learning_rate": 0.001, + "loss": 1.1082, + "step": 232900 + }, + { + "epoch": 75.30704589528119, + "grad_norm": 2.622349500656128, + "learning_rate": 0.001, + "loss": 1.101, + "step": 233000 + }, + { + "epoch": 75.3393665158371, + "grad_norm": 2.9478626251220703, + "learning_rate": 0.001, + "loss": 1.1134, + "step": 233100 + }, + { + "epoch": 75.37168713639302, + "grad_norm": 2.258899688720703, + "learning_rate": 0.001, + "loss": 1.1272, + "step": 233200 + }, + { + "epoch": 75.40400775694893, + "grad_norm": 2.8616788387298584, + "learning_rate": 0.001, + "loss": 1.1265, + "step": 233300 + }, + { + "epoch": 75.43632837750485, + "grad_norm": 2.5713586807250977, + "learning_rate": 0.001, + "loss": 1.1222, + "step": 233400 + }, + { + "epoch": 75.46864899806076, + "grad_norm": 2.9132118225097656, + "learning_rate": 0.001, + "loss": 1.129, + "step": 233500 + }, + { + "epoch": 75.50096961861668, + "grad_norm": 2.412076473236084, + "learning_rate": 0.001, + "loss": 1.1272, + "step": 233600 + }, + { + "epoch": 75.53329023917259, + "grad_norm": 2.0582900047302246, + "learning_rate": 0.001, + "loss": 1.1382, + "step": 233700 + }, + { + "epoch": 75.56561085972851, + "grad_norm": 2.4409642219543457, + "learning_rate": 0.001, + "loss": 1.1436, + "step": 233800 + }, + { + "epoch": 75.59793148028442, + "grad_norm": 2.7730462551116943, + "learning_rate": 0.001, + "loss": 1.1466, + "step": 233900 + }, + { + "epoch": 75.63025210084034, + "grad_norm": 2.9127862453460693, + "learning_rate": 0.001, + "loss": 1.1634, + "step": 234000 + }, + { + "epoch": 75.66257272139624, + "grad_norm": 2.6338980197906494, + "learning_rate": 0.001, + "loss": 1.1515, + "step": 234100 + }, + { + "epoch": 75.69489334195217, + "grad_norm": 2.3682878017425537, + "learning_rate": 0.001, + "loss": 1.1716, + "step": 234200 + }, + { + "epoch": 75.72721396250807, + "grad_norm": 3.0970637798309326, + "learning_rate": 0.001, + "loss": 1.1681, + "step": 234300 + }, + { + "epoch": 75.759534583064, + "grad_norm": 3.246640682220459, + "learning_rate": 0.001, + "loss": 1.1656, + "step": 234400 + }, + { + "epoch": 75.7918552036199, + "grad_norm": 2.827653169631958, + "learning_rate": 0.001, + "loss": 1.1712, + "step": 234500 + }, + { + "epoch": 75.82417582417582, + "grad_norm": 2.37351393699646, + "learning_rate": 0.001, + "loss": 1.1767, + "step": 234600 + }, + { + "epoch": 75.85649644473173, + "grad_norm": 2.936460018157959, + "learning_rate": 0.001, + "loss": 1.1798, + "step": 234700 + }, + { + "epoch": 75.88881706528765, + "grad_norm": 2.810807228088379, + "learning_rate": 0.001, + "loss": 1.1756, + "step": 234800 + }, + { + "epoch": 75.92113768584356, + "grad_norm": 2.5156216621398926, + "learning_rate": 0.001, + "loss": 1.1682, + "step": 234900 + }, + { + "epoch": 75.95345830639948, + "grad_norm": 2.3264551162719727, + "learning_rate": 0.001, + "loss": 1.1573, + "step": 235000 + }, + { + "epoch": 75.98577892695539, + "grad_norm": 2.1307079792022705, + "learning_rate": 0.001, + "loss": 1.189, + "step": 235100 + }, + { + "epoch": 76.01809954751131, + "grad_norm": 1.8315860033035278, + "learning_rate": 0.001, + "loss": 1.1347, + "step": 235200 + }, + { + "epoch": 76.05042016806723, + "grad_norm": 1.5642365217208862, + "learning_rate": 0.001, + "loss": 1.062, + "step": 235300 + }, + { + "epoch": 76.08274078862314, + "grad_norm": 2.2328715324401855, + "learning_rate": 0.001, + "loss": 1.0708, + "step": 235400 + }, + { + "epoch": 76.11506140917906, + "grad_norm": 1.7951757907867432, + "learning_rate": 0.001, + "loss": 1.0796, + "step": 235500 + }, + { + "epoch": 76.14738202973497, + "grad_norm": 1.8427295684814453, + "learning_rate": 0.001, + "loss": 1.0759, + "step": 235600 + }, + { + "epoch": 76.17970265029089, + "grad_norm": 1.8815374374389648, + "learning_rate": 0.001, + "loss": 1.0794, + "step": 235700 + }, + { + "epoch": 76.2120232708468, + "grad_norm": 2.425978899002075, + "learning_rate": 0.001, + "loss": 1.1024, + "step": 235800 + }, + { + "epoch": 76.24434389140272, + "grad_norm": 2.3707029819488525, + "learning_rate": 0.001, + "loss": 1.0746, + "step": 235900 + }, + { + "epoch": 76.27666451195863, + "grad_norm": 2.2675940990448, + "learning_rate": 0.001, + "loss": 1.1096, + "step": 236000 + }, + { + "epoch": 76.30898513251455, + "grad_norm": 1.627684473991394, + "learning_rate": 0.001, + "loss": 1.1072, + "step": 236100 + }, + { + "epoch": 76.34130575307046, + "grad_norm": 1.8927665948867798, + "learning_rate": 0.001, + "loss": 1.1098, + "step": 236200 + }, + { + "epoch": 76.37362637362638, + "grad_norm": 1.8863893747329712, + "learning_rate": 0.001, + "loss": 1.0979, + "step": 236300 + }, + { + "epoch": 76.40594699418229, + "grad_norm": 2.0543971061706543, + "learning_rate": 0.001, + "loss": 1.1123, + "step": 236400 + }, + { + "epoch": 76.43826761473821, + "grad_norm": 2.1276564598083496, + "learning_rate": 0.001, + "loss": 1.1174, + "step": 236500 + }, + { + "epoch": 76.47058823529412, + "grad_norm": 2.0694077014923096, + "learning_rate": 0.001, + "loss": 1.1169, + "step": 236600 + }, + { + "epoch": 76.50290885585004, + "grad_norm": 1.7249040603637695, + "learning_rate": 0.001, + "loss": 1.1289, + "step": 236700 + }, + { + "epoch": 76.53522947640595, + "grad_norm": 2.4833602905273438, + "learning_rate": 0.001, + "loss": 1.1457, + "step": 236800 + }, + { + "epoch": 76.56755009696187, + "grad_norm": 1.8441822528839111, + "learning_rate": 0.001, + "loss": 1.122, + "step": 236900 + }, + { + "epoch": 76.59987071751777, + "grad_norm": 1.9829216003417969, + "learning_rate": 0.001, + "loss": 1.134, + "step": 237000 + }, + { + "epoch": 76.6321913380737, + "grad_norm": 2.1754186153411865, + "learning_rate": 0.001, + "loss": 1.1358, + "step": 237100 + }, + { + "epoch": 76.6645119586296, + "grad_norm": 1.6874982118606567, + "learning_rate": 0.001, + "loss": 1.1385, + "step": 237200 + }, + { + "epoch": 76.69683257918552, + "grad_norm": 1.662231683731079, + "learning_rate": 0.001, + "loss": 1.1278, + "step": 237300 + }, + { + "epoch": 76.72915319974143, + "grad_norm": 1.977017879486084, + "learning_rate": 0.001, + "loss": 1.1418, + "step": 237400 + }, + { + "epoch": 76.76147382029735, + "grad_norm": 1.5747963190078735, + "learning_rate": 0.001, + "loss": 1.1458, + "step": 237500 + }, + { + "epoch": 76.79379444085326, + "grad_norm": 2.3878796100616455, + "learning_rate": 0.001, + "loss": 1.1603, + "step": 237600 + }, + { + "epoch": 76.82611506140918, + "grad_norm": 2.2227580547332764, + "learning_rate": 0.001, + "loss": 1.174, + "step": 237700 + }, + { + "epoch": 76.85843568196509, + "grad_norm": 2.162853479385376, + "learning_rate": 0.001, + "loss": 1.1545, + "step": 237800 + }, + { + "epoch": 76.89075630252101, + "grad_norm": 1.6438781023025513, + "learning_rate": 0.001, + "loss": 1.1794, + "step": 237900 + }, + { + "epoch": 76.92307692307692, + "grad_norm": 1.6767332553863525, + "learning_rate": 0.001, + "loss": 1.1709, + "step": 238000 + }, + { + "epoch": 76.95539754363284, + "grad_norm": 1.6060292720794678, + "learning_rate": 0.001, + "loss": 1.1717, + "step": 238100 + }, + { + "epoch": 76.98771816418875, + "grad_norm": 1.7817925214767456, + "learning_rate": 0.001, + "loss": 1.1818, + "step": 238200 + }, + { + "epoch": 77.02003878474467, + "grad_norm": 1.6019141674041748, + "learning_rate": 0.001, + "loss": 1.1144, + "step": 238300 + }, + { + "epoch": 77.05235940530058, + "grad_norm": 1.992630124092102, + "learning_rate": 0.001, + "loss": 1.0535, + "step": 238400 + }, + { + "epoch": 77.0846800258565, + "grad_norm": 1.9400583505630493, + "learning_rate": 0.001, + "loss": 1.0624, + "step": 238500 + }, + { + "epoch": 77.11700064641241, + "grad_norm": 1.91287100315094, + "learning_rate": 0.001, + "loss": 1.0571, + "step": 238600 + }, + { + "epoch": 77.14932126696833, + "grad_norm": 1.7860596179962158, + "learning_rate": 0.001, + "loss": 1.0697, + "step": 238700 + }, + { + "epoch": 77.18164188752424, + "grad_norm": 1.6752883195877075, + "learning_rate": 0.001, + "loss": 1.0746, + "step": 238800 + }, + { + "epoch": 77.21396250808016, + "grad_norm": 1.7331321239471436, + "learning_rate": 0.001, + "loss": 1.0964, + "step": 238900 + }, + { + "epoch": 77.24628312863607, + "grad_norm": 1.943953275680542, + "learning_rate": 0.001, + "loss": 1.0953, + "step": 239000 + }, + { + "epoch": 77.27860374919199, + "grad_norm": 1.7045990228652954, + "learning_rate": 0.001, + "loss": 1.0782, + "step": 239100 + }, + { + "epoch": 77.3109243697479, + "grad_norm": 2.1768674850463867, + "learning_rate": 0.001, + "loss": 1.0918, + "step": 239200 + }, + { + "epoch": 77.34324499030382, + "grad_norm": 1.8054300546646118, + "learning_rate": 0.001, + "loss": 1.0982, + "step": 239300 + }, + { + "epoch": 77.37556561085972, + "grad_norm": 1.6111589670181274, + "learning_rate": 0.001, + "loss": 1.094, + "step": 239400 + }, + { + "epoch": 77.40788623141565, + "grad_norm": 1.7453973293304443, + "learning_rate": 0.001, + "loss": 1.1012, + "step": 239500 + }, + { + "epoch": 77.44020685197155, + "grad_norm": 2.423070192337036, + "learning_rate": 0.001, + "loss": 1.1149, + "step": 239600 + }, + { + "epoch": 77.47252747252747, + "grad_norm": 2.0236239433288574, + "learning_rate": 0.001, + "loss": 1.1102, + "step": 239700 + }, + { + "epoch": 77.50484809308338, + "grad_norm": 2.0543761253356934, + "learning_rate": 0.001, + "loss": 1.1263, + "step": 239800 + }, + { + "epoch": 77.5371687136393, + "grad_norm": 1.643644094467163, + "learning_rate": 0.001, + "loss": 1.1168, + "step": 239900 + }, + { + "epoch": 77.56948933419521, + "grad_norm": 1.8869572877883911, + "learning_rate": 0.001, + "loss": 1.125, + "step": 240000 + }, + { + "epoch": 77.60180995475113, + "grad_norm": 1.5091863870620728, + "learning_rate": 0.001, + "loss": 1.1353, + "step": 240100 + }, + { + "epoch": 77.63413057530704, + "grad_norm": 1.7741518020629883, + "learning_rate": 0.001, + "loss": 1.1292, + "step": 240200 + }, + { + "epoch": 77.66645119586296, + "grad_norm": 1.767402172088623, + "learning_rate": 0.001, + "loss": 1.1508, + "step": 240300 + }, + { + "epoch": 77.69877181641887, + "grad_norm": 1.9282313585281372, + "learning_rate": 0.001, + "loss": 1.1398, + "step": 240400 + }, + { + "epoch": 77.73109243697479, + "grad_norm": 1.8274956941604614, + "learning_rate": 0.001, + "loss": 1.1345, + "step": 240500 + }, + { + "epoch": 77.7634130575307, + "grad_norm": 1.7720264196395874, + "learning_rate": 0.001, + "loss": 1.1591, + "step": 240600 + }, + { + "epoch": 77.79573367808662, + "grad_norm": 1.7153974771499634, + "learning_rate": 0.001, + "loss": 1.1481, + "step": 240700 + }, + { + "epoch": 77.82805429864253, + "grad_norm": 2.328188180923462, + "learning_rate": 0.001, + "loss": 1.1558, + "step": 240800 + }, + { + "epoch": 77.86037491919845, + "grad_norm": 1.966943383216858, + "learning_rate": 0.001, + "loss": 1.1406, + "step": 240900 + }, + { + "epoch": 77.89269553975436, + "grad_norm": 1.7131937742233276, + "learning_rate": 0.001, + "loss": 1.1459, + "step": 241000 + }, + { + "epoch": 77.92501616031028, + "grad_norm": 1.452742576599121, + "learning_rate": 0.001, + "loss": 1.1633, + "step": 241100 + }, + { + "epoch": 77.95733678086619, + "grad_norm": 1.5621833801269531, + "learning_rate": 0.001, + "loss": 1.1689, + "step": 241200 + }, + { + "epoch": 77.98965740142211, + "grad_norm": 2.072866916656494, + "learning_rate": 0.001, + "loss": 1.1554, + "step": 241300 + }, + { + "epoch": 78.02197802197803, + "grad_norm": 1.7882293462753296, + "learning_rate": 0.001, + "loss": 1.0927, + "step": 241400 + }, + { + "epoch": 78.05429864253394, + "grad_norm": 1.8420997858047485, + "learning_rate": 0.001, + "loss": 1.051, + "step": 241500 + }, + { + "epoch": 78.08661926308986, + "grad_norm": 1.671029806137085, + "learning_rate": 0.001, + "loss": 1.0459, + "step": 241600 + }, + { + "epoch": 78.11893988364577, + "grad_norm": 2.1033594608306885, + "learning_rate": 0.001, + "loss": 1.0547, + "step": 241700 + }, + { + "epoch": 78.15126050420169, + "grad_norm": 1.9632328748703003, + "learning_rate": 0.001, + "loss": 1.0743, + "step": 241800 + }, + { + "epoch": 78.1835811247576, + "grad_norm": 1.8516623973846436, + "learning_rate": 0.001, + "loss": 1.0665, + "step": 241900 + }, + { + "epoch": 78.21590174531352, + "grad_norm": 1.782353162765503, + "learning_rate": 0.001, + "loss": 1.081, + "step": 242000 + }, + { + "epoch": 78.24822236586942, + "grad_norm": 2.159865140914917, + "learning_rate": 0.001, + "loss": 1.0792, + "step": 242100 + }, + { + "epoch": 78.28054298642535, + "grad_norm": 1.7599161863327026, + "learning_rate": 0.001, + "loss": 1.0962, + "step": 242200 + }, + { + "epoch": 78.31286360698125, + "grad_norm": 1.6156737804412842, + "learning_rate": 0.001, + "loss": 1.0881, + "step": 242300 + }, + { + "epoch": 78.34518422753717, + "grad_norm": 1.9483507871627808, + "learning_rate": 0.001, + "loss": 1.093, + "step": 242400 + }, + { + "epoch": 78.37750484809308, + "grad_norm": 1.5878406763076782, + "learning_rate": 0.001, + "loss": 1.0838, + "step": 242500 + }, + { + "epoch": 78.409825468649, + "grad_norm": 1.6766425371170044, + "learning_rate": 0.001, + "loss": 1.0998, + "step": 242600 + }, + { + "epoch": 78.44214608920491, + "grad_norm": 1.4642685651779175, + "learning_rate": 0.001, + "loss": 1.0929, + "step": 242700 + }, + { + "epoch": 78.47446670976083, + "grad_norm": 1.8012272119522095, + "learning_rate": 0.001, + "loss": 1.1047, + "step": 242800 + }, + { + "epoch": 78.50678733031674, + "grad_norm": 1.8630805015563965, + "learning_rate": 0.001, + "loss": 1.1029, + "step": 242900 + }, + { + "epoch": 78.53910795087266, + "grad_norm": 1.4753458499908447, + "learning_rate": 0.001, + "loss": 1.1177, + "step": 243000 + }, + { + "epoch": 78.57142857142857, + "grad_norm": 2.500214099884033, + "learning_rate": 0.001, + "loss": 1.109, + "step": 243100 + }, + { + "epoch": 78.60374919198449, + "grad_norm": 1.680084466934204, + "learning_rate": 0.001, + "loss": 1.1114, + "step": 243200 + }, + { + "epoch": 78.6360698125404, + "grad_norm": 1.6189630031585693, + "learning_rate": 0.001, + "loss": 1.124, + "step": 243300 + }, + { + "epoch": 78.66839043309632, + "grad_norm": 1.7440742254257202, + "learning_rate": 0.001, + "loss": 1.1218, + "step": 243400 + }, + { + "epoch": 78.70071105365223, + "grad_norm": 1.8845442533493042, + "learning_rate": 0.001, + "loss": 1.1245, + "step": 243500 + }, + { + "epoch": 78.73303167420815, + "grad_norm": 1.6810959577560425, + "learning_rate": 0.001, + "loss": 1.123, + "step": 243600 + }, + { + "epoch": 78.76535229476406, + "grad_norm": 2.0967421531677246, + "learning_rate": 0.001, + "loss": 1.1421, + "step": 243700 + }, + { + "epoch": 78.79767291531998, + "grad_norm": 1.5401570796966553, + "learning_rate": 0.001, + "loss": 1.1464, + "step": 243800 + }, + { + "epoch": 78.82999353587589, + "grad_norm": 1.8480286598205566, + "learning_rate": 0.001, + "loss": 1.1326, + "step": 243900 + }, + { + "epoch": 78.86231415643181, + "grad_norm": 1.814274787902832, + "learning_rate": 0.001, + "loss": 1.1383, + "step": 244000 + }, + { + "epoch": 78.89463477698771, + "grad_norm": 2.2483580112457275, + "learning_rate": 0.001, + "loss": 1.157, + "step": 244100 + }, + { + "epoch": 78.92695539754364, + "grad_norm": 1.8789129257202148, + "learning_rate": 0.001, + "loss": 1.1414, + "step": 244200 + }, + { + "epoch": 78.95927601809954, + "grad_norm": 1.6194325685501099, + "learning_rate": 0.001, + "loss": 1.1507, + "step": 244300 + }, + { + "epoch": 78.99159663865547, + "grad_norm": 1.7908927202224731, + "learning_rate": 0.001, + "loss": 1.1566, + "step": 244400 + }, + { + "epoch": 79.02391725921137, + "grad_norm": 2.1163265705108643, + "learning_rate": 0.001, + "loss": 1.0654, + "step": 244500 + }, + { + "epoch": 79.0562378797673, + "grad_norm": 1.7553027868270874, + "learning_rate": 0.001, + "loss": 1.0315, + "step": 244600 + }, + { + "epoch": 79.0885585003232, + "grad_norm": 1.6223750114440918, + "learning_rate": 0.001, + "loss": 1.0521, + "step": 244700 + }, + { + "epoch": 79.12087912087912, + "grad_norm": 1.7326061725616455, + "learning_rate": 0.001, + "loss": 1.0585, + "step": 244800 + }, + { + "epoch": 79.15319974143503, + "grad_norm": 2.0212154388427734, + "learning_rate": 0.001, + "loss": 1.0589, + "step": 244900 + }, + { + "epoch": 79.18552036199095, + "grad_norm": 1.9795660972595215, + "learning_rate": 0.001, + "loss": 1.0626, + "step": 245000 + }, + { + "epoch": 79.21784098254686, + "grad_norm": 2.4282681941986084, + "learning_rate": 0.001, + "loss": 1.0559, + "step": 245100 + }, + { + "epoch": 79.25016160310278, + "grad_norm": 1.6875724792480469, + "learning_rate": 0.001, + "loss": 1.083, + "step": 245200 + }, + { + "epoch": 79.28248222365869, + "grad_norm": 2.1071879863739014, + "learning_rate": 0.001, + "loss": 1.0644, + "step": 245300 + }, + { + "epoch": 79.31480284421461, + "grad_norm": 1.889708161354065, + "learning_rate": 0.001, + "loss": 1.0704, + "step": 245400 + }, + { + "epoch": 79.34712346477052, + "grad_norm": 1.9262644052505493, + "learning_rate": 0.001, + "loss": 1.0836, + "step": 245500 + }, + { + "epoch": 79.37944408532644, + "grad_norm": 1.769736886024475, + "learning_rate": 0.001, + "loss": 1.0883, + "step": 245600 + }, + { + "epoch": 79.41176470588235, + "grad_norm": 1.7323601245880127, + "learning_rate": 0.001, + "loss": 1.0888, + "step": 245700 + }, + { + "epoch": 79.44408532643827, + "grad_norm": 1.7969982624053955, + "learning_rate": 0.001, + "loss": 1.0957, + "step": 245800 + }, + { + "epoch": 79.47640594699418, + "grad_norm": 1.693835735321045, + "learning_rate": 0.001, + "loss": 1.0999, + "step": 245900 + }, + { + "epoch": 79.5087265675501, + "grad_norm": 1.5995159149169922, + "learning_rate": 0.001, + "loss": 1.0871, + "step": 246000 + }, + { + "epoch": 79.541047188106, + "grad_norm": 1.8406943082809448, + "learning_rate": 0.001, + "loss": 1.0955, + "step": 246100 + }, + { + "epoch": 79.57336780866193, + "grad_norm": 1.8488978147506714, + "learning_rate": 0.001, + "loss": 1.1048, + "step": 246200 + }, + { + "epoch": 79.60568842921784, + "grad_norm": 1.6870185136795044, + "learning_rate": 0.001, + "loss": 1.0973, + "step": 246300 + }, + { + "epoch": 79.63800904977376, + "grad_norm": 1.5949409008026123, + "learning_rate": 0.001, + "loss": 1.1263, + "step": 246400 + }, + { + "epoch": 79.67032967032966, + "grad_norm": 1.8820393085479736, + "learning_rate": 0.001, + "loss": 1.1124, + "step": 246500 + }, + { + "epoch": 79.70265029088559, + "grad_norm": 1.714228868484497, + "learning_rate": 0.001, + "loss": 1.1208, + "step": 246600 + }, + { + "epoch": 79.7349709114415, + "grad_norm": 1.7019908428192139, + "learning_rate": 0.001, + "loss": 1.1255, + "step": 246700 + }, + { + "epoch": 79.76729153199742, + "grad_norm": 1.826647162437439, + "learning_rate": 0.001, + "loss": 1.1164, + "step": 246800 + }, + { + "epoch": 79.79961215255332, + "grad_norm": 1.647612452507019, + "learning_rate": 0.001, + "loss": 1.1187, + "step": 246900 + }, + { + "epoch": 79.83193277310924, + "grad_norm": 1.8978357315063477, + "learning_rate": 0.001, + "loss": 1.1302, + "step": 247000 + }, + { + "epoch": 79.86425339366515, + "grad_norm": 1.7612745761871338, + "learning_rate": 0.001, + "loss": 1.1271, + "step": 247100 + }, + { + "epoch": 79.89657401422107, + "grad_norm": 1.9454165697097778, + "learning_rate": 0.001, + "loss": 1.146, + "step": 247200 + }, + { + "epoch": 79.92889463477698, + "grad_norm": 1.8270703554153442, + "learning_rate": 0.001, + "loss": 1.1378, + "step": 247300 + }, + { + "epoch": 79.9612152553329, + "grad_norm": 1.6154886484146118, + "learning_rate": 0.001, + "loss": 1.1621, + "step": 247400 + }, + { + "epoch": 79.99353587588882, + "grad_norm": 2.264509916305542, + "learning_rate": 0.001, + "loss": 1.1423, + "step": 247500 + }, + { + "epoch": 80.02585649644473, + "grad_norm": 2.3280816078186035, + "learning_rate": 0.001, + "loss": 1.0525, + "step": 247600 + }, + { + "epoch": 80.05817711700065, + "grad_norm": 1.731689453125, + "learning_rate": 0.001, + "loss": 1.0274, + "step": 247700 + }, + { + "epoch": 80.09049773755656, + "grad_norm": 2.1244609355926514, + "learning_rate": 0.001, + "loss": 1.031, + "step": 247800 + }, + { + "epoch": 80.12281835811248, + "grad_norm": 1.8890272378921509, + "learning_rate": 0.001, + "loss": 1.0379, + "step": 247900 + }, + { + "epoch": 80.15513897866839, + "grad_norm": 2.0371882915496826, + "learning_rate": 0.001, + "loss": 1.0401, + "step": 248000 + }, + { + "epoch": 80.18745959922431, + "grad_norm": 1.8187429904937744, + "learning_rate": 0.001, + "loss": 1.0508, + "step": 248100 + }, + { + "epoch": 80.21978021978022, + "grad_norm": 1.776618480682373, + "learning_rate": 0.001, + "loss": 1.0607, + "step": 248200 + }, + { + "epoch": 80.25210084033614, + "grad_norm": 1.967862844467163, + "learning_rate": 0.001, + "loss": 1.0589, + "step": 248300 + }, + { + "epoch": 80.28442146089205, + "grad_norm": 1.8256206512451172, + "learning_rate": 0.001, + "loss": 1.066, + "step": 248400 + }, + { + "epoch": 80.31674208144797, + "grad_norm": 2.1693742275238037, + "learning_rate": 0.001, + "loss": 1.0786, + "step": 248500 + }, + { + "epoch": 80.34906270200388, + "grad_norm": 2.159891128540039, + "learning_rate": 0.001, + "loss": 1.0698, + "step": 248600 + }, + { + "epoch": 80.3813833225598, + "grad_norm": 1.8442610502243042, + "learning_rate": 0.001, + "loss": 1.0768, + "step": 248700 + }, + { + "epoch": 80.4137039431157, + "grad_norm": 1.802513599395752, + "learning_rate": 0.001, + "loss": 1.0811, + "step": 248800 + }, + { + "epoch": 80.44602456367163, + "grad_norm": 1.897873878479004, + "learning_rate": 0.001, + "loss": 1.0844, + "step": 248900 + }, + { + "epoch": 80.47834518422754, + "grad_norm": 2.003659248352051, + "learning_rate": 0.001, + "loss": 1.0747, + "step": 249000 + }, + { + "epoch": 80.51066580478346, + "grad_norm": 1.620818853378296, + "learning_rate": 0.001, + "loss": 1.0853, + "step": 249100 + }, + { + "epoch": 80.54298642533936, + "grad_norm": 1.9488434791564941, + "learning_rate": 0.001, + "loss": 1.0955, + "step": 249200 + }, + { + "epoch": 80.57530704589529, + "grad_norm": 1.8911210298538208, + "learning_rate": 0.001, + "loss": 1.1133, + "step": 249300 + }, + { + "epoch": 80.6076276664512, + "grad_norm": 2.607034206390381, + "learning_rate": 0.001, + "loss": 1.0753, + "step": 249400 + }, + { + "epoch": 80.63994828700712, + "grad_norm": 1.5252186059951782, + "learning_rate": 0.001, + "loss": 1.0995, + "step": 249500 + }, + { + "epoch": 80.67226890756302, + "grad_norm": 1.8382611274719238, + "learning_rate": 0.001, + "loss": 1.1067, + "step": 249600 + }, + { + "epoch": 80.70458952811894, + "grad_norm": 1.9256694316864014, + "learning_rate": 0.001, + "loss": 1.113, + "step": 249700 + }, + { + "epoch": 80.73691014867485, + "grad_norm": 2.985775947570801, + "learning_rate": 0.001, + "loss": 1.1075, + "step": 249800 + }, + { + "epoch": 80.76923076923077, + "grad_norm": 1.662009835243225, + "learning_rate": 0.001, + "loss": 1.1271, + "step": 249900 + }, + { + "epoch": 80.80155138978668, + "grad_norm": 2.561980724334717, + "learning_rate": 0.001, + "loss": 1.1167, + "step": 250000 + }, + { + "epoch": 80.8338720103426, + "grad_norm": 1.9441149234771729, + "learning_rate": 0.001, + "loss": 1.1162, + "step": 250100 + }, + { + "epoch": 80.86619263089851, + "grad_norm": 2.120574474334717, + "learning_rate": 0.001, + "loss": 1.1231, + "step": 250200 + }, + { + "epoch": 80.89851325145443, + "grad_norm": 1.7940996885299683, + "learning_rate": 0.001, + "loss": 1.1185, + "step": 250300 + }, + { + "epoch": 80.93083387201034, + "grad_norm": 2.060868263244629, + "learning_rate": 0.001, + "loss": 1.1226, + "step": 250400 + }, + { + "epoch": 80.96315449256626, + "grad_norm": 1.9433925151824951, + "learning_rate": 0.001, + "loss": 1.1511, + "step": 250500 + }, + { + "epoch": 80.99547511312217, + "grad_norm": 2.249880075454712, + "learning_rate": 0.001, + "loss": 1.1191, + "step": 250600 + }, + { + "epoch": 81.02779573367809, + "grad_norm": 2.3257598876953125, + "learning_rate": 0.001, + "loss": 1.0245, + "step": 250700 + }, + { + "epoch": 81.060116354234, + "grad_norm": 2.1970770359039307, + "learning_rate": 0.001, + "loss": 1.02, + "step": 250800 + }, + { + "epoch": 81.09243697478992, + "grad_norm": 1.8370674848556519, + "learning_rate": 0.001, + "loss": 1.0309, + "step": 250900 + }, + { + "epoch": 81.12475759534583, + "grad_norm": 2.330951452255249, + "learning_rate": 0.001, + "loss": 1.0438, + "step": 251000 + }, + { + "epoch": 81.15707821590175, + "grad_norm": 1.8591450452804565, + "learning_rate": 0.001, + "loss": 1.0488, + "step": 251100 + }, + { + "epoch": 81.18939883645766, + "grad_norm": 2.0385501384735107, + "learning_rate": 0.001, + "loss": 1.0499, + "step": 251200 + }, + { + "epoch": 81.22171945701358, + "grad_norm": 1.6502009630203247, + "learning_rate": 0.001, + "loss": 1.0435, + "step": 251300 + }, + { + "epoch": 81.25404007756948, + "grad_norm": 2.523043394088745, + "learning_rate": 0.001, + "loss": 1.0426, + "step": 251400 + }, + { + "epoch": 81.2863606981254, + "grad_norm": 2.149442434310913, + "learning_rate": 0.001, + "loss": 1.0617, + "step": 251500 + }, + { + "epoch": 81.31868131868131, + "grad_norm": 1.8482860326766968, + "learning_rate": 0.001, + "loss": 1.0735, + "step": 251600 + }, + { + "epoch": 81.35100193923724, + "grad_norm": 2.3649866580963135, + "learning_rate": 0.001, + "loss": 1.0445, + "step": 251700 + }, + { + "epoch": 81.38332255979314, + "grad_norm": 1.7468416690826416, + "learning_rate": 0.001, + "loss": 1.0568, + "step": 251800 + }, + { + "epoch": 81.41564318034906, + "grad_norm": 2.0502419471740723, + "learning_rate": 0.001, + "loss": 1.0767, + "step": 251900 + }, + { + "epoch": 81.44796380090497, + "grad_norm": 1.6154065132141113, + "learning_rate": 0.001, + "loss": 1.0785, + "step": 252000 + }, + { + "epoch": 81.4802844214609, + "grad_norm": 1.6533535718917847, + "learning_rate": 0.001, + "loss": 1.0839, + "step": 252100 + }, + { + "epoch": 81.5126050420168, + "grad_norm": 1.7687009572982788, + "learning_rate": 0.001, + "loss": 1.0721, + "step": 252200 + }, + { + "epoch": 81.54492566257272, + "grad_norm": 1.9510821104049683, + "learning_rate": 0.001, + "loss": 1.0825, + "step": 252300 + }, + { + "epoch": 81.57724628312863, + "grad_norm": 2.1591577529907227, + "learning_rate": 0.001, + "loss": 1.0866, + "step": 252400 + }, + { + "epoch": 81.60956690368455, + "grad_norm": 2.1044604778289795, + "learning_rate": 0.001, + "loss": 1.1099, + "step": 252500 + }, + { + "epoch": 81.64188752424046, + "grad_norm": 2.2758374214172363, + "learning_rate": 0.001, + "loss": 1.0963, + "step": 252600 + }, + { + "epoch": 81.67420814479638, + "grad_norm": 2.09716534614563, + "learning_rate": 0.001, + "loss": 1.0998, + "step": 252700 + }, + { + "epoch": 81.70652876535229, + "grad_norm": 2.193350076675415, + "learning_rate": 0.001, + "loss": 1.0906, + "step": 252800 + }, + { + "epoch": 81.73884938590821, + "grad_norm": 1.868490219116211, + "learning_rate": 0.001, + "loss": 1.1172, + "step": 252900 + }, + { + "epoch": 81.77117000646412, + "grad_norm": 2.276905059814453, + "learning_rate": 0.001, + "loss": 1.1076, + "step": 253000 + }, + { + "epoch": 81.80349062702004, + "grad_norm": 1.9997442960739136, + "learning_rate": 0.001, + "loss": 1.1122, + "step": 253100 + }, + { + "epoch": 81.83581124757595, + "grad_norm": 2.066814422607422, + "learning_rate": 0.001, + "loss": 1.1113, + "step": 253200 + }, + { + "epoch": 81.86813186813187, + "grad_norm": 2.0630595684051514, + "learning_rate": 0.001, + "loss": 1.14, + "step": 253300 + }, + { + "epoch": 81.90045248868778, + "grad_norm": 1.7941197156906128, + "learning_rate": 0.001, + "loss": 1.1236, + "step": 253400 + }, + { + "epoch": 81.9327731092437, + "grad_norm": 1.8599138259887695, + "learning_rate": 0.001, + "loss": 1.1066, + "step": 253500 + }, + { + "epoch": 81.9650937297996, + "grad_norm": 2.007969379425049, + "learning_rate": 0.001, + "loss": 1.1161, + "step": 253600 + }, + { + "epoch": 81.99741435035553, + "grad_norm": 2.2284483909606934, + "learning_rate": 0.001, + "loss": 1.1156, + "step": 253700 + }, + { + "epoch": 82.02973497091145, + "grad_norm": 2.0131847858428955, + "learning_rate": 0.001, + "loss": 1.0191, + "step": 253800 + }, + { + "epoch": 82.06205559146736, + "grad_norm": 2.0108609199523926, + "learning_rate": 0.001, + "loss": 1.01, + "step": 253900 + }, + { + "epoch": 82.09437621202328, + "grad_norm": 2.120751142501831, + "learning_rate": 0.001, + "loss": 1.0098, + "step": 254000 + }, + { + "epoch": 82.12669683257919, + "grad_norm": 2.449652671813965, + "learning_rate": 0.001, + "loss": 1.021, + "step": 254100 + }, + { + "epoch": 82.1590174531351, + "grad_norm": 2.4260470867156982, + "learning_rate": 0.001, + "loss": 1.0347, + "step": 254200 + }, + { + "epoch": 82.19133807369101, + "grad_norm": 2.6759181022644043, + "learning_rate": 0.001, + "loss": 1.0325, + "step": 254300 + }, + { + "epoch": 82.22365869424694, + "grad_norm": 2.0493171215057373, + "learning_rate": 0.001, + "loss": 1.0422, + "step": 254400 + }, + { + "epoch": 82.25597931480284, + "grad_norm": 2.4781503677368164, + "learning_rate": 0.001, + "loss": 1.0344, + "step": 254500 + }, + { + "epoch": 82.28829993535876, + "grad_norm": 3.0941216945648193, + "learning_rate": 0.001, + "loss": 1.0478, + "step": 254600 + }, + { + "epoch": 82.32062055591467, + "grad_norm": 2.149822235107422, + "learning_rate": 0.001, + "loss": 1.0428, + "step": 254700 + }, + { + "epoch": 82.3529411764706, + "grad_norm": 1.9726589918136597, + "learning_rate": 0.001, + "loss": 1.0629, + "step": 254800 + }, + { + "epoch": 82.3852617970265, + "grad_norm": 2.1840827465057373, + "learning_rate": 0.001, + "loss": 1.0648, + "step": 254900 + }, + { + "epoch": 82.41758241758242, + "grad_norm": 2.0328707695007324, + "learning_rate": 0.001, + "loss": 1.0667, + "step": 255000 + }, + { + "epoch": 82.44990303813833, + "grad_norm": 2.3106980323791504, + "learning_rate": 0.001, + "loss": 1.0656, + "step": 255100 + }, + { + "epoch": 82.48222365869425, + "grad_norm": 2.14380145072937, + "learning_rate": 0.001, + "loss": 1.0788, + "step": 255200 + }, + { + "epoch": 82.51454427925016, + "grad_norm": 1.6182340383529663, + "learning_rate": 0.001, + "loss": 1.0802, + "step": 255300 + }, + { + "epoch": 82.54686489980608, + "grad_norm": 2.0344672203063965, + "learning_rate": 0.001, + "loss": 1.0871, + "step": 255400 + }, + { + "epoch": 82.57918552036199, + "grad_norm": 1.7792701721191406, + "learning_rate": 0.001, + "loss": 1.0804, + "step": 255500 + }, + { + "epoch": 82.61150614091791, + "grad_norm": 2.1773808002471924, + "learning_rate": 0.001, + "loss": 1.0792, + "step": 255600 + }, + { + "epoch": 82.64382676147382, + "grad_norm": 1.9904121160507202, + "learning_rate": 0.001, + "loss": 1.0807, + "step": 255700 + }, + { + "epoch": 82.67614738202974, + "grad_norm": 2.2778069972991943, + "learning_rate": 0.001, + "loss": 1.0802, + "step": 255800 + }, + { + "epoch": 82.70846800258565, + "grad_norm": 2.273298978805542, + "learning_rate": 0.001, + "loss": 1.1049, + "step": 255900 + }, + { + "epoch": 82.74078862314157, + "grad_norm": 2.3957090377807617, + "learning_rate": 0.001, + "loss": 1.0932, + "step": 256000 + }, + { + "epoch": 82.77310924369748, + "grad_norm": 1.931657314300537, + "learning_rate": 0.001, + "loss": 1.1028, + "step": 256100 + }, + { + "epoch": 82.8054298642534, + "grad_norm": 1.7905445098876953, + "learning_rate": 0.001, + "loss": 1.0865, + "step": 256200 + }, + { + "epoch": 82.8377504848093, + "grad_norm": 1.855185627937317, + "learning_rate": 0.001, + "loss": 1.0979, + "step": 256300 + }, + { + "epoch": 82.87007110536523, + "grad_norm": 1.7903704643249512, + "learning_rate": 0.001, + "loss": 1.0997, + "step": 256400 + }, + { + "epoch": 82.90239172592113, + "grad_norm": 2.024670124053955, + "learning_rate": 0.001, + "loss": 1.0971, + "step": 256500 + }, + { + "epoch": 82.93471234647706, + "grad_norm": 2.054471492767334, + "learning_rate": 0.001, + "loss": 1.1191, + "step": 256600 + }, + { + "epoch": 82.96703296703296, + "grad_norm": 2.021584987640381, + "learning_rate": 0.001, + "loss": 1.1027, + "step": 256700 + }, + { + "epoch": 82.99935358758889, + "grad_norm": 2.977588176727295, + "learning_rate": 0.001, + "loss": 1.0963, + "step": 256800 + }, + { + "epoch": 83.03167420814479, + "grad_norm": 2.254401922225952, + "learning_rate": 0.001, + "loss": 0.9857, + "step": 256900 + }, + { + "epoch": 83.06399482870071, + "grad_norm": 2.0758557319641113, + "learning_rate": 0.001, + "loss": 0.9981, + "step": 257000 + }, + { + "epoch": 83.09631544925662, + "grad_norm": 2.851766586303711, + "learning_rate": 0.001, + "loss": 1.0049, + "step": 257100 + }, + { + "epoch": 83.12863606981254, + "grad_norm": 1.9387279748916626, + "learning_rate": 0.001, + "loss": 1.0158, + "step": 257200 + }, + { + "epoch": 83.16095669036845, + "grad_norm": 2.2792084217071533, + "learning_rate": 0.001, + "loss": 1.0109, + "step": 257300 + }, + { + "epoch": 83.19327731092437, + "grad_norm": 2.8158106803894043, + "learning_rate": 0.001, + "loss": 1.0325, + "step": 257400 + }, + { + "epoch": 83.22559793148028, + "grad_norm": 1.8931865692138672, + "learning_rate": 0.001, + "loss": 1.0084, + "step": 257500 + }, + { + "epoch": 83.2579185520362, + "grad_norm": 2.0553183555603027, + "learning_rate": 0.001, + "loss": 1.0223, + "step": 257600 + }, + { + "epoch": 83.29023917259211, + "grad_norm": 2.032785415649414, + "learning_rate": 0.001, + "loss": 1.0368, + "step": 257700 + }, + { + "epoch": 83.32255979314803, + "grad_norm": 2.276414632797241, + "learning_rate": 0.001, + "loss": 1.0434, + "step": 257800 + }, + { + "epoch": 83.35488041370394, + "grad_norm": 2.2769389152526855, + "learning_rate": 0.001, + "loss": 1.0405, + "step": 257900 + }, + { + "epoch": 83.38720103425986, + "grad_norm": 2.457798719406128, + "learning_rate": 0.001, + "loss": 1.0536, + "step": 258000 + }, + { + "epoch": 83.41952165481577, + "grad_norm": 2.427795886993408, + "learning_rate": 0.001, + "loss": 1.0574, + "step": 258100 + }, + { + "epoch": 83.45184227537169, + "grad_norm": 2.3485844135284424, + "learning_rate": 0.001, + "loss": 1.0604, + "step": 258200 + }, + { + "epoch": 83.4841628959276, + "grad_norm": 2.122995615005493, + "learning_rate": 0.001, + "loss": 1.0593, + "step": 258300 + }, + { + "epoch": 83.51648351648352, + "grad_norm": 2.6342482566833496, + "learning_rate": 0.001, + "loss": 1.0711, + "step": 258400 + }, + { + "epoch": 83.54880413703943, + "grad_norm": 2.1874687671661377, + "learning_rate": 0.001, + "loss": 1.063, + "step": 258500 + }, + { + "epoch": 83.58112475759535, + "grad_norm": 2.3595123291015625, + "learning_rate": 0.001, + "loss": 1.0756, + "step": 258600 + }, + { + "epoch": 83.61344537815125, + "grad_norm": 2.185002326965332, + "learning_rate": 0.001, + "loss": 1.0728, + "step": 258700 + }, + { + "epoch": 83.64576599870718, + "grad_norm": 2.350257158279419, + "learning_rate": 0.001, + "loss": 1.0819, + "step": 258800 + }, + { + "epoch": 83.67808661926308, + "grad_norm": 2.661860227584839, + "learning_rate": 0.001, + "loss": 1.0873, + "step": 258900 + }, + { + "epoch": 83.710407239819, + "grad_norm": 2.071593999862671, + "learning_rate": 0.001, + "loss": 1.0969, + "step": 259000 + }, + { + "epoch": 83.74272786037491, + "grad_norm": 2.097931146621704, + "learning_rate": 0.001, + "loss": 1.0821, + "step": 259100 + }, + { + "epoch": 83.77504848093083, + "grad_norm": 2.636651039123535, + "learning_rate": 0.001, + "loss": 1.0815, + "step": 259200 + }, + { + "epoch": 83.80736910148674, + "grad_norm": 2.398634433746338, + "learning_rate": 0.001, + "loss": 1.0849, + "step": 259300 + }, + { + "epoch": 83.83968972204266, + "grad_norm": 1.94718599319458, + "learning_rate": 0.001, + "loss": 1.0987, + "step": 259400 + }, + { + "epoch": 83.87201034259857, + "grad_norm": 2.3222267627716064, + "learning_rate": 0.001, + "loss": 1.0896, + "step": 259500 + }, + { + "epoch": 83.9043309631545, + "grad_norm": 2.0322906970977783, + "learning_rate": 0.001, + "loss": 1.0968, + "step": 259600 + }, + { + "epoch": 83.9366515837104, + "grad_norm": 2.293200731277466, + "learning_rate": 0.001, + "loss": 1.1018, + "step": 259700 + }, + { + "epoch": 83.96897220426632, + "grad_norm": 2.212113380432129, + "learning_rate": 0.001, + "loss": 1.1003, + "step": 259800 + }, + { + "epoch": 84.00129282482224, + "grad_norm": 1.9160456657409668, + "learning_rate": 0.001, + "loss": 1.1094, + "step": 259900 + }, + { + "epoch": 84.03361344537815, + "grad_norm": 2.0068440437316895, + "learning_rate": 0.001, + "loss": 0.9994, + "step": 260000 + }, + { + "epoch": 84.06593406593407, + "grad_norm": 1.8730653524398804, + "learning_rate": 0.001, + "loss": 0.9916, + "step": 260100 + }, + { + "epoch": 84.09825468648998, + "grad_norm": 1.9432926177978516, + "learning_rate": 0.001, + "loss": 0.9884, + "step": 260200 + }, + { + "epoch": 84.1305753070459, + "grad_norm": 2.5531952381134033, + "learning_rate": 0.001, + "loss": 1.0135, + "step": 260300 + }, + { + "epoch": 84.16289592760181, + "grad_norm": 1.827590823173523, + "learning_rate": 0.001, + "loss": 1.0184, + "step": 260400 + }, + { + "epoch": 84.19521654815773, + "grad_norm": 1.953426718711853, + "learning_rate": 0.001, + "loss": 1.0346, + "step": 260500 + }, + { + "epoch": 84.22753716871364, + "grad_norm": 1.895742416381836, + "learning_rate": 0.001, + "loss": 1.0192, + "step": 260600 + }, + { + "epoch": 84.25985778926956, + "grad_norm": 2.4154608249664307, + "learning_rate": 0.001, + "loss": 1.0212, + "step": 260700 + }, + { + "epoch": 84.29217840982547, + "grad_norm": 1.9845346212387085, + "learning_rate": 0.001, + "loss": 1.0385, + "step": 260800 + }, + { + "epoch": 84.32449903038139, + "grad_norm": 1.9528859853744507, + "learning_rate": 0.001, + "loss": 1.038, + "step": 260900 + }, + { + "epoch": 84.3568196509373, + "grad_norm": 1.5986595153808594, + "learning_rate": 0.001, + "loss": 1.0472, + "step": 261000 + }, + { + "epoch": 84.38914027149322, + "grad_norm": 2.879544734954834, + "learning_rate": 0.001, + "loss": 1.0387, + "step": 261100 + }, + { + "epoch": 84.42146089204913, + "grad_norm": 2.07099986076355, + "learning_rate": 0.001, + "loss": 1.0485, + "step": 261200 + }, + { + "epoch": 84.45378151260505, + "grad_norm": 1.9697378873825073, + "learning_rate": 0.001, + "loss": 1.046, + "step": 261300 + }, + { + "epoch": 84.48610213316095, + "grad_norm": 2.495154619216919, + "learning_rate": 0.001, + "loss": 1.0427, + "step": 261400 + }, + { + "epoch": 84.51842275371688, + "grad_norm": 2.241727113723755, + "learning_rate": 0.001, + "loss": 1.0599, + "step": 261500 + }, + { + "epoch": 84.55074337427278, + "grad_norm": 2.0560214519500732, + "learning_rate": 0.001, + "loss": 1.0515, + "step": 261600 + }, + { + "epoch": 84.5830639948287, + "grad_norm": 2.1529815196990967, + "learning_rate": 0.001, + "loss": 1.0516, + "step": 261700 + }, + { + "epoch": 84.61538461538461, + "grad_norm": 2.4618332386016846, + "learning_rate": 0.001, + "loss": 1.0721, + "step": 261800 + }, + { + "epoch": 84.64770523594053, + "grad_norm": 1.9415512084960938, + "learning_rate": 0.001, + "loss": 1.0545, + "step": 261900 + }, + { + "epoch": 84.68002585649644, + "grad_norm": 40.67611312866211, + "learning_rate": 0.001, + "loss": 1.0695, + "step": 262000 + }, + { + "epoch": 84.71234647705236, + "grad_norm": 1.830438256263733, + "learning_rate": 0.001, + "loss": 1.0702, + "step": 262100 + }, + { + "epoch": 84.74466709760827, + "grad_norm": 2.0758326053619385, + "learning_rate": 0.001, + "loss": 1.0715, + "step": 262200 + }, + { + "epoch": 84.7769877181642, + "grad_norm": 1.9768143892288208, + "learning_rate": 0.001, + "loss": 1.0926, + "step": 262300 + }, + { + "epoch": 84.8093083387201, + "grad_norm": 2.4921510219573975, + "learning_rate": 0.001, + "loss": 1.0887, + "step": 262400 + }, + { + "epoch": 84.84162895927602, + "grad_norm": 1.8019990921020508, + "learning_rate": 0.001, + "loss": 1.1003, + "step": 262500 + }, + { + "epoch": 84.87394957983193, + "grad_norm": 1.7619433403015137, + "learning_rate": 0.001, + "loss": 1.0909, + "step": 262600 + }, + { + "epoch": 84.90627020038785, + "grad_norm": 2.410701274871826, + "learning_rate": 0.001, + "loss": 1.094, + "step": 262700 + }, + { + "epoch": 84.93859082094376, + "grad_norm": 2.021223306655884, + "learning_rate": 0.001, + "loss": 1.0909, + "step": 262800 + }, + { + "epoch": 84.97091144149968, + "grad_norm": 10.952349662780762, + "learning_rate": 0.001, + "loss": 1.1052, + "step": 262900 + }, + { + "epoch": 85.00323206205559, + "grad_norm": 1.784826636314392, + "learning_rate": 0.001, + "loss": 1.1001, + "step": 263000 + }, + { + "epoch": 85.03555268261151, + "grad_norm": 2.587707042694092, + "learning_rate": 0.001, + "loss": 0.9838, + "step": 263100 + }, + { + "epoch": 85.06787330316742, + "grad_norm": 2.302337169647217, + "learning_rate": 0.001, + "loss": 0.9897, + "step": 263200 + }, + { + "epoch": 85.10019392372334, + "grad_norm": 1.8791141510009766, + "learning_rate": 0.001, + "loss": 0.9905, + "step": 263300 + }, + { + "epoch": 85.13251454427925, + "grad_norm": 1.891874074935913, + "learning_rate": 0.001, + "loss": 0.9889, + "step": 263400 + }, + { + "epoch": 85.16483516483517, + "grad_norm": 2.0654654502868652, + "learning_rate": 0.001, + "loss": 0.9969, + "step": 263500 + }, + { + "epoch": 85.19715578539108, + "grad_norm": 2.999326229095459, + "learning_rate": 0.001, + "loss": 1.0026, + "step": 263600 + }, + { + "epoch": 85.229476405947, + "grad_norm": 2.2999064922332764, + "learning_rate": 0.001, + "loss": 1.0114, + "step": 263700 + }, + { + "epoch": 85.2617970265029, + "grad_norm": 2.1461431980133057, + "learning_rate": 0.001, + "loss": 1.0403, + "step": 263800 + }, + { + "epoch": 85.29411764705883, + "grad_norm": 1.715585708618164, + "learning_rate": 0.001, + "loss": 1.0147, + "step": 263900 + }, + { + "epoch": 85.32643826761473, + "grad_norm": 2.0307717323303223, + "learning_rate": 0.001, + "loss": 1.0311, + "step": 264000 + }, + { + "epoch": 85.35875888817066, + "grad_norm": 1.924255609512329, + "learning_rate": 0.001, + "loss": 1.0376, + "step": 264100 + }, + { + "epoch": 85.39107950872656, + "grad_norm": 3.5688138008117676, + "learning_rate": 0.001, + "loss": 1.0331, + "step": 264200 + }, + { + "epoch": 85.42340012928248, + "grad_norm": 1.627633810043335, + "learning_rate": 0.001, + "loss": 1.0413, + "step": 264300 + }, + { + "epoch": 85.45572074983839, + "grad_norm": 2.091357946395874, + "learning_rate": 0.001, + "loss": 1.0438, + "step": 264400 + }, + { + "epoch": 85.48804137039431, + "grad_norm": 1.7921959161758423, + "learning_rate": 0.001, + "loss": 1.0332, + "step": 264500 + }, + { + "epoch": 85.52036199095022, + "grad_norm": 1.9144126176834106, + "learning_rate": 0.001, + "loss": 1.0537, + "step": 264600 + }, + { + "epoch": 85.55268261150614, + "grad_norm": 1.995881199836731, + "learning_rate": 0.001, + "loss": 1.0525, + "step": 264700 + }, + { + "epoch": 85.58500323206205, + "grad_norm": 1.8649122714996338, + "learning_rate": 0.001, + "loss": 1.0506, + "step": 264800 + }, + { + "epoch": 85.61732385261797, + "grad_norm": 1.7572340965270996, + "learning_rate": 0.001, + "loss": 1.0468, + "step": 264900 + }, + { + "epoch": 85.64964447317388, + "grad_norm": 2.5630545616149902, + "learning_rate": 0.001, + "loss": 1.0482, + "step": 265000 + }, + { + "epoch": 85.6819650937298, + "grad_norm": 1.790952205657959, + "learning_rate": 0.001, + "loss": 1.0607, + "step": 265100 + }, + { + "epoch": 85.71428571428571, + "grad_norm": 1.7042549848556519, + "learning_rate": 0.001, + "loss": 1.0452, + "step": 265200 + }, + { + "epoch": 85.74660633484163, + "grad_norm": 2.4065299034118652, + "learning_rate": 0.001, + "loss": 1.0821, + "step": 265300 + }, + { + "epoch": 85.77892695539754, + "grad_norm": 1.7829989194869995, + "learning_rate": 0.001, + "loss": 1.0725, + "step": 265400 + }, + { + "epoch": 85.81124757595346, + "grad_norm": 2.1033124923706055, + "learning_rate": 0.001, + "loss": 1.0794, + "step": 265500 + }, + { + "epoch": 85.84356819650937, + "grad_norm": 1.9515258073806763, + "learning_rate": 0.001, + "loss": 1.0688, + "step": 265600 + }, + { + "epoch": 85.87588881706529, + "grad_norm": 1.796294927597046, + "learning_rate": 0.001, + "loss": 1.0845, + "step": 265700 + }, + { + "epoch": 85.9082094376212, + "grad_norm": 1.7326546907424927, + "learning_rate": 0.001, + "loss": 1.0781, + "step": 265800 + }, + { + "epoch": 85.94053005817712, + "grad_norm": 1.7627480030059814, + "learning_rate": 0.001, + "loss": 1.0979, + "step": 265900 + }, + { + "epoch": 85.97285067873302, + "grad_norm": 2.2406697273254395, + "learning_rate": 0.001, + "loss": 1.0872, + "step": 266000 + }, + { + "epoch": 86.00517129928895, + "grad_norm": 1.6462572813034058, + "learning_rate": 0.001, + "loss": 1.08, + "step": 266100 + }, + { + "epoch": 86.03749191984487, + "grad_norm": 1.6872225999832153, + "learning_rate": 0.001, + "loss": 0.9723, + "step": 266200 + }, + { + "epoch": 86.06981254040078, + "grad_norm": 2.2746024131774902, + "learning_rate": 0.001, + "loss": 0.9815, + "step": 266300 + }, + { + "epoch": 86.1021331609567, + "grad_norm": 1.6541063785552979, + "learning_rate": 0.001, + "loss": 0.9831, + "step": 266400 + }, + { + "epoch": 86.1344537815126, + "grad_norm": 1.9259800910949707, + "learning_rate": 0.001, + "loss": 0.9992, + "step": 266500 + }, + { + "epoch": 86.16677440206853, + "grad_norm": 1.8844987154006958, + "learning_rate": 0.001, + "loss": 0.9949, + "step": 266600 + }, + { + "epoch": 86.19909502262443, + "grad_norm": 1.5561342239379883, + "learning_rate": 0.001, + "loss": 0.9989, + "step": 266700 + }, + { + "epoch": 86.23141564318036, + "grad_norm": 2.4345591068267822, + "learning_rate": 0.001, + "loss": 0.9958, + "step": 266800 + }, + { + "epoch": 86.26373626373626, + "grad_norm": 1.8101435899734497, + "learning_rate": 0.001, + "loss": 1.0139, + "step": 266900 + }, + { + "epoch": 86.29605688429218, + "grad_norm": 2.0286176204681396, + "learning_rate": 0.001, + "loss": 1.0251, + "step": 267000 + }, + { + "epoch": 86.32837750484809, + "grad_norm": 1.9802911281585693, + "learning_rate": 0.001, + "loss": 1.0139, + "step": 267100 + }, + { + "epoch": 86.36069812540401, + "grad_norm": 1.6296907663345337, + "learning_rate": 0.001, + "loss": 1.0182, + "step": 267200 + }, + { + "epoch": 86.39301874595992, + "grad_norm": 2.130852460861206, + "learning_rate": 0.001, + "loss": 1.0272, + "step": 267300 + }, + { + "epoch": 86.42533936651584, + "grad_norm": 1.6675856113433838, + "learning_rate": 0.001, + "loss": 1.0309, + "step": 267400 + }, + { + "epoch": 86.45765998707175, + "grad_norm": 1.873880386352539, + "learning_rate": 0.001, + "loss": 1.02, + "step": 267500 + }, + { + "epoch": 86.48998060762767, + "grad_norm": 1.6466994285583496, + "learning_rate": 0.001, + "loss": 1.036, + "step": 267600 + }, + { + "epoch": 86.52230122818358, + "grad_norm": 2.3659939765930176, + "learning_rate": 0.001, + "loss": 1.0379, + "step": 267700 + }, + { + "epoch": 86.5546218487395, + "grad_norm": 1.846489667892456, + "learning_rate": 0.001, + "loss": 1.0386, + "step": 267800 + }, + { + "epoch": 86.58694246929541, + "grad_norm": 1.9823304414749146, + "learning_rate": 0.001, + "loss": 1.0549, + "step": 267900 + }, + { + "epoch": 86.61926308985133, + "grad_norm": 2.0953738689422607, + "learning_rate": 0.001, + "loss": 1.0404, + "step": 268000 + }, + { + "epoch": 86.65158371040724, + "grad_norm": 5.9522705078125, + "learning_rate": 0.001, + "loss": 1.0526, + "step": 268100 + }, + { + "epoch": 86.68390433096316, + "grad_norm": 1.8562268018722534, + "learning_rate": 0.001, + "loss": 1.0652, + "step": 268200 + }, + { + "epoch": 86.71622495151907, + "grad_norm": 1.8963710069656372, + "learning_rate": 0.001, + "loss": 1.0499, + "step": 268300 + }, + { + "epoch": 86.74854557207499, + "grad_norm": 1.8700145483016968, + "learning_rate": 0.001, + "loss": 1.0587, + "step": 268400 + }, + { + "epoch": 86.7808661926309, + "grad_norm": 1.7103973627090454, + "learning_rate": 0.001, + "loss": 1.0584, + "step": 268500 + }, + { + "epoch": 86.81318681318682, + "grad_norm": 1.864179253578186, + "learning_rate": 0.001, + "loss": 1.0672, + "step": 268600 + }, + { + "epoch": 86.84550743374272, + "grad_norm": 2.295189380645752, + "learning_rate": 0.001, + "loss": 1.0639, + "step": 268700 + }, + { + "epoch": 86.87782805429865, + "grad_norm": 1.8950257301330566, + "learning_rate": 0.001, + "loss": 1.0659, + "step": 268800 + }, + { + "epoch": 86.91014867485455, + "grad_norm": 2.016186237335205, + "learning_rate": 0.001, + "loss": 1.0682, + "step": 268900 + }, + { + "epoch": 86.94246929541048, + "grad_norm": 1.8546134233474731, + "learning_rate": 0.001, + "loss": 1.0724, + "step": 269000 + }, + { + "epoch": 86.97478991596638, + "grad_norm": 1.4972987174987793, + "learning_rate": 0.001, + "loss": 1.0884, + "step": 269100 + }, + { + "epoch": 87.0071105365223, + "grad_norm": 2.2357635498046875, + "learning_rate": 0.001, + "loss": 1.0689, + "step": 269200 + }, + { + "epoch": 87.03943115707821, + "grad_norm": 1.957870602607727, + "learning_rate": 0.001, + "loss": 0.9617, + "step": 269300 + }, + { + "epoch": 87.07175177763413, + "grad_norm": 1.9988821744918823, + "learning_rate": 0.001, + "loss": 0.9642, + "step": 269400 + }, + { + "epoch": 87.10407239819004, + "grad_norm": 1.902817964553833, + "learning_rate": 0.001, + "loss": 0.9724, + "step": 269500 + }, + { + "epoch": 87.13639301874596, + "grad_norm": 1.8384301662445068, + "learning_rate": 0.001, + "loss": 0.9825, + "step": 269600 + }, + { + "epoch": 87.16871363930187, + "grad_norm": 1.9688271284103394, + "learning_rate": 0.001, + "loss": 0.9702, + "step": 269700 + }, + { + "epoch": 87.20103425985779, + "grad_norm": 1.5213277339935303, + "learning_rate": 0.001, + "loss": 0.9891, + "step": 269800 + }, + { + "epoch": 87.2333548804137, + "grad_norm": 1.8877270221710205, + "learning_rate": 0.001, + "loss": 0.9974, + "step": 269900 + }, + { + "epoch": 87.26567550096962, + "grad_norm": 1.5288257598876953, + "learning_rate": 0.001, + "loss": 0.999, + "step": 270000 + }, + { + "epoch": 87.29799612152553, + "grad_norm": 1.6828035116195679, + "learning_rate": 0.001, + "loss": 1.0073, + "step": 270100 + }, + { + "epoch": 87.33031674208145, + "grad_norm": 1.9386004209518433, + "learning_rate": 0.001, + "loss": 1.0163, + "step": 270200 + }, + { + "epoch": 87.36263736263736, + "grad_norm": 1.8773216009140015, + "learning_rate": 0.001, + "loss": 1.0245, + "step": 270300 + }, + { + "epoch": 87.39495798319328, + "grad_norm": 1.8428542613983154, + "learning_rate": 0.001, + "loss": 1.0207, + "step": 270400 + }, + { + "epoch": 87.42727860374919, + "grad_norm": 2.0465738773345947, + "learning_rate": 0.001, + "loss": 1.0034, + "step": 270500 + }, + { + "epoch": 87.45959922430511, + "grad_norm": 2.624429225921631, + "learning_rate": 0.001, + "loss": 1.0296, + "step": 270600 + }, + { + "epoch": 87.49191984486102, + "grad_norm": 2.0403313636779785, + "learning_rate": 0.001, + "loss": 1.0304, + "step": 270700 + }, + { + "epoch": 87.52424046541694, + "grad_norm": 1.6845457553863525, + "learning_rate": 0.001, + "loss": 1.0235, + "step": 270800 + }, + { + "epoch": 87.55656108597285, + "grad_norm": 1.5582964420318604, + "learning_rate": 0.001, + "loss": 1.0235, + "step": 270900 + }, + { + "epoch": 87.58888170652877, + "grad_norm": 2.086839199066162, + "learning_rate": 0.001, + "loss": 1.0346, + "step": 271000 + }, + { + "epoch": 87.62120232708467, + "grad_norm": 1.4940403699874878, + "learning_rate": 0.001, + "loss": 1.0434, + "step": 271100 + }, + { + "epoch": 87.6535229476406, + "grad_norm": 1.7732195854187012, + "learning_rate": 0.001, + "loss": 1.0391, + "step": 271200 + }, + { + "epoch": 87.6858435681965, + "grad_norm": 1.9552558660507202, + "learning_rate": 0.001, + "loss": 1.0588, + "step": 271300 + }, + { + "epoch": 87.71816418875243, + "grad_norm": 1.6922935247421265, + "learning_rate": 0.001, + "loss": 1.0525, + "step": 271400 + }, + { + "epoch": 87.75048480930833, + "grad_norm": 1.606885552406311, + "learning_rate": 0.001, + "loss": 1.0609, + "step": 271500 + }, + { + "epoch": 87.78280542986425, + "grad_norm": 1.7126843929290771, + "learning_rate": 0.001, + "loss": 1.0544, + "step": 271600 + }, + { + "epoch": 87.81512605042016, + "grad_norm": 1.9816648960113525, + "learning_rate": 0.001, + "loss": 1.0636, + "step": 271700 + }, + { + "epoch": 87.84744667097608, + "grad_norm": 1.776976466178894, + "learning_rate": 0.001, + "loss": 1.0594, + "step": 271800 + }, + { + "epoch": 87.87976729153199, + "grad_norm": 2.1870553493499756, + "learning_rate": 0.001, + "loss": 1.0482, + "step": 271900 + }, + { + "epoch": 87.91208791208791, + "grad_norm": 1.7203819751739502, + "learning_rate": 0.001, + "loss": 1.0711, + "step": 272000 + }, + { + "epoch": 87.94440853264382, + "grad_norm": 1.6706738471984863, + "learning_rate": 0.001, + "loss": 1.0778, + "step": 272100 + }, + { + "epoch": 87.97672915319974, + "grad_norm": 2.048996925354004, + "learning_rate": 0.001, + "loss": 1.071, + "step": 272200 + }, + { + "epoch": 88.00904977375566, + "grad_norm": 1.791468858718872, + "learning_rate": 0.001, + "loss": 1.0356, + "step": 272300 + }, + { + "epoch": 88.04137039431157, + "grad_norm": 2.0778145790100098, + "learning_rate": 0.001, + "loss": 0.9626, + "step": 272400 + }, + { + "epoch": 88.07369101486749, + "grad_norm": 2.016916275024414, + "learning_rate": 0.001, + "loss": 0.9573, + "step": 272500 + }, + { + "epoch": 88.1060116354234, + "grad_norm": 1.8074690103530884, + "learning_rate": 0.001, + "loss": 0.9704, + "step": 272600 + }, + { + "epoch": 88.13833225597932, + "grad_norm": 1.7361093759536743, + "learning_rate": 0.001, + "loss": 0.9908, + "step": 272700 + }, + { + "epoch": 88.17065287653523, + "grad_norm": 1.7573654651641846, + "learning_rate": 0.001, + "loss": 0.9922, + "step": 272800 + }, + { + "epoch": 88.20297349709115, + "grad_norm": 1.6067391633987427, + "learning_rate": 0.001, + "loss": 0.9822, + "step": 272900 + }, + { + "epoch": 88.23529411764706, + "grad_norm": 1.9911582469940186, + "learning_rate": 0.001, + "loss": 0.9969, + "step": 273000 + }, + { + "epoch": 88.26761473820298, + "grad_norm": 1.9228503704071045, + "learning_rate": 0.001, + "loss": 0.9908, + "step": 273100 + }, + { + "epoch": 88.29993535875889, + "grad_norm": 1.954790711402893, + "learning_rate": 0.001, + "loss": 0.9979, + "step": 273200 + }, + { + "epoch": 88.33225597931481, + "grad_norm": 2.02158784866333, + "learning_rate": 0.001, + "loss": 1.0014, + "step": 273300 + }, + { + "epoch": 88.36457659987072, + "grad_norm": 1.9888592958450317, + "learning_rate": 0.001, + "loss": 0.9951, + "step": 273400 + }, + { + "epoch": 88.39689722042664, + "grad_norm": 1.7298367023468018, + "learning_rate": 0.001, + "loss": 1.0075, + "step": 273500 + }, + { + "epoch": 88.42921784098255, + "grad_norm": 1.7865296602249146, + "learning_rate": 0.001, + "loss": 1.0108, + "step": 273600 + }, + { + "epoch": 88.46153846153847, + "grad_norm": 2.6155149936676025, + "learning_rate": 0.001, + "loss": 1.0153, + "step": 273700 + }, + { + "epoch": 88.49385908209437, + "grad_norm": 1.765749216079712, + "learning_rate": 0.001, + "loss": 1.0148, + "step": 273800 + }, + { + "epoch": 88.5261797026503, + "grad_norm": 1.943948745727539, + "learning_rate": 0.001, + "loss": 1.017, + "step": 273900 + }, + { + "epoch": 88.5585003232062, + "grad_norm": 1.7162889242172241, + "learning_rate": 0.001, + "loss": 1.0314, + "step": 274000 + }, + { + "epoch": 88.59082094376213, + "grad_norm": 1.5523930788040161, + "learning_rate": 0.001, + "loss": 1.0302, + "step": 274100 + }, + { + "epoch": 88.62314156431803, + "grad_norm": 1.733382225036621, + "learning_rate": 0.001, + "loss": 1.0303, + "step": 274200 + }, + { + "epoch": 88.65546218487395, + "grad_norm": 2.412778615951538, + "learning_rate": 0.001, + "loss": 1.0385, + "step": 274300 + }, + { + "epoch": 88.68778280542986, + "grad_norm": 1.9336631298065186, + "learning_rate": 0.001, + "loss": 1.0359, + "step": 274400 + }, + { + "epoch": 88.72010342598578, + "grad_norm": 2.2607991695404053, + "learning_rate": 0.001, + "loss": 1.0412, + "step": 274500 + }, + { + "epoch": 88.75242404654169, + "grad_norm": 1.6918398141860962, + "learning_rate": 0.001, + "loss": 1.056, + "step": 274600 + }, + { + "epoch": 88.78474466709761, + "grad_norm": 1.6877381801605225, + "learning_rate": 0.001, + "loss": 1.0467, + "step": 274700 + }, + { + "epoch": 88.81706528765352, + "grad_norm": 1.8707000017166138, + "learning_rate": 0.001, + "loss": 1.0523, + "step": 274800 + }, + { + "epoch": 88.84938590820944, + "grad_norm": 1.7763044834136963, + "learning_rate": 0.001, + "loss": 1.044, + "step": 274900 + }, + { + "epoch": 88.88170652876535, + "grad_norm": 2.0772578716278076, + "learning_rate": 0.001, + "loss": 1.0556, + "step": 275000 + }, + { + "epoch": 88.91402714932127, + "grad_norm": 1.7194854021072388, + "learning_rate": 0.001, + "loss": 1.0596, + "step": 275100 + }, + { + "epoch": 88.94634776987718, + "grad_norm": 2.1079659461975098, + "learning_rate": 0.001, + "loss": 1.0678, + "step": 275200 + }, + { + "epoch": 88.9786683904331, + "grad_norm": 1.841643214225769, + "learning_rate": 0.001, + "loss": 1.0617, + "step": 275300 + }, + { + "epoch": 89.01098901098901, + "grad_norm": 1.7587463855743408, + "learning_rate": 0.001, + "loss": 1.0222, + "step": 275400 + }, + { + "epoch": 89.04330963154493, + "grad_norm": 1.8258470296859741, + "learning_rate": 0.001, + "loss": 0.9631, + "step": 275500 + }, + { + "epoch": 89.07563025210084, + "grad_norm": 1.7834150791168213, + "learning_rate": 0.001, + "loss": 0.9544, + "step": 275600 + }, + { + "epoch": 89.10795087265676, + "grad_norm": 1.8407515287399292, + "learning_rate": 0.001, + "loss": 0.9701, + "step": 275700 + }, + { + "epoch": 89.14027149321267, + "grad_norm": 1.9101598262786865, + "learning_rate": 0.001, + "loss": 0.9584, + "step": 275800 + }, + { + "epoch": 89.17259211376859, + "grad_norm": 1.7279962301254272, + "learning_rate": 0.001, + "loss": 0.983, + "step": 275900 + }, + { + "epoch": 89.2049127343245, + "grad_norm": 1.8472046852111816, + "learning_rate": 0.001, + "loss": 0.9722, + "step": 276000 + }, + { + "epoch": 89.23723335488042, + "grad_norm": 1.6440308094024658, + "learning_rate": 0.001, + "loss": 0.984, + "step": 276100 + }, + { + "epoch": 89.26955397543632, + "grad_norm": 1.48750901222229, + "learning_rate": 0.001, + "loss": 0.9765, + "step": 276200 + }, + { + "epoch": 89.30187459599225, + "grad_norm": 1.7771786451339722, + "learning_rate": 0.001, + "loss": 1.0027, + "step": 276300 + }, + { + "epoch": 89.33419521654815, + "grad_norm": 1.5432380437850952, + "learning_rate": 0.001, + "loss": 0.9903, + "step": 276400 + }, + { + "epoch": 89.36651583710407, + "grad_norm": 2.1023707389831543, + "learning_rate": 0.001, + "loss": 0.9946, + "step": 276500 + }, + { + "epoch": 89.39883645765998, + "grad_norm": 1.942596435546875, + "learning_rate": 0.001, + "loss": 1.0045, + "step": 276600 + }, + { + "epoch": 89.4311570782159, + "grad_norm": 2.248884916305542, + "learning_rate": 0.001, + "loss": 0.9991, + "step": 276700 + }, + { + "epoch": 89.46347769877181, + "grad_norm": 2.2643485069274902, + "learning_rate": 0.001, + "loss": 1.0091, + "step": 276800 + }, + { + "epoch": 89.49579831932773, + "grad_norm": 2.112210273742676, + "learning_rate": 0.001, + "loss": 1.0035, + "step": 276900 + }, + { + "epoch": 89.52811893988364, + "grad_norm": 2.1388866901397705, + "learning_rate": 0.001, + "loss": 1.0105, + "step": 277000 + }, + { + "epoch": 89.56043956043956, + "grad_norm": 1.910740613937378, + "learning_rate": 0.001, + "loss": 1.0142, + "step": 277100 + }, + { + "epoch": 89.59276018099547, + "grad_norm": 2.0235660076141357, + "learning_rate": 0.001, + "loss": 1.0254, + "step": 277200 + }, + { + "epoch": 89.62508080155139, + "grad_norm": 1.9163745641708374, + "learning_rate": 0.001, + "loss": 1.0277, + "step": 277300 + }, + { + "epoch": 89.6574014221073, + "grad_norm": 1.8493446111679077, + "learning_rate": 0.001, + "loss": 1.0239, + "step": 277400 + }, + { + "epoch": 89.68972204266322, + "grad_norm": 2.082435131072998, + "learning_rate": 0.001, + "loss": 1.0255, + "step": 277500 + }, + { + "epoch": 89.72204266321913, + "grad_norm": 1.9139171838760376, + "learning_rate": 0.001, + "loss": 1.0456, + "step": 277600 + }, + { + "epoch": 89.75436328377505, + "grad_norm": 1.9875322580337524, + "learning_rate": 0.001, + "loss": 1.034, + "step": 277700 + }, + { + "epoch": 89.78668390433096, + "grad_norm": 2.1025478839874268, + "learning_rate": 0.001, + "loss": 1.0329, + "step": 277800 + }, + { + "epoch": 89.81900452488688, + "grad_norm": 1.9872552156448364, + "learning_rate": 0.001, + "loss": 1.042, + "step": 277900 + }, + { + "epoch": 89.85132514544279, + "grad_norm": 2.346891164779663, + "learning_rate": 0.001, + "loss": 1.0439, + "step": 278000 + }, + { + "epoch": 89.88364576599871, + "grad_norm": 2.0617330074310303, + "learning_rate": 0.001, + "loss": 1.0526, + "step": 278100 + }, + { + "epoch": 89.91596638655462, + "grad_norm": 2.033355474472046, + "learning_rate": 0.001, + "loss": 1.053, + "step": 278200 + }, + { + "epoch": 89.94828700711054, + "grad_norm": 1.9766206741333008, + "learning_rate": 0.001, + "loss": 1.0571, + "step": 278300 + }, + { + "epoch": 89.98060762766644, + "grad_norm": 1.6863994598388672, + "learning_rate": 0.001, + "loss": 1.0536, + "step": 278400 + }, + { + "epoch": 90.01292824822237, + "grad_norm": 2.222926616668701, + "learning_rate": 0.001, + "loss": 0.9922, + "step": 278500 + }, + { + "epoch": 90.04524886877829, + "grad_norm": 2.0000972747802734, + "learning_rate": 0.001, + "loss": 0.9525, + "step": 278600 + }, + { + "epoch": 90.0775694893342, + "grad_norm": 2.640467643737793, + "learning_rate": 0.001, + "loss": 0.9637, + "step": 278700 + }, + { + "epoch": 90.10989010989012, + "grad_norm": 2.6437063217163086, + "learning_rate": 0.001, + "loss": 0.9518, + "step": 278800 + }, + { + "epoch": 90.14221073044602, + "grad_norm": 2.0729470252990723, + "learning_rate": 0.001, + "loss": 0.9644, + "step": 278900 + }, + { + "epoch": 90.17453135100195, + "grad_norm": 1.665229320526123, + "learning_rate": 0.001, + "loss": 0.9703, + "step": 279000 + }, + { + "epoch": 90.20685197155785, + "grad_norm": 2.4748964309692383, + "learning_rate": 0.001, + "loss": 0.9828, + "step": 279100 + }, + { + "epoch": 90.23917259211377, + "grad_norm": 2.3275153636932373, + "learning_rate": 0.001, + "loss": 0.9758, + "step": 279200 + }, + { + "epoch": 90.27149321266968, + "grad_norm": 2.267796516418457, + "learning_rate": 0.001, + "loss": 0.9762, + "step": 279300 + }, + { + "epoch": 90.3038138332256, + "grad_norm": 3.0510411262512207, + "learning_rate": 0.001, + "loss": 0.9634, + "step": 279400 + }, + { + "epoch": 90.33613445378151, + "grad_norm": 1.9009895324707031, + "learning_rate": 0.001, + "loss": 0.9854, + "step": 279500 + }, + { + "epoch": 90.36845507433743, + "grad_norm": 2.0102601051330566, + "learning_rate": 0.001, + "loss": 0.9867, + "step": 279600 + }, + { + "epoch": 90.40077569489334, + "grad_norm": 1.7540924549102783, + "learning_rate": 0.001, + "loss": 1.0011, + "step": 279700 + }, + { + "epoch": 90.43309631544926, + "grad_norm": 2.0557961463928223, + "learning_rate": 0.001, + "loss": 0.9923, + "step": 279800 + }, + { + "epoch": 90.46541693600517, + "grad_norm": 2.086256980895996, + "learning_rate": 0.001, + "loss": 0.9925, + "step": 279900 + }, + { + "epoch": 90.49773755656109, + "grad_norm": 2.0977132320404053, + "learning_rate": 0.001, + "loss": 1.0084, + "step": 280000 + }, + { + "epoch": 90.530058177117, + "grad_norm": 2.174241304397583, + "learning_rate": 0.001, + "loss": 1.0013, + "step": 280100 + }, + { + "epoch": 90.56237879767292, + "grad_norm": 1.7331504821777344, + "learning_rate": 0.001, + "loss": 1.0204, + "step": 280200 + }, + { + "epoch": 90.59469941822883, + "grad_norm": 1.912105679512024, + "learning_rate": 0.001, + "loss": 1.0077, + "step": 280300 + }, + { + "epoch": 90.62702003878475, + "grad_norm": 1.9463039636611938, + "learning_rate": 0.001, + "loss": 1.014, + "step": 280400 + }, + { + "epoch": 90.65934065934066, + "grad_norm": 1.9718258380889893, + "learning_rate": 0.001, + "loss": 1.0205, + "step": 280500 + }, + { + "epoch": 90.69166127989658, + "grad_norm": 2.5784502029418945, + "learning_rate": 0.001, + "loss": 1.0149, + "step": 280600 + }, + { + "epoch": 90.72398190045249, + "grad_norm": 1.9825204610824585, + "learning_rate": 0.001, + "loss": 1.025, + "step": 280700 + }, + { + "epoch": 90.75630252100841, + "grad_norm": 1.9849514961242676, + "learning_rate": 0.001, + "loss": 1.0323, + "step": 280800 + }, + { + "epoch": 90.78862314156432, + "grad_norm": 2.067756414413452, + "learning_rate": 0.001, + "loss": 1.0378, + "step": 280900 + }, + { + "epoch": 90.82094376212024, + "grad_norm": 3.0538594722747803, + "learning_rate": 0.001, + "loss": 1.0244, + "step": 281000 + }, + { + "epoch": 90.85326438267614, + "grad_norm": 2.0136399269104004, + "learning_rate": 0.001, + "loss": 1.0411, + "step": 281100 + }, + { + "epoch": 90.88558500323207, + "grad_norm": 1.8992334604263306, + "learning_rate": 0.001, + "loss": 1.0451, + "step": 281200 + }, + { + "epoch": 90.91790562378797, + "grad_norm": 2.309920310974121, + "learning_rate": 0.001, + "loss": 1.0304, + "step": 281300 + }, + { + "epoch": 90.9502262443439, + "grad_norm": 2.1204164028167725, + "learning_rate": 0.001, + "loss": 1.0425, + "step": 281400 + }, + { + "epoch": 90.9825468648998, + "grad_norm": 2.363699436187744, + "learning_rate": 0.001, + "loss": 1.0557, + "step": 281500 + }, + { + "epoch": 91.01486748545572, + "grad_norm": 2.024256706237793, + "learning_rate": 0.001, + "loss": 0.98, + "step": 281600 + }, + { + "epoch": 91.04718810601163, + "grad_norm": 1.8037205934524536, + "learning_rate": 0.001, + "loss": 0.9362, + "step": 281700 + }, + { + "epoch": 91.07950872656755, + "grad_norm": 2.3736801147460938, + "learning_rate": 0.001, + "loss": 0.944, + "step": 281800 + }, + { + "epoch": 91.11182934712346, + "grad_norm": 2.0845656394958496, + "learning_rate": 0.001, + "loss": 0.9526, + "step": 281900 + }, + { + "epoch": 91.14414996767938, + "grad_norm": 1.9515149593353271, + "learning_rate": 0.001, + "loss": 0.9533, + "step": 282000 + }, + { + "epoch": 91.17647058823529, + "grad_norm": 2.203016757965088, + "learning_rate": 0.001, + "loss": 0.9585, + "step": 282100 + }, + { + "epoch": 91.20879120879121, + "grad_norm": 2.299415349960327, + "learning_rate": 0.001, + "loss": 0.952, + "step": 282200 + }, + { + "epoch": 91.24111182934712, + "grad_norm": 1.9382708072662354, + "learning_rate": 0.001, + "loss": 0.9691, + "step": 282300 + }, + { + "epoch": 91.27343244990304, + "grad_norm": 2.493281602859497, + "learning_rate": 0.001, + "loss": 0.9665, + "step": 282400 + }, + { + "epoch": 91.30575307045895, + "grad_norm": 2.7134528160095215, + "learning_rate": 0.001, + "loss": 0.971, + "step": 282500 + }, + { + "epoch": 91.33807369101487, + "grad_norm": 2.434889078140259, + "learning_rate": 0.001, + "loss": 0.973, + "step": 282600 + }, + { + "epoch": 91.37039431157078, + "grad_norm": 1.6504502296447754, + "learning_rate": 0.001, + "loss": 0.9882, + "step": 282700 + }, + { + "epoch": 91.4027149321267, + "grad_norm": 2.516812324523926, + "learning_rate": 0.001, + "loss": 0.9917, + "step": 282800 + }, + { + "epoch": 91.4350355526826, + "grad_norm": 2.250521421432495, + "learning_rate": 0.001, + "loss": 0.9858, + "step": 282900 + }, + { + "epoch": 91.46735617323853, + "grad_norm": 2.721055746078491, + "learning_rate": 0.001, + "loss": 0.9903, + "step": 283000 + }, + { + "epoch": 91.49967679379444, + "grad_norm": 2.0062334537506104, + "learning_rate": 0.001, + "loss": 1.0004, + "step": 283100 + }, + { + "epoch": 91.53199741435036, + "grad_norm": 2.329655408859253, + "learning_rate": 0.001, + "loss": 0.9853, + "step": 283200 + }, + { + "epoch": 91.56431803490626, + "grad_norm": 1.8753358125686646, + "learning_rate": 0.001, + "loss": 1.0124, + "step": 283300 + }, + { + "epoch": 91.59663865546219, + "grad_norm": 2.209588050842285, + "learning_rate": 0.001, + "loss": 1.0197, + "step": 283400 + }, + { + "epoch": 91.6289592760181, + "grad_norm": 2.219310760498047, + "learning_rate": 0.001, + "loss": 1.0008, + "step": 283500 + }, + { + "epoch": 91.66127989657402, + "grad_norm": 2.1283211708068848, + "learning_rate": 0.001, + "loss": 1.0384, + "step": 283600 + }, + { + "epoch": 91.69360051712992, + "grad_norm": 1.7601873874664307, + "learning_rate": 0.001, + "loss": 1.0298, + "step": 283700 + }, + { + "epoch": 91.72592113768584, + "grad_norm": 1.8941537141799927, + "learning_rate": 0.001, + "loss": 1.0165, + "step": 283800 + }, + { + "epoch": 91.75824175824175, + "grad_norm": 2.771538734436035, + "learning_rate": 0.001, + "loss": 1.0324, + "step": 283900 + }, + { + "epoch": 91.79056237879767, + "grad_norm": 2.580246686935425, + "learning_rate": 0.001, + "loss": 1.0321, + "step": 284000 + }, + { + "epoch": 91.82288299935358, + "grad_norm": 2.1796884536743164, + "learning_rate": 0.001, + "loss": 1.0281, + "step": 284100 + }, + { + "epoch": 91.8552036199095, + "grad_norm": 2.224008798599243, + "learning_rate": 0.001, + "loss": 1.0264, + "step": 284200 + }, + { + "epoch": 91.88752424046541, + "grad_norm": 2.6273763179779053, + "learning_rate": 0.001, + "loss": 1.0378, + "step": 284300 + }, + { + "epoch": 91.91984486102133, + "grad_norm": 2.327000856399536, + "learning_rate": 0.001, + "loss": 1.0412, + "step": 284400 + }, + { + "epoch": 91.95216548157724, + "grad_norm": 2.1067605018615723, + "learning_rate": 0.001, + "loss": 1.0331, + "step": 284500 + }, + { + "epoch": 91.98448610213316, + "grad_norm": 2.0482475757598877, + "learning_rate": 0.001, + "loss": 1.0539, + "step": 284600 + }, + { + "epoch": 92.01680672268908, + "grad_norm": 2.45538067817688, + "learning_rate": 0.001, + "loss": 0.9696, + "step": 284700 + }, + { + "epoch": 92.04912734324499, + "grad_norm": 2.8478100299835205, + "learning_rate": 0.001, + "loss": 0.9344, + "step": 284800 + }, + { + "epoch": 92.08144796380091, + "grad_norm": 2.6036441326141357, + "learning_rate": 0.001, + "loss": 0.9354, + "step": 284900 + }, + { + "epoch": 92.11376858435682, + "grad_norm": 3.401431083679199, + "learning_rate": 0.001, + "loss": 0.9437, + "step": 285000 + }, + { + "epoch": 92.14608920491274, + "grad_norm": 2.8432137966156006, + "learning_rate": 0.001, + "loss": 0.9512, + "step": 285100 + }, + { + "epoch": 92.17840982546865, + "grad_norm": 2.483217477798462, + "learning_rate": 0.001, + "loss": 0.9671, + "step": 285200 + }, + { + "epoch": 92.21073044602457, + "grad_norm": 2.2827484607696533, + "learning_rate": 0.001, + "loss": 0.9642, + "step": 285300 + }, + { + "epoch": 92.24305106658048, + "grad_norm": 1.9338748455047607, + "learning_rate": 0.001, + "loss": 0.9574, + "step": 285400 + }, + { + "epoch": 92.2753716871364, + "grad_norm": 3.546093225479126, + "learning_rate": 0.001, + "loss": 0.9691, + "step": 285500 + }, + { + "epoch": 92.3076923076923, + "grad_norm": 2.77447247505188, + "learning_rate": 0.001, + "loss": 0.967, + "step": 285600 + }, + { + "epoch": 92.34001292824823, + "grad_norm": 2.7795073986053467, + "learning_rate": 0.001, + "loss": 0.9534, + "step": 285700 + }, + { + "epoch": 92.37233354880414, + "grad_norm": 3.57477068901062, + "learning_rate": 0.001, + "loss": 0.9676, + "step": 285800 + }, + { + "epoch": 92.40465416936006, + "grad_norm": 2.2975080013275146, + "learning_rate": 0.001, + "loss": 0.979, + "step": 285900 + }, + { + "epoch": 92.43697478991596, + "grad_norm": 2.965996026992798, + "learning_rate": 0.001, + "loss": 0.9877, + "step": 286000 + }, + { + "epoch": 92.46929541047189, + "grad_norm": 2.1376302242279053, + "learning_rate": 0.001, + "loss": 1.0011, + "step": 286100 + }, + { + "epoch": 92.5016160310278, + "grad_norm": 2.3904857635498047, + "learning_rate": 0.001, + "loss": 0.9948, + "step": 286200 + }, + { + "epoch": 92.53393665158372, + "grad_norm": 2.1248250007629395, + "learning_rate": 0.001, + "loss": 0.9912, + "step": 286300 + }, + { + "epoch": 92.56625727213962, + "grad_norm": 3.2705423831939697, + "learning_rate": 0.001, + "loss": 0.9846, + "step": 286400 + }, + { + "epoch": 92.59857789269554, + "grad_norm": 2.439633369445801, + "learning_rate": 0.001, + "loss": 0.9929, + "step": 286500 + }, + { + "epoch": 92.63089851325145, + "grad_norm": 2.881314277648926, + "learning_rate": 0.001, + "loss": 1.0093, + "step": 286600 + }, + { + "epoch": 92.66321913380737, + "grad_norm": 3.5969815254211426, + "learning_rate": 0.001, + "loss": 0.9981, + "step": 286700 + }, + { + "epoch": 92.69553975436328, + "grad_norm": 1.9576853513717651, + "learning_rate": 0.001, + "loss": 1.0196, + "step": 286800 + }, + { + "epoch": 92.7278603749192, + "grad_norm": 3.114851713180542, + "learning_rate": 0.001, + "loss": 1.0087, + "step": 286900 + }, + { + "epoch": 92.76018099547511, + "grad_norm": 2.5850727558135986, + "learning_rate": 0.001, + "loss": 1.012, + "step": 287000 + }, + { + "epoch": 92.79250161603103, + "grad_norm": 2.376007080078125, + "learning_rate": 0.001, + "loss": 1.0238, + "step": 287100 + }, + { + "epoch": 92.82482223658694, + "grad_norm": 2.491420269012451, + "learning_rate": 0.001, + "loss": 1.023, + "step": 287200 + }, + { + "epoch": 92.85714285714286, + "grad_norm": 2.0345892906188965, + "learning_rate": 0.001, + "loss": 1.0287, + "step": 287300 + }, + { + "epoch": 92.88946347769877, + "grad_norm": 4.1525702476501465, + "learning_rate": 0.001, + "loss": 1.037, + "step": 287400 + }, + { + "epoch": 92.92178409825469, + "grad_norm": 3.620197057723999, + "learning_rate": 0.001, + "loss": 1.0469, + "step": 287500 + }, + { + "epoch": 92.9541047188106, + "grad_norm": 3.1248631477355957, + "learning_rate": 0.001, + "loss": 1.0341, + "step": 287600 + }, + { + "epoch": 92.98642533936652, + "grad_norm": 2.6980485916137695, + "learning_rate": 0.001, + "loss": 1.0328, + "step": 287700 + }, + { + "epoch": 93.01874595992243, + "grad_norm": 2.4870355129241943, + "learning_rate": 0.001, + "loss": 0.9789, + "step": 287800 + }, + { + "epoch": 93.05106658047835, + "grad_norm": 2.198549270629883, + "learning_rate": 0.001, + "loss": 0.9173, + "step": 287900 + }, + { + "epoch": 93.08338720103426, + "grad_norm": 1.8441483974456787, + "learning_rate": 0.001, + "loss": 0.934, + "step": 288000 + }, + { + "epoch": 93.11570782159018, + "grad_norm": 2.0669450759887695, + "learning_rate": 0.001, + "loss": 0.9305, + "step": 288100 + }, + { + "epoch": 93.14802844214609, + "grad_norm": 2.3490703105926514, + "learning_rate": 0.001, + "loss": 0.9552, + "step": 288200 + }, + { + "epoch": 93.180349062702, + "grad_norm": 2.179919481277466, + "learning_rate": 0.001, + "loss": 0.9391, + "step": 288300 + }, + { + "epoch": 93.21266968325791, + "grad_norm": 1.8578362464904785, + "learning_rate": 0.001, + "loss": 0.942, + "step": 288400 + }, + { + "epoch": 93.24499030381384, + "grad_norm": 1.6484123468399048, + "learning_rate": 0.001, + "loss": 0.9545, + "step": 288500 + }, + { + "epoch": 93.27731092436974, + "grad_norm": 2.483506441116333, + "learning_rate": 0.001, + "loss": 0.9774, + "step": 288600 + }, + { + "epoch": 93.30963154492567, + "grad_norm": 1.8346855640411377, + "learning_rate": 0.001, + "loss": 0.9638, + "step": 288700 + }, + { + "epoch": 93.34195216548157, + "grad_norm": 1.6694176197052002, + "learning_rate": 0.001, + "loss": 0.9704, + "step": 288800 + }, + { + "epoch": 93.3742727860375, + "grad_norm": 2.2945752143859863, + "learning_rate": 0.001, + "loss": 0.9813, + "step": 288900 + }, + { + "epoch": 93.4065934065934, + "grad_norm": 2.2705020904541016, + "learning_rate": 0.001, + "loss": 0.9744, + "step": 289000 + }, + { + "epoch": 93.43891402714932, + "grad_norm": 2.5650382041931152, + "learning_rate": 0.001, + "loss": 0.9751, + "step": 289100 + }, + { + "epoch": 93.47123464770523, + "grad_norm": 1.7471798658370972, + "learning_rate": 0.001, + "loss": 0.9724, + "step": 289200 + }, + { + "epoch": 93.50355526826115, + "grad_norm": 2.5118908882141113, + "learning_rate": 0.001, + "loss": 0.9774, + "step": 289300 + }, + { + "epoch": 93.53587588881706, + "grad_norm": 1.7975685596466064, + "learning_rate": 0.001, + "loss": 0.9868, + "step": 289400 + }, + { + "epoch": 93.56819650937298, + "grad_norm": 2.0186667442321777, + "learning_rate": 0.001, + "loss": 0.9923, + "step": 289500 + }, + { + "epoch": 93.60051712992889, + "grad_norm": 3.608877420425415, + "learning_rate": 0.001, + "loss": 0.987, + "step": 289600 + }, + { + "epoch": 93.63283775048481, + "grad_norm": 2.746725559234619, + "learning_rate": 0.001, + "loss": 0.9888, + "step": 289700 + }, + { + "epoch": 93.66515837104072, + "grad_norm": 2.6709630489349365, + "learning_rate": 0.001, + "loss": 0.9949, + "step": 289800 + }, + { + "epoch": 93.69747899159664, + "grad_norm": 2.15086030960083, + "learning_rate": 0.001, + "loss": 0.9904, + "step": 289900 + }, + { + "epoch": 93.72979961215255, + "grad_norm": 2.7408602237701416, + "learning_rate": 0.001, + "loss": 1.0152, + "step": 290000 + }, + { + "epoch": 93.76212023270847, + "grad_norm": 2.362180233001709, + "learning_rate": 0.001, + "loss": 1.0091, + "step": 290100 + }, + { + "epoch": 93.79444085326438, + "grad_norm": 1.734055519104004, + "learning_rate": 0.001, + "loss": 1.0105, + "step": 290200 + }, + { + "epoch": 93.8267614738203, + "grad_norm": 1.879744052886963, + "learning_rate": 0.001, + "loss": 1.0108, + "step": 290300 + }, + { + "epoch": 93.8590820943762, + "grad_norm": 2.03096342086792, + "learning_rate": 0.001, + "loss": 1.0089, + "step": 290400 + }, + { + "epoch": 93.89140271493213, + "grad_norm": 1.8851549625396729, + "learning_rate": 0.001, + "loss": 1.0173, + "step": 290500 + }, + { + "epoch": 93.92372333548803, + "grad_norm": 2.0545690059661865, + "learning_rate": 0.001, + "loss": 1.0307, + "step": 290600 + }, + { + "epoch": 93.95604395604396, + "grad_norm": 2.3553688526153564, + "learning_rate": 0.001, + "loss": 1.023, + "step": 290700 + }, + { + "epoch": 93.98836457659988, + "grad_norm": 2.5887322425842285, + "learning_rate": 0.001, + "loss": 1.0195, + "step": 290800 + }, + { + "epoch": 94.02068519715579, + "grad_norm": 1.7967345714569092, + "learning_rate": 0.001, + "loss": 0.9646, + "step": 290900 + }, + { + "epoch": 94.0530058177117, + "grad_norm": 2.0765960216522217, + "learning_rate": 0.001, + "loss": 0.9075, + "step": 291000 + }, + { + "epoch": 94.08532643826761, + "grad_norm": 1.8205326795578003, + "learning_rate": 0.001, + "loss": 0.9234, + "step": 291100 + }, + { + "epoch": 94.11764705882354, + "grad_norm": 2.5845706462860107, + "learning_rate": 0.001, + "loss": 0.9408, + "step": 291200 + }, + { + "epoch": 94.14996767937944, + "grad_norm": 2.3111801147460938, + "learning_rate": 0.001, + "loss": 0.9202, + "step": 291300 + }, + { + "epoch": 94.18228829993537, + "grad_norm": 2.1115758419036865, + "learning_rate": 0.001, + "loss": 0.941, + "step": 291400 + }, + { + "epoch": 94.21460892049127, + "grad_norm": 1.778016448020935, + "learning_rate": 0.001, + "loss": 0.9442, + "step": 291500 + }, + { + "epoch": 94.2469295410472, + "grad_norm": 2.3348562717437744, + "learning_rate": 0.001, + "loss": 0.9538, + "step": 291600 + }, + { + "epoch": 94.2792501616031, + "grad_norm": 1.9034233093261719, + "learning_rate": 0.001, + "loss": 0.9438, + "step": 291700 + }, + { + "epoch": 94.31157078215902, + "grad_norm": 1.7321242094039917, + "learning_rate": 0.001, + "loss": 0.9677, + "step": 291800 + }, + { + "epoch": 94.34389140271493, + "grad_norm": 1.788539171218872, + "learning_rate": 0.001, + "loss": 0.9492, + "step": 291900 + }, + { + "epoch": 94.37621202327085, + "grad_norm": 1.9123979806900024, + "learning_rate": 0.001, + "loss": 0.9652, + "step": 292000 + }, + { + "epoch": 94.40853264382676, + "grad_norm": 2.391134262084961, + "learning_rate": 0.001, + "loss": 0.9594, + "step": 292100 + }, + { + "epoch": 94.44085326438268, + "grad_norm": 1.6947717666625977, + "learning_rate": 0.001, + "loss": 0.9634, + "step": 292200 + }, + { + "epoch": 94.47317388493859, + "grad_norm": 3.093050241470337, + "learning_rate": 0.001, + "loss": 0.9646, + "step": 292300 + }, + { + "epoch": 94.50549450549451, + "grad_norm": 2.0479140281677246, + "learning_rate": 0.001, + "loss": 0.9826, + "step": 292400 + }, + { + "epoch": 94.53781512605042, + "grad_norm": 2.06463623046875, + "learning_rate": 0.001, + "loss": 0.9834, + "step": 292500 + }, + { + "epoch": 94.57013574660634, + "grad_norm": 1.81849205493927, + "learning_rate": 0.001, + "loss": 0.976, + "step": 292600 + }, + { + "epoch": 94.60245636716225, + "grad_norm": 1.5885215997695923, + "learning_rate": 0.001, + "loss": 0.9767, + "step": 292700 + }, + { + "epoch": 94.63477698771817, + "grad_norm": 3.0277457237243652, + "learning_rate": 0.001, + "loss": 0.985, + "step": 292800 + }, + { + "epoch": 94.66709760827408, + "grad_norm": 1.899543046951294, + "learning_rate": 0.001, + "loss": 0.9824, + "step": 292900 + }, + { + "epoch": 94.69941822883, + "grad_norm": 2.1208620071411133, + "learning_rate": 0.001, + "loss": 1.0109, + "step": 293000 + }, + { + "epoch": 94.7317388493859, + "grad_norm": 2.451232671737671, + "learning_rate": 0.001, + "loss": 0.9952, + "step": 293100 + }, + { + "epoch": 94.76405946994183, + "grad_norm": 2.232257604598999, + "learning_rate": 0.001, + "loss": 1.0, + "step": 293200 + }, + { + "epoch": 94.79638009049773, + "grad_norm": 1.7241202592849731, + "learning_rate": 0.001, + "loss": 0.9966, + "step": 293300 + }, + { + "epoch": 94.82870071105366, + "grad_norm": 2.0741376876831055, + "learning_rate": 0.001, + "loss": 1.0213, + "step": 293400 + }, + { + "epoch": 94.86102133160956, + "grad_norm": 1.8772655725479126, + "learning_rate": 0.001, + "loss": 1.0044, + "step": 293500 + }, + { + "epoch": 94.89334195216549, + "grad_norm": 2.1163671016693115, + "learning_rate": 0.001, + "loss": 1.0106, + "step": 293600 + }, + { + "epoch": 94.9256625727214, + "grad_norm": 2.3701653480529785, + "learning_rate": 0.001, + "loss": 1.0054, + "step": 293700 + }, + { + "epoch": 94.95798319327731, + "grad_norm": 1.8388867378234863, + "learning_rate": 0.001, + "loss": 1.0137, + "step": 293800 + }, + { + "epoch": 94.99030381383322, + "grad_norm": 1.9023023843765259, + "learning_rate": 0.001, + "loss": 1.0238, + "step": 293900 + }, + { + "epoch": 95.02262443438914, + "grad_norm": 2.3602747917175293, + "learning_rate": 0.001, + "loss": 0.9515, + "step": 294000 + }, + { + "epoch": 95.05494505494505, + "grad_norm": 1.9043653011322021, + "learning_rate": 0.001, + "loss": 0.9271, + "step": 294100 + }, + { + "epoch": 95.08726567550097, + "grad_norm": 1.6567951440811157, + "learning_rate": 0.001, + "loss": 0.9284, + "step": 294200 + }, + { + "epoch": 95.11958629605688, + "grad_norm": 1.8202804327011108, + "learning_rate": 0.001, + "loss": 0.9266, + "step": 294300 + }, + { + "epoch": 95.1519069166128, + "grad_norm": 1.613312005996704, + "learning_rate": 0.001, + "loss": 0.9181, + "step": 294400 + }, + { + "epoch": 95.18422753716871, + "grad_norm": 2.1593878269195557, + "learning_rate": 0.001, + "loss": 0.932, + "step": 294500 + }, + { + "epoch": 95.21654815772463, + "grad_norm": 1.7870193719863892, + "learning_rate": 0.001, + "loss": 0.9439, + "step": 294600 + }, + { + "epoch": 95.24886877828054, + "grad_norm": 1.8779339790344238, + "learning_rate": 0.001, + "loss": 0.9384, + "step": 294700 + }, + { + "epoch": 95.28118939883646, + "grad_norm": 1.589490532875061, + "learning_rate": 0.001, + "loss": 0.9503, + "step": 294800 + }, + { + "epoch": 95.31351001939237, + "grad_norm": 2.23694109916687, + "learning_rate": 0.001, + "loss": 0.9386, + "step": 294900 + }, + { + "epoch": 95.34583063994829, + "grad_norm": 1.9749599695205688, + "learning_rate": 0.001, + "loss": 0.9478, + "step": 295000 + }, + { + "epoch": 95.3781512605042, + "grad_norm": 1.9359219074249268, + "learning_rate": 0.001, + "loss": 0.9549, + "step": 295100 + }, + { + "epoch": 95.41047188106012, + "grad_norm": 1.7539458274841309, + "learning_rate": 0.001, + "loss": 0.9532, + "step": 295200 + }, + { + "epoch": 95.44279250161603, + "grad_norm": 2.1427483558654785, + "learning_rate": 0.001, + "loss": 0.954, + "step": 295300 + }, + { + "epoch": 95.47511312217195, + "grad_norm": 1.6027040481567383, + "learning_rate": 0.001, + "loss": 0.9658, + "step": 295400 + }, + { + "epoch": 95.50743374272786, + "grad_norm": 1.9406092166900635, + "learning_rate": 0.001, + "loss": 0.9661, + "step": 295500 + }, + { + "epoch": 95.53975436328378, + "grad_norm": 1.6757909059524536, + "learning_rate": 0.001, + "loss": 0.9748, + "step": 295600 + }, + { + "epoch": 95.57207498383968, + "grad_norm": 1.8995640277862549, + "learning_rate": 0.001, + "loss": 0.9726, + "step": 295700 + }, + { + "epoch": 95.6043956043956, + "grad_norm": 1.8258506059646606, + "learning_rate": 0.001, + "loss": 0.9771, + "step": 295800 + }, + { + "epoch": 95.63671622495151, + "grad_norm": 2.0489180088043213, + "learning_rate": 0.001, + "loss": 0.9804, + "step": 295900 + }, + { + "epoch": 95.66903684550743, + "grad_norm": 1.8056089878082275, + "learning_rate": 0.001, + "loss": 0.9902, + "step": 296000 + }, + { + "epoch": 95.70135746606334, + "grad_norm": 1.7540534734725952, + "learning_rate": 0.001, + "loss": 0.9825, + "step": 296100 + }, + { + "epoch": 95.73367808661926, + "grad_norm": 1.9533356428146362, + "learning_rate": 0.001, + "loss": 0.9919, + "step": 296200 + }, + { + "epoch": 95.76599870717517, + "grad_norm": 2.378761053085327, + "learning_rate": 0.001, + "loss": 0.99, + "step": 296300 + }, + { + "epoch": 95.7983193277311, + "grad_norm": 1.724198818206787, + "learning_rate": 0.001, + "loss": 0.9953, + "step": 296400 + }, + { + "epoch": 95.830639948287, + "grad_norm": 2.2945659160614014, + "learning_rate": 0.001, + "loss": 0.9999, + "step": 296500 + }, + { + "epoch": 95.86296056884292, + "grad_norm": 1.710324764251709, + "learning_rate": 0.001, + "loss": 1.0072, + "step": 296600 + }, + { + "epoch": 95.89528118939883, + "grad_norm": 1.5836914777755737, + "learning_rate": 0.001, + "loss": 0.9861, + "step": 296700 + }, + { + "epoch": 95.92760180995475, + "grad_norm": 1.6677262783050537, + "learning_rate": 0.001, + "loss": 0.9965, + "step": 296800 + }, + { + "epoch": 95.95992243051066, + "grad_norm": 1.6849901676177979, + "learning_rate": 0.001, + "loss": 1.0093, + "step": 296900 + }, + { + "epoch": 95.99224305106658, + "grad_norm": 1.8210523128509521, + "learning_rate": 0.001, + "loss": 1.0217, + "step": 297000 + }, + { + "epoch": 96.0245636716225, + "grad_norm": 1.721255898475647, + "learning_rate": 0.001, + "loss": 0.9372, + "step": 297100 + }, + { + "epoch": 96.05688429217841, + "grad_norm": 1.9859453439712524, + "learning_rate": 0.001, + "loss": 0.9033, + "step": 297200 + }, + { + "epoch": 96.08920491273433, + "grad_norm": 1.655301809310913, + "learning_rate": 0.001, + "loss": 0.9167, + "step": 297300 + }, + { + "epoch": 96.12152553329024, + "grad_norm": 2.0336883068084717, + "learning_rate": 0.001, + "loss": 0.9247, + "step": 297400 + }, + { + "epoch": 96.15384615384616, + "grad_norm": 2.0027976036071777, + "learning_rate": 0.001, + "loss": 0.9255, + "step": 297500 + }, + { + "epoch": 96.18616677440207, + "grad_norm": 1.4069693088531494, + "learning_rate": 0.001, + "loss": 0.9324, + "step": 297600 + }, + { + "epoch": 96.21848739495799, + "grad_norm": 1.8238356113433838, + "learning_rate": 0.001, + "loss": 0.9208, + "step": 297700 + }, + { + "epoch": 96.2508080155139, + "grad_norm": 1.9688968658447266, + "learning_rate": 0.001, + "loss": 0.9407, + "step": 297800 + }, + { + "epoch": 96.28312863606982, + "grad_norm": 1.9155817031860352, + "learning_rate": 0.001, + "loss": 0.939, + "step": 297900 + }, + { + "epoch": 96.31544925662573, + "grad_norm": 1.7525216341018677, + "learning_rate": 0.001, + "loss": 0.9424, + "step": 298000 + }, + { + "epoch": 96.34776987718165, + "grad_norm": 2.573129415512085, + "learning_rate": 0.001, + "loss": 0.9396, + "step": 298100 + }, + { + "epoch": 96.38009049773756, + "grad_norm": 2.0800180435180664, + "learning_rate": 0.001, + "loss": 0.9402, + "step": 298200 + }, + { + "epoch": 96.41241111829348, + "grad_norm": 1.723068118095398, + "learning_rate": 0.001, + "loss": 0.9481, + "step": 298300 + }, + { + "epoch": 96.44473173884938, + "grad_norm": 1.7221217155456543, + "learning_rate": 0.001, + "loss": 0.9542, + "step": 298400 + }, + { + "epoch": 96.4770523594053, + "grad_norm": 1.8597713708877563, + "learning_rate": 0.001, + "loss": 0.9532, + "step": 298500 + }, + { + "epoch": 96.50937297996121, + "grad_norm": 1.7525596618652344, + "learning_rate": 0.001, + "loss": 0.9557, + "step": 298600 + }, + { + "epoch": 96.54169360051714, + "grad_norm": 2.1579041481018066, + "learning_rate": 0.001, + "loss": 0.9692, + "step": 298700 + }, + { + "epoch": 96.57401422107304, + "grad_norm": 1.7372926473617554, + "learning_rate": 0.001, + "loss": 0.9585, + "step": 298800 + }, + { + "epoch": 96.60633484162896, + "grad_norm": 1.6883844137191772, + "learning_rate": 0.001, + "loss": 0.9598, + "step": 298900 + }, + { + "epoch": 96.63865546218487, + "grad_norm": 2.018057346343994, + "learning_rate": 0.001, + "loss": 0.9729, + "step": 299000 + }, + { + "epoch": 96.6709760827408, + "grad_norm": 1.6414058208465576, + "learning_rate": 0.001, + "loss": 0.9783, + "step": 299100 + }, + { + "epoch": 96.7032967032967, + "grad_norm": 1.5033763647079468, + "learning_rate": 0.001, + "loss": 0.9791, + "step": 299200 + }, + { + "epoch": 96.73561732385262, + "grad_norm": 1.5835376977920532, + "learning_rate": 0.001, + "loss": 0.9804, + "step": 299300 + }, + { + "epoch": 96.76793794440853, + "grad_norm": 2.2092549800872803, + "learning_rate": 0.001, + "loss": 1.0009, + "step": 299400 + }, + { + "epoch": 96.80025856496445, + "grad_norm": 1.7708107233047485, + "learning_rate": 0.001, + "loss": 0.9765, + "step": 299500 + }, + { + "epoch": 96.83257918552036, + "grad_norm": 1.8122010231018066, + "learning_rate": 0.001, + "loss": 1.0012, + "step": 299600 + }, + { + "epoch": 96.86489980607628, + "grad_norm": 1.8053666353225708, + "learning_rate": 0.001, + "loss": 0.9871, + "step": 299700 + }, + { + "epoch": 96.89722042663219, + "grad_norm": 2.1326189041137695, + "learning_rate": 0.001, + "loss": 0.9737, + "step": 299800 + }, + { + "epoch": 96.92954104718811, + "grad_norm": 2.2978103160858154, + "learning_rate": 0.001, + "loss": 0.9991, + "step": 299900 + }, + { + "epoch": 96.96186166774402, + "grad_norm": 1.8257157802581787, + "learning_rate": 0.001, + "loss": 1.0042, + "step": 300000 + }, + { + "epoch": 96.99418228829994, + "grad_norm": 1.9147756099700928, + "learning_rate": 0.001, + "loss": 0.9982, + "step": 300100 + }, + { + "epoch": 97.02650290885585, + "grad_norm": 1.7436898946762085, + "learning_rate": 0.001, + "loss": 0.9217, + "step": 300200 + }, + { + "epoch": 97.05882352941177, + "grad_norm": 2.3132598400115967, + "learning_rate": 0.001, + "loss": 0.9009, + "step": 300300 + }, + { + "epoch": 97.09114414996768, + "grad_norm": 1.7439414262771606, + "learning_rate": 0.001, + "loss": 0.9058, + "step": 300400 + }, + { + "epoch": 97.1234647705236, + "grad_norm": 1.6833429336547852, + "learning_rate": 0.001, + "loss": 0.8999, + "step": 300500 + }, + { + "epoch": 97.1557853910795, + "grad_norm": 1.65819251537323, + "learning_rate": 0.001, + "loss": 0.9105, + "step": 300600 + }, + { + "epoch": 97.18810601163543, + "grad_norm": 1.7139781713485718, + "learning_rate": 0.001, + "loss": 0.9218, + "step": 300700 + }, + { + "epoch": 97.22042663219133, + "grad_norm": 2.181692361831665, + "learning_rate": 0.001, + "loss": 0.9336, + "step": 300800 + }, + { + "epoch": 97.25274725274726, + "grad_norm": 2.0716326236724854, + "learning_rate": 0.001, + "loss": 0.9164, + "step": 300900 + }, + { + "epoch": 97.28506787330316, + "grad_norm": 2.2653331756591797, + "learning_rate": 0.001, + "loss": 0.9367, + "step": 301000 + }, + { + "epoch": 97.31738849385908, + "grad_norm": 2.1563384532928467, + "learning_rate": 0.001, + "loss": 0.9216, + "step": 301100 + }, + { + "epoch": 97.34970911441499, + "grad_norm": 2.309046983718872, + "learning_rate": 0.001, + "loss": 0.9338, + "step": 301200 + }, + { + "epoch": 97.38202973497091, + "grad_norm": 2.024261236190796, + "learning_rate": 0.001, + "loss": 0.9365, + "step": 301300 + }, + { + "epoch": 97.41435035552682, + "grad_norm": 2.32181978225708, + "learning_rate": 0.001, + "loss": 0.9496, + "step": 301400 + }, + { + "epoch": 97.44667097608274, + "grad_norm": 1.8799872398376465, + "learning_rate": 0.001, + "loss": 0.9549, + "step": 301500 + }, + { + "epoch": 97.47899159663865, + "grad_norm": 2.4056990146636963, + "learning_rate": 0.001, + "loss": 0.954, + "step": 301600 + }, + { + "epoch": 97.51131221719457, + "grad_norm": 2.1647398471832275, + "learning_rate": 0.001, + "loss": 0.9651, + "step": 301700 + }, + { + "epoch": 97.54363283775048, + "grad_norm": 2.175403356552124, + "learning_rate": 0.001, + "loss": 0.9534, + "step": 301800 + }, + { + "epoch": 97.5759534583064, + "grad_norm": 1.9020843505859375, + "learning_rate": 0.001, + "loss": 0.9618, + "step": 301900 + }, + { + "epoch": 97.60827407886231, + "grad_norm": 2.246063232421875, + "learning_rate": 0.001, + "loss": 0.9539, + "step": 302000 + }, + { + "epoch": 97.64059469941823, + "grad_norm": 1.7729310989379883, + "learning_rate": 0.001, + "loss": 0.9634, + "step": 302100 + }, + { + "epoch": 97.67291531997414, + "grad_norm": 1.6846141815185547, + "learning_rate": 0.001, + "loss": 0.9614, + "step": 302200 + }, + { + "epoch": 97.70523594053006, + "grad_norm": 1.7244802713394165, + "learning_rate": 0.001, + "loss": 0.9859, + "step": 302300 + }, + { + "epoch": 97.73755656108597, + "grad_norm": 2.1463193893432617, + "learning_rate": 0.001, + "loss": 0.9803, + "step": 302400 + }, + { + "epoch": 97.76987718164189, + "grad_norm": 1.5155985355377197, + "learning_rate": 0.001, + "loss": 0.9796, + "step": 302500 + }, + { + "epoch": 97.8021978021978, + "grad_norm": 2.3664512634277344, + "learning_rate": 0.001, + "loss": 0.9756, + "step": 302600 + }, + { + "epoch": 97.83451842275372, + "grad_norm": 2.193302631378174, + "learning_rate": 0.001, + "loss": 0.9786, + "step": 302700 + }, + { + "epoch": 97.86683904330962, + "grad_norm": 1.757538080215454, + "learning_rate": 0.001, + "loss": 0.9773, + "step": 302800 + }, + { + "epoch": 97.89915966386555, + "grad_norm": 2.0512661933898926, + "learning_rate": 0.001, + "loss": 0.9928, + "step": 302900 + }, + { + "epoch": 97.93148028442145, + "grad_norm": 1.9938185214996338, + "learning_rate": 0.001, + "loss": 0.9976, + "step": 303000 + }, + { + "epoch": 97.96380090497738, + "grad_norm": 2.04455304145813, + "learning_rate": 0.001, + "loss": 0.9859, + "step": 303100 + }, + { + "epoch": 97.99612152553328, + "grad_norm": 2.0681285858154297, + "learning_rate": 0.001, + "loss": 0.994, + "step": 303200 + }, + { + "epoch": 98.0284421460892, + "grad_norm": 1.7298678159713745, + "learning_rate": 0.001, + "loss": 0.9041, + "step": 303300 + }, + { + "epoch": 98.06076276664513, + "grad_norm": 1.8354662656784058, + "learning_rate": 0.001, + "loss": 0.888, + "step": 303400 + }, + { + "epoch": 98.09308338720103, + "grad_norm": 1.9386900663375854, + "learning_rate": 0.001, + "loss": 0.8944, + "step": 303500 + }, + { + "epoch": 98.12540400775696, + "grad_norm": 2.130673408508301, + "learning_rate": 0.001, + "loss": 0.9052, + "step": 303600 + }, + { + "epoch": 98.15772462831286, + "grad_norm": 1.854596734046936, + "learning_rate": 0.001, + "loss": 0.9047, + "step": 303700 + }, + { + "epoch": 98.19004524886878, + "grad_norm": 1.988502860069275, + "learning_rate": 0.001, + "loss": 0.9114, + "step": 303800 + }, + { + "epoch": 98.22236586942469, + "grad_norm": 1.8438735008239746, + "learning_rate": 0.001, + "loss": 0.905, + "step": 303900 + }, + { + "epoch": 98.25468648998061, + "grad_norm": 1.9197919368743896, + "learning_rate": 0.001, + "loss": 0.9029, + "step": 304000 + }, + { + "epoch": 98.28700711053652, + "grad_norm": 1.8311904668807983, + "learning_rate": 0.001, + "loss": 0.9201, + "step": 304100 + }, + { + "epoch": 98.31932773109244, + "grad_norm": 2.5999064445495605, + "learning_rate": 0.001, + "loss": 0.93, + "step": 304200 + }, + { + "epoch": 98.35164835164835, + "grad_norm": 2.2026302814483643, + "learning_rate": 0.001, + "loss": 0.9327, + "step": 304300 + }, + { + "epoch": 98.38396897220427, + "grad_norm": 2.3874950408935547, + "learning_rate": 0.001, + "loss": 0.9415, + "step": 304400 + }, + { + "epoch": 98.41628959276018, + "grad_norm": 1.6854950189590454, + "learning_rate": 0.001, + "loss": 0.9401, + "step": 304500 + }, + { + "epoch": 98.4486102133161, + "grad_norm": 1.915907621383667, + "learning_rate": 0.001, + "loss": 0.9611, + "step": 304600 + }, + { + "epoch": 98.48093083387201, + "grad_norm": 2.3232321739196777, + "learning_rate": 0.001, + "loss": 0.9398, + "step": 304700 + }, + { + "epoch": 98.51325145442793, + "grad_norm": 1.8774093389511108, + "learning_rate": 0.001, + "loss": 0.9582, + "step": 304800 + }, + { + "epoch": 98.54557207498384, + "grad_norm": 2.2585456371307373, + "learning_rate": 0.001, + "loss": 0.9513, + "step": 304900 + }, + { + "epoch": 98.57789269553976, + "grad_norm": 1.8954609632492065, + "learning_rate": 0.001, + "loss": 0.9578, + "step": 305000 + }, + { + "epoch": 98.61021331609567, + "grad_norm": 2.1864609718322754, + "learning_rate": 0.001, + "loss": 0.9581, + "step": 305100 + }, + { + "epoch": 98.64253393665159, + "grad_norm": 2.2927186489105225, + "learning_rate": 0.001, + "loss": 0.956, + "step": 305200 + }, + { + "epoch": 98.6748545572075, + "grad_norm": 2.110893964767456, + "learning_rate": 0.001, + "loss": 0.9598, + "step": 305300 + }, + { + "epoch": 98.70717517776342, + "grad_norm": 1.7718578577041626, + "learning_rate": 0.001, + "loss": 0.95, + "step": 305400 + }, + { + "epoch": 98.73949579831933, + "grad_norm": 1.7689002752304077, + "learning_rate": 0.001, + "loss": 0.9672, + "step": 305500 + }, + { + "epoch": 98.77181641887525, + "grad_norm": 2.084601879119873, + "learning_rate": 0.001, + "loss": 0.9858, + "step": 305600 + }, + { + "epoch": 98.80413703943115, + "grad_norm": 1.817209243774414, + "learning_rate": 0.001, + "loss": 0.9709, + "step": 305700 + }, + { + "epoch": 98.83645765998708, + "grad_norm": 2.194476842880249, + "learning_rate": 0.001, + "loss": 0.9676, + "step": 305800 + }, + { + "epoch": 98.86877828054298, + "grad_norm": 2.089932680130005, + "learning_rate": 0.001, + "loss": 0.9804, + "step": 305900 + }, + { + "epoch": 98.9010989010989, + "grad_norm": 1.8631789684295654, + "learning_rate": 0.001, + "loss": 0.9771, + "step": 306000 + }, + { + "epoch": 98.93341952165481, + "grad_norm": 2.2455215454101562, + "learning_rate": 0.001, + "loss": 0.9932, + "step": 306100 + }, + { + "epoch": 98.96574014221073, + "grad_norm": 1.965256690979004, + "learning_rate": 0.001, + "loss": 0.9878, + "step": 306200 + }, + { + "epoch": 98.99806076276664, + "grad_norm": 2.0805888175964355, + "learning_rate": 0.001, + "loss": 0.9783, + "step": 306300 + }, + { + "epoch": 99.03038138332256, + "grad_norm": 1.8865318298339844, + "learning_rate": 0.001, + "loss": 0.8933, + "step": 306400 + }, + { + "epoch": 99.06270200387847, + "grad_norm": 1.988250970840454, + "learning_rate": 0.001, + "loss": 0.8875, + "step": 306500 + }, + { + "epoch": 99.09502262443439, + "grad_norm": 1.8929754495620728, + "learning_rate": 0.001, + "loss": 0.892, + "step": 306600 + }, + { + "epoch": 99.1273432449903, + "grad_norm": 2.4540634155273438, + "learning_rate": 0.001, + "loss": 0.886, + "step": 306700 + }, + { + "epoch": 99.15966386554622, + "grad_norm": 2.009747266769409, + "learning_rate": 0.001, + "loss": 0.904, + "step": 306800 + }, + { + "epoch": 99.19198448610213, + "grad_norm": 1.8114582300186157, + "learning_rate": 0.001, + "loss": 0.9166, + "step": 306900 + }, + { + "epoch": 99.22430510665805, + "grad_norm": 2.278742790222168, + "learning_rate": 0.001, + "loss": 0.9117, + "step": 307000 + }, + { + "epoch": 99.25662572721396, + "grad_norm": 2.1780436038970947, + "learning_rate": 0.001, + "loss": 0.9065, + "step": 307100 + }, + { + "epoch": 99.28894634776988, + "grad_norm": 2.099867343902588, + "learning_rate": 0.001, + "loss": 0.9221, + "step": 307200 + }, + { + "epoch": 99.32126696832579, + "grad_norm": 2.2217254638671875, + "learning_rate": 0.001, + "loss": 0.9354, + "step": 307300 + }, + { + "epoch": 99.35358758888171, + "grad_norm": 1.9928747415542603, + "learning_rate": 0.001, + "loss": 0.9202, + "step": 307400 + }, + { + "epoch": 99.38590820943762, + "grad_norm": 1.8501205444335938, + "learning_rate": 0.001, + "loss": 0.9434, + "step": 307500 + }, + { + "epoch": 99.41822882999354, + "grad_norm": 2.4966423511505127, + "learning_rate": 0.001, + "loss": 0.9282, + "step": 307600 + }, + { + "epoch": 99.45054945054945, + "grad_norm": 2.4920759201049805, + "learning_rate": 0.001, + "loss": 0.9306, + "step": 307700 + }, + { + "epoch": 99.48287007110537, + "grad_norm": 2.0279624462127686, + "learning_rate": 0.001, + "loss": 0.9554, + "step": 307800 + }, + { + "epoch": 99.51519069166127, + "grad_norm": 2.1771886348724365, + "learning_rate": 0.001, + "loss": 0.9452, + "step": 307900 + }, + { + "epoch": 99.5475113122172, + "grad_norm": 2.0133235454559326, + "learning_rate": 0.001, + "loss": 0.9396, + "step": 308000 + }, + { + "epoch": 99.5798319327731, + "grad_norm": 2.3399863243103027, + "learning_rate": 0.001, + "loss": 0.9375, + "step": 308100 + }, + { + "epoch": 99.61215255332903, + "grad_norm": 2.099194288253784, + "learning_rate": 0.001, + "loss": 0.9495, + "step": 308200 + }, + { + "epoch": 99.64447317388493, + "grad_norm": 2.0607011318206787, + "learning_rate": 0.001, + "loss": 0.9423, + "step": 308300 + }, + { + "epoch": 99.67679379444085, + "grad_norm": 2.133153200149536, + "learning_rate": 0.001, + "loss": 0.9684, + "step": 308400 + }, + { + "epoch": 99.70911441499676, + "grad_norm": 2.1646578311920166, + "learning_rate": 0.001, + "loss": 0.9546, + "step": 308500 + }, + { + "epoch": 99.74143503555268, + "grad_norm": 1.746728777885437, + "learning_rate": 0.001, + "loss": 0.9624, + "step": 308600 + }, + { + "epoch": 99.77375565610859, + "grad_norm": 2.2110424041748047, + "learning_rate": 0.001, + "loss": 0.9573, + "step": 308700 + }, + { + "epoch": 99.80607627666451, + "grad_norm": 2.3667218685150146, + "learning_rate": 0.001, + "loss": 0.9723, + "step": 308800 + }, + { + "epoch": 99.83839689722042, + "grad_norm": 2.146162986755371, + "learning_rate": 0.001, + "loss": 0.9695, + "step": 308900 + }, + { + "epoch": 99.87071751777634, + "grad_norm": 2.2320897579193115, + "learning_rate": 0.001, + "loss": 0.9716, + "step": 309000 + }, + { + "epoch": 99.90303813833225, + "grad_norm": 2.1772756576538086, + "learning_rate": 0.001, + "loss": 0.9756, + "step": 309100 + }, + { + "epoch": 99.93535875888817, + "grad_norm": 2.2531981468200684, + "learning_rate": 0.001, + "loss": 0.9768, + "step": 309200 + }, + { + "epoch": 99.96767937944408, + "grad_norm": 1.8836907148361206, + "learning_rate": 0.001, + "loss": 0.9641, + "step": 309300 + }, + { + "epoch": 100.0, + "grad_norm": 2.6943843364715576, + "learning_rate": 0.001, + "loss": 0.9524, + "step": 309400 + }, + { + "epoch": 100.0, + "step": 309400, + "total_flos": 3.2618021998218854e+17, + "train_loss": 1.6592349118391159, + "train_runtime": 40857.8788, + "train_samples_per_second": 242.303, + "train_steps_per_second": 7.573 + } + ], + "logging_steps": 100, + "max_steps": 309400, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2618021998218854e+17, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}