{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.8458242699887943,
  "eval_steps": 500,
  "global_step": 7000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.001318304660206974,
      "grad_norm": 4.59375,
      "learning_rate": 0.0002,
      "loss": 1.9624,
      "step": 5
    },
    {
      "epoch": 0.002636609320413948,
      "grad_norm": 1.7421875,
      "learning_rate": 0.00019986805647183008,
      "loss": 0.6513,
      "step": 10
    },
    {
      "epoch": 0.003954913980620921,
      "grad_norm": 1.84375,
      "learning_rate": 0.00019973611294366012,
      "loss": 0.1146,
      "step": 15
    },
    {
      "epoch": 0.005273218640827896,
      "grad_norm": 1.3203125,
      "learning_rate": 0.0001996041694154902,
      "loss": 0.0529,
      "step": 20
    },
    {
      "epoch": 0.006591523301034869,
      "grad_norm": 0.40234375,
      "learning_rate": 0.00019947222588732023,
      "loss": 0.1214,
      "step": 25
    },
    {
      "epoch": 0.007909827961241843,
      "grad_norm": 1.5390625,
      "learning_rate": 0.0001993402823591503,
      "loss": 0.0919,
      "step": 30
    },
    {
      "epoch": 0.009228132621448816,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.00019920833883098034,
      "loss": 0.09,
      "step": 35
    },
    {
      "epoch": 0.010546437281655791,
      "grad_norm": 1.53125,
      "learning_rate": 0.0001990763953028104,
      "loss": 0.1945,
      "step": 40
    },
    {
      "epoch": 0.011864741941862765,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019894445177464048,
      "loss": 0.1259,
      "step": 45
    },
    {
      "epoch": 0.013183046602069738,
      "grad_norm": 0.609375,
      "learning_rate": 0.00019881250824647052,
      "loss": 0.027,
      "step": 50
    },
    {
      "epoch": 0.014501351262276712,
      "grad_norm": 0.369140625,
      "learning_rate": 0.00019868056471830057,
      "loss": 0.1068,
      "step": 55
    },
    {
      "epoch": 0.015819655922483685,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00019854862119013064,
      "loss": 0.0542,
      "step": 60
    },
    {
      "epoch": 0.01713796058269066,
      "grad_norm": 0.055419921875,
      "learning_rate": 0.00019841667766196068,
      "loss": 0.0901,
      "step": 65
    },
    {
      "epoch": 0.018456265242897632,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.00019828473413379075,
      "loss": 0.0091,
      "step": 70
    },
    {
      "epoch": 0.019774569903104607,
      "grad_norm": 0.0079345703125,
      "learning_rate": 0.0001981527906056208,
      "loss": 0.0744,
      "step": 75
    },
    {
      "epoch": 0.021092874563311582,
      "grad_norm": 0.65234375,
      "learning_rate": 0.00019802084707745086,
      "loss": 0.1108,
      "step": 80
    },
    {
      "epoch": 0.022411179223518554,
      "grad_norm": 0.50390625,
      "learning_rate": 0.0001978889035492809,
      "loss": 0.0446,
      "step": 85
    },
    {
      "epoch": 0.02372948388372553,
      "grad_norm": 0.1787109375,
      "learning_rate": 0.00019775696002111097,
      "loss": 0.0982,
      "step": 90
    },
    {
      "epoch": 0.0250477885439325,
      "grad_norm": 0.490234375,
      "learning_rate": 0.00019762501649294104,
      "loss": 0.1035,
      "step": 95
    },
    {
      "epoch": 0.026366093204139476,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.00019749307296477108,
      "loss": 0.0401,
      "step": 100
    },
    {
      "epoch": 0.02768439786434645,
      "grad_norm": 0.16015625,
      "learning_rate": 0.00019736112943660115,
      "loss": 0.0309,
      "step": 105
    },
    {
      "epoch": 0.029002702524553423,
      "grad_norm": 1.359375,
      "learning_rate": 0.0001972291859084312,
      "loss": 0.1032,
      "step": 110
    },
    {
      "epoch": 0.0303210071847604,
      "grad_norm": 0.52734375,
      "learning_rate": 0.00019709724238026126,
      "loss": 0.0811,
      "step": 115
    },
    {
      "epoch": 0.03163931184496737,
      "grad_norm": 0.177734375,
      "learning_rate": 0.00019696529885209133,
      "loss": 0.0258,
      "step": 120
    },
    {
      "epoch": 0.03295761650517435,
      "grad_norm": 0.234375,
      "learning_rate": 0.00019683335532392137,
      "loss": 0.0437,
      "step": 125
    },
    {
      "epoch": 0.03427592116538132,
      "grad_norm": 1.3046875,
      "learning_rate": 0.00019670141179575144,
      "loss": 0.0967,
      "step": 130
    },
    {
      "epoch": 0.03559422582558829,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019656946826758148,
      "loss": 0.0132,
      "step": 135
    },
    {
      "epoch": 0.036912530485795264,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00019643752473941155,
      "loss": 0.0396,
      "step": 140
    },
    {
      "epoch": 0.03823083514600224,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0001963055812112416,
      "loss": 0.0449,
      "step": 145
    },
    {
      "epoch": 0.039549139806209214,
      "grad_norm": 0.2021484375,
      "learning_rate": 0.00019617363768307166,
      "loss": 0.1196,
      "step": 150
    },
    {
      "epoch": 0.040867444466416186,
      "grad_norm": 0.5859375,
      "learning_rate": 0.0001960416941549017,
      "loss": 0.0588,
      "step": 155
    },
    {
      "epoch": 0.042185749126623165,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.00019590975062673175,
      "loss": 0.0234,
      "step": 160
    },
    {
      "epoch": 0.04350405378683014,
      "grad_norm": 0.4921875,
      "learning_rate": 0.00019577780709856182,
      "loss": 0.0916,
      "step": 165
    },
    {
      "epoch": 0.04482235844703711,
      "grad_norm": 0.84375,
      "learning_rate": 0.0001956458635703919,
      "loss": 0.0271,
      "step": 170
    },
    {
      "epoch": 0.04614066310724409,
      "grad_norm": 0.8828125,
      "learning_rate": 0.00019551392004222193,
      "loss": 0.0175,
      "step": 175
    },
    {
      "epoch": 0.04745896776745106,
      "grad_norm": 0.0152587890625,
      "learning_rate": 0.000195381976514052,
      "loss": 0.0356,
      "step": 180
    },
    {
      "epoch": 0.04877727242765803,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.00019525003298588204,
      "loss": 0.0057,
      "step": 185
    },
    {
      "epoch": 0.050095577087865,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0001951180894577121,
      "loss": 0.0082,
      "step": 190
    },
    {
      "epoch": 0.05141388174807198,
      "grad_norm": 0.05029296875,
      "learning_rate": 0.00019498614592954215,
      "loss": 0.0178,
      "step": 195
    },
    {
      "epoch": 0.05273218640827895,
      "grad_norm": 0.0390625,
      "learning_rate": 0.00019485420240137222,
      "loss": 0.0789,
      "step": 200
    },
    {
      "epoch": 0.054050491068485924,
      "grad_norm": 0.5625,
      "learning_rate": 0.0001947222588732023,
      "loss": 0.0645,
      "step": 205
    },
    {
      "epoch": 0.0553687957286929,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00019459031534503233,
      "loss": 0.116,
      "step": 210
    },
    {
      "epoch": 0.056687100388899875,
      "grad_norm": 0.55078125,
      "learning_rate": 0.0001944583718168624,
      "loss": 0.0516,
      "step": 215
    },
    {
      "epoch": 0.058005405049106847,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00019432642828869244,
      "loss": 0.1019,
      "step": 220
    },
    {
      "epoch": 0.059323709709313825,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.0001941944847605225,
      "loss": 0.0529,
      "step": 225
    },
    {
      "epoch": 0.0606420143695208,
      "grad_norm": 0.4921875,
      "learning_rate": 0.00019406254123235256,
      "loss": 0.0368,
      "step": 230
    },
    {
      "epoch": 0.06196031902972777,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.00019393059770418262,
      "loss": 0.037,
      "step": 235
    },
    {
      "epoch": 0.06327862368993474,
      "grad_norm": 0.008544921875,
      "learning_rate": 0.0001937986541760127,
      "loss": 0.0324,
      "step": 240
    },
    {
      "epoch": 0.06459692835014172,
      "grad_norm": 1.5,
      "learning_rate": 0.00019366671064784274,
      "loss": 0.0334,
      "step": 245
    },
    {
      "epoch": 0.0659152330103487,
      "grad_norm": 0.2109375,
      "learning_rate": 0.0001935347671196728,
      "loss": 0.0671,
      "step": 250
    },
    {
      "epoch": 0.06723353767055566,
      "grad_norm": 2.0625,
      "learning_rate": 0.00019340282359150285,
      "loss": 0.1559,
      "step": 255
    },
    {
      "epoch": 0.06855184233076264,
      "grad_norm": 0.7734375,
      "learning_rate": 0.0001932708800633329,
      "loss": 0.0198,
      "step": 260
    },
    {
      "epoch": 0.06987014699096962,
      "grad_norm": 0.42578125,
      "learning_rate": 0.00019313893653516296,
      "loss": 0.0151,
      "step": 265
    },
    {
      "epoch": 0.07118845165117658,
      "grad_norm": 0.1884765625,
      "learning_rate": 0.000193006993006993,
      "loss": 0.0269,
      "step": 270
    },
    {
      "epoch": 0.07250675631138356,
      "grad_norm": 1.546875,
      "learning_rate": 0.00019287504947882307,
      "loss": 0.0565,
      "step": 275
    },
    {
      "epoch": 0.07382506097159053,
      "grad_norm": 0.5078125,
      "learning_rate": 0.0001927431059506531,
      "loss": 0.0942,
      "step": 280
    },
    {
      "epoch": 0.0751433656317975,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00019261116242248318,
      "loss": 0.0061,
      "step": 285
    },
    {
      "epoch": 0.07646167029200449,
      "grad_norm": 1.9140625,
      "learning_rate": 0.00019247921889431325,
      "loss": 0.0497,
      "step": 290
    },
    {
      "epoch": 0.07777997495221145,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0001923472753661433,
      "loss": 0.0573,
      "step": 295
    },
    {
      "epoch": 0.07909827961241843,
      "grad_norm": 1.046875,
      "learning_rate": 0.00019221533183797336,
      "loss": 0.0528,
      "step": 300
    },
    {
      "epoch": 0.08041658427262541,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0001920833883098034,
      "loss": 0.0506,
      "step": 305
    },
    {
      "epoch": 0.08173488893283237,
      "grad_norm": 0.08203125,
      "learning_rate": 0.00019195144478163347,
      "loss": 0.0307,
      "step": 310
    },
    {
      "epoch": 0.08305319359303935,
      "grad_norm": 0.111328125,
      "learning_rate": 0.00019181950125346354,
      "loss": 0.0365,
      "step": 315
    },
    {
      "epoch": 0.08437149825324633,
      "grad_norm": 1.2890625,
      "learning_rate": 0.00019168755772529358,
      "loss": 0.0447,
      "step": 320
    },
    {
      "epoch": 0.0856898029134533,
      "grad_norm": 0.6015625,
      "learning_rate": 0.00019155561419712365,
      "loss": 0.0605,
      "step": 325
    },
    {
      "epoch": 0.08700810757366027,
      "grad_norm": 0.71875,
      "learning_rate": 0.0001914236706689537,
      "loss": 0.0846,
      "step": 330
    },
    {
      "epoch": 0.08832641223386725,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.00019129172714078376,
      "loss": 0.0713,
      "step": 335
    },
    {
      "epoch": 0.08964471689407422,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.0001911597836126138,
      "loss": 0.0826,
      "step": 340
    },
    {
      "epoch": 0.0909630215542812,
      "grad_norm": 2.203125,
      "learning_rate": 0.00019102784008444388,
      "loss": 0.0441,
      "step": 345
    },
    {
      "epoch": 0.09228132621448817,
      "grad_norm": 1.21875,
      "learning_rate": 0.00019089589655627395,
      "loss": 0.1378,
      "step": 350
    },
    {
      "epoch": 0.09359963087469514,
      "grad_norm": 3.0625,
      "learning_rate": 0.00019076395302810396,
      "loss": 0.1552,
      "step": 355
    },
    {
      "epoch": 0.09491793553490212,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00019063200949993403,
      "loss": 0.0458,
      "step": 360
    },
    {
      "epoch": 0.0962362401951091,
      "grad_norm": 0.71875,
      "learning_rate": 0.0001905000659717641,
      "loss": 0.0312,
      "step": 365
    },
    {
      "epoch": 0.09755454485531606,
      "grad_norm": 0.0218505859375,
      "learning_rate": 0.00019036812244359414,
      "loss": 0.0247,
      "step": 370
    },
    {
      "epoch": 0.09887284951552304,
      "grad_norm": 0.064453125,
      "learning_rate": 0.0001902361789154242,
      "loss": 0.054,
      "step": 375
    },
    {
      "epoch": 0.10019115417573,
      "grad_norm": 0.021240234375,
      "learning_rate": 0.00019010423538725425,
      "loss": 0.0023,
      "step": 380
    },
    {
      "epoch": 0.10150945883593698,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.00018997229185908432,
      "loss": 0.0884,
      "step": 385
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 1.703125,
      "learning_rate": 0.00018984034833091436,
      "loss": 0.0506,
      "step": 390
    },
    {
      "epoch": 0.10414606815635093,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.00018970840480274443,
      "loss": 0.1123,
      "step": 395
    },
    {
      "epoch": 0.1054643728165579,
      "grad_norm": 0.6953125,
      "learning_rate": 0.0001895764612745745,
      "loss": 0.0597,
      "step": 400
    },
    {
      "epoch": 0.10678267747676488,
      "grad_norm": 0.18359375,
      "learning_rate": 0.00018944451774640454,
      "loss": 0.0138,
      "step": 405
    },
    {
      "epoch": 0.10810098213697185,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.0001893125742182346,
      "loss": 0.0249,
      "step": 410
    },
    {
      "epoch": 0.10941928679717883,
      "grad_norm": 0.00970458984375,
      "learning_rate": 0.00018918063069006466,
      "loss": 0.0084,
      "step": 415
    },
    {
      "epoch": 0.1107375914573858,
      "grad_norm": 0.54296875,
      "learning_rate": 0.00018904868716189472,
      "loss": 0.0541,
      "step": 420
    },
    {
      "epoch": 0.11205589611759277,
      "grad_norm": 0.74609375,
      "learning_rate": 0.00018891674363372477,
      "loss": 0.007,
      "step": 425
    },
    {
      "epoch": 0.11337420077779975,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.00018878480010555484,
      "loss": 0.0875,
      "step": 430
    },
    {
      "epoch": 0.11469250543800673,
      "grad_norm": 0.9296875,
      "learning_rate": 0.0001886528565773849,
      "loss": 0.1207,
      "step": 435
    },
    {
      "epoch": 0.11601081009821369,
      "grad_norm": 1.2734375,
      "learning_rate": 0.00018852091304921495,
      "loss": 0.1143,
      "step": 440
    },
    {
      "epoch": 0.11732911475842067,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00018838896952104502,
      "loss": 0.0393,
      "step": 445
    },
    {
      "epoch": 0.11864741941862765,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.00018825702599287506,
      "loss": 0.02,
      "step": 450
    },
    {
      "epoch": 0.11996572407883462,
      "grad_norm": 0.486328125,
      "learning_rate": 0.0001881250824647051,
      "loss": 0.0891,
      "step": 455
    },
    {
      "epoch": 0.1212840287390416,
      "grad_norm": 1.0,
      "learning_rate": 0.00018799313893653517,
      "loss": 0.0469,
      "step": 460
    },
    {
      "epoch": 0.12260233339924857,
      "grad_norm": 0.2099609375,
      "learning_rate": 0.0001878611954083652,
      "loss": 0.019,
      "step": 465
    },
    {
      "epoch": 0.12392063805945554,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.00018772925188019528,
      "loss": 0.007,
      "step": 470
    },
    {
      "epoch": 0.12523894271966252,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.00018759730835202532,
      "loss": 0.0039,
      "step": 475
    },
    {
      "epoch": 0.12655724737986948,
      "grad_norm": 0.014404296875,
      "learning_rate": 0.0001874653648238554,
      "loss": 0.0043,
      "step": 480
    },
    {
      "epoch": 0.12787555204007647,
      "grad_norm": 0.51953125,
      "learning_rate": 0.00018733342129568546,
      "loss": 0.1326,
      "step": 485
    },
    {
      "epoch": 0.12919385670028344,
      "grad_norm": 0.99609375,
      "learning_rate": 0.0001872014777675155,
      "loss": 0.0369,
      "step": 490
    },
    {
      "epoch": 0.1305121613604904,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00018706953423934557,
      "loss": 0.0395,
      "step": 495
    },
    {
      "epoch": 0.1318304660206974,
      "grad_norm": 0.083984375,
      "learning_rate": 0.00018693759071117561,
      "loss": 0.0284,
      "step": 500
    },
    {
      "epoch": 0.1318304660206974,
      "eval_loss": 0.04542969539761543,
      "eval_model_preparation_time": 0.0076,
      "eval_runtime": 457.5293,
      "eval_samples_per_second": 7.37,
      "eval_steps_per_second": 3.685,
      "step": 500
    },
    {
      "epoch": 0.13314877068090436,
      "grad_norm": 0.0291748046875,
      "learning_rate": 0.00018680564718300568,
      "loss": 0.0533,
      "step": 505
    },
    {
      "epoch": 0.13446707534111133,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00018667370365483575,
      "loss": 0.0183,
      "step": 510
    },
    {
      "epoch": 0.13578538000131832,
      "grad_norm": 0.018798828125,
      "learning_rate": 0.0001865417601266658,
      "loss": 0.0473,
      "step": 515
    },
    {
      "epoch": 0.13710368466152528,
      "grad_norm": 0.388671875,
      "learning_rate": 0.00018640981659849586,
      "loss": 0.0562,
      "step": 520
    },
    {
      "epoch": 0.13842198932173225,
      "grad_norm": 0.77734375,
      "learning_rate": 0.0001862778730703259,
      "loss": 0.0755,
      "step": 525
    },
    {
      "epoch": 0.13974029398193924,
      "grad_norm": 2.8125,
      "learning_rate": 0.00018614592954215598,
      "loss": 0.0422,
      "step": 530
    },
    {
      "epoch": 0.1410585986421462,
      "grad_norm": 0.48828125,
      "learning_rate": 0.00018601398601398602,
      "loss": 0.0882,
      "step": 535
    },
    {
      "epoch": 0.14237690330235317,
      "grad_norm": 0.16015625,
      "learning_rate": 0.0001858820424858161,
      "loss": 0.0131,
      "step": 540
    },
    {
      "epoch": 0.14369520796256013,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00018575009895764616,
      "loss": 0.03,
      "step": 545
    },
    {
      "epoch": 0.14501351262276713,
      "grad_norm": 0.0120849609375,
      "learning_rate": 0.0001856181554294762,
      "loss": 0.0425,
      "step": 550
    },
    {
      "epoch": 0.1463318172829741,
      "grad_norm": 0.390625,
      "learning_rate": 0.00018548621190130624,
      "loss": 0.011,
      "step": 555
    },
    {
      "epoch": 0.14765012194318106,
      "grad_norm": 1.9609375,
      "learning_rate": 0.0001853542683731363,
      "loss": 0.0807,
      "step": 560
    },
    {
      "epoch": 0.14896842660338805,
      "grad_norm": 0.609375,
      "learning_rate": 0.00018522232484496635,
      "loss": 0.0278,
      "step": 565
    },
    {
      "epoch": 0.150286731263595,
      "grad_norm": 0.087890625,
      "learning_rate": 0.00018509038131679642,
      "loss": 0.0484,
      "step": 570
    },
    {
      "epoch": 0.15160503592380198,
      "grad_norm": 0.5078125,
      "learning_rate": 0.00018495843778862646,
      "loss": 0.1277,
      "step": 575
    },
    {
      "epoch": 0.15292334058400897,
      "grad_norm": 0.8125,
      "learning_rate": 0.00018482649426045653,
      "loss": 0.058,
      "step": 580
    },
    {
      "epoch": 0.15424164524421594,
      "grad_norm": 0.22265625,
      "learning_rate": 0.00018469455073228657,
      "loss": 0.0259,
      "step": 585
    },
    {
      "epoch": 0.1555599499044229,
      "grad_norm": 1.8984375,
      "learning_rate": 0.00018456260720411664,
      "loss": 0.113,
      "step": 590
    },
    {
      "epoch": 0.1568782545646299,
      "grad_norm": 0.12451171875,
      "learning_rate": 0.0001844306636759467,
      "loss": 0.0312,
      "step": 595
    },
    {
      "epoch": 0.15819655922483686,
      "grad_norm": 0.0322265625,
      "learning_rate": 0.00018429872014777676,
      "loss": 0.0476,
      "step": 600
    },
    {
      "epoch": 0.15951486388504382,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.00018416677661960682,
      "loss": 0.0232,
      "step": 605
    },
    {
      "epoch": 0.16083316854525082,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00018403483309143687,
      "loss": 0.1287,
      "step": 610
    },
    {
      "epoch": 0.16215147320545778,
      "grad_norm": 0.765625,
      "learning_rate": 0.00018390288956326694,
      "loss": 0.0991,
      "step": 615
    },
    {
      "epoch": 0.16346977786566474,
      "grad_norm": 0.3125,
      "learning_rate": 0.00018377094603509698,
      "loss": 0.0247,
      "step": 620
    },
    {
      "epoch": 0.16478808252587174,
      "grad_norm": 0.37890625,
      "learning_rate": 0.00018363900250692705,
      "loss": 0.0632,
      "step": 625
    },
    {
      "epoch": 0.1661063871860787,
      "grad_norm": 0.1494140625,
      "learning_rate": 0.00018350705897875712,
      "loss": 0.0314,
      "step": 630
    },
    {
      "epoch": 0.16742469184628567,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.00018337511545058716,
      "loss": 0.0425,
      "step": 635
    },
    {
      "epoch": 0.16874299650649266,
      "grad_norm": 0.396484375,
      "learning_rate": 0.00018324317192241723,
      "loss": 0.0613,
      "step": 640
    },
    {
      "epoch": 0.17006130116669962,
      "grad_norm": 0.057373046875,
      "learning_rate": 0.00018311122839424727,
      "loss": 0.0569,
      "step": 645
    },
    {
      "epoch": 0.1713796058269066,
      "grad_norm": 0.001373291015625,
      "learning_rate": 0.00018297928486607734,
      "loss": 0.007,
      "step": 650
    },
    {
      "epoch": 0.17269791048711358,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00018284734133790738,
      "loss": 0.0189,
      "step": 655
    },
    {
      "epoch": 0.17401621514732055,
      "grad_norm": 0.6015625,
      "learning_rate": 0.00018271539780973742,
      "loss": 0.0601,
      "step": 660
    },
    {
      "epoch": 0.1753345198075275,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001825834542815675,
      "loss": 0.0211,
      "step": 665
    },
    {
      "epoch": 0.1766528244677345,
      "grad_norm": 2.6875,
      "learning_rate": 0.00018245151075339753,
      "loss": 0.0713,
      "step": 670
    },
    {
      "epoch": 0.17797112912794147,
      "grad_norm": 1.1875,
      "learning_rate": 0.0001823195672252276,
      "loss": 0.0522,
      "step": 675
    },
    {
      "epoch": 0.17928943378814843,
      "grad_norm": 0.025146484375,
      "learning_rate": 0.00018218762369705767,
      "loss": 0.0242,
      "step": 680
    },
    {
      "epoch": 0.18060773844835543,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.00018205568016888772,
      "loss": 0.0129,
      "step": 685
    },
    {
      "epoch": 0.1819260431085624,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.00018192373664071778,
      "loss": 0.0142,
      "step": 690
    },
    {
      "epoch": 0.18324434776876936,
      "grad_norm": 0.00830078125,
      "learning_rate": 0.00018179179311254783,
      "loss": 0.0121,
      "step": 695
    },
    {
      "epoch": 0.18456265242897635,
      "grad_norm": 0.53125,
      "learning_rate": 0.0001816598495843779,
      "loss": 0.0163,
      "step": 700
    },
    {
      "epoch": 0.1858809570891833,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00018152790605620796,
      "loss": 0.0203,
      "step": 705
    },
    {
      "epoch": 0.18719926174939028,
      "grad_norm": 1.2578125,
      "learning_rate": 0.000181395962528038,
      "loss": 0.1548,
      "step": 710
    },
    {
      "epoch": 0.18851756640959727,
      "grad_norm": 0.0247802734375,
      "learning_rate": 0.00018126401899986808,
      "loss": 0.0543,
      "step": 715
    },
    {
      "epoch": 0.18983587106980424,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.00018113207547169812,
      "loss": 0.0346,
      "step": 720
    },
    {
      "epoch": 0.1911541757300112,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001810001319435282,
      "loss": 0.03,
      "step": 725
    },
    {
      "epoch": 0.1924724803902182,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.00018086818841535823,
      "loss": 0.0796,
      "step": 730
    },
    {
      "epoch": 0.19379078505042516,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.0001807362448871883,
      "loss": 0.0662,
      "step": 735
    },
    {
      "epoch": 0.19510908971063212,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00018060430135901837,
      "loss": 0.0675,
      "step": 740
    },
    {
      "epoch": 0.19642739437083911,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.0001804723578308484,
      "loss": 0.0377,
      "step": 745
    },
    {
      "epoch": 0.19774569903104608,
      "grad_norm": 0.95703125,
      "learning_rate": 0.00018034041430267848,
      "loss": 0.0174,
      "step": 750
    },
    {
      "epoch": 0.19906400369125304,
      "grad_norm": 1.7890625,
      "learning_rate": 0.00018020847077450852,
      "loss": 0.0278,
      "step": 755
    },
    {
      "epoch": 0.20038230835146,
      "grad_norm": 0.8515625,
      "learning_rate": 0.00018007652724633856,
      "loss": 0.0113,
      "step": 760
    },
    {
      "epoch": 0.201700613011667,
      "grad_norm": 0.016845703125,
      "learning_rate": 0.00017994458371816863,
      "loss": 0.0589,
      "step": 765
    },
    {
      "epoch": 0.20301891767187397,
      "grad_norm": 0.01043701171875,
      "learning_rate": 0.00017981264018999867,
      "loss": 0.0203,
      "step": 770
    },
    {
      "epoch": 0.20433722233208093,
      "grad_norm": 0.0242919921875,
      "learning_rate": 0.00017968069666182874,
      "loss": 0.0494,
      "step": 775
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 0.56640625,
      "learning_rate": 0.00017954875313365879,
      "loss": 0.0394,
      "step": 780
    },
    {
      "epoch": 0.2069738316524949,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.00017941680960548886,
      "loss": 0.0848,
      "step": 785
    },
    {
      "epoch": 0.20829213631270185,
      "grad_norm": 0.40234375,
      "learning_rate": 0.00017928486607731892,
      "loss": 0.0464,
      "step": 790
    },
    {
      "epoch": 0.20961044097290885,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.00017915292254914897,
      "loss": 0.0222,
      "step": 795
    },
    {
      "epoch": 0.2109287456331158,
      "grad_norm": 0.5390625,
      "learning_rate": 0.00017902097902097904,
      "loss": 0.0434,
      "step": 800
    },
    {
      "epoch": 0.21224705029332278,
      "grad_norm": 1.390625,
      "learning_rate": 0.00017888903549280908,
      "loss": 0.0222,
      "step": 805
    },
    {
      "epoch": 0.21356535495352977,
      "grad_norm": 0.0272216796875,
      "learning_rate": 0.00017875709196463915,
      "loss": 0.0099,
      "step": 810
    },
    {
      "epoch": 0.21488365961373673,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.0001786251484364692,
      "loss": 0.0086,
      "step": 815
    },
    {
      "epoch": 0.2162019642739437,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.00017849320490829926,
      "loss": 0.0715,
      "step": 820
    },
    {
      "epoch": 0.2175202689341507,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00017836126138012933,
      "loss": 0.0642,
      "step": 825
    },
    {
      "epoch": 0.21883857359435765,
      "grad_norm": 0.01519775390625,
      "learning_rate": 0.00017822931785195937,
      "loss": 0.0111,
      "step": 830
    },
    {
      "epoch": 0.22015687825456462,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00017809737432378944,
      "loss": 0.0518,
      "step": 835
    },
    {
      "epoch": 0.2214751829147716,
      "grad_norm": 0.00921630859375,
      "learning_rate": 0.00017796543079561948,
      "loss": 0.0384,
      "step": 840
    },
    {
      "epoch": 0.22279348757497858,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00017783348726744955,
      "loss": 0.0204,
      "step": 845
    },
    {
      "epoch": 0.22411179223518554,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017770154373927962,
      "loss": 0.0075,
      "step": 850
    },
    {
      "epoch": 0.22543009689539253,
      "grad_norm": 0.033203125,
      "learning_rate": 0.00017756960021110963,
      "loss": 0.0895,
      "step": 855
    },
    {
      "epoch": 0.2267484015555995,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001774376566829397,
      "loss": 0.1039,
      "step": 860
    },
    {
      "epoch": 0.22806670621580646,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00017730571315476975,
      "loss": 0.0125,
      "step": 865
    },
    {
      "epoch": 0.22938501087601346,
      "grad_norm": 0.5859375,
      "learning_rate": 0.00017717376962659982,
      "loss": 0.0381,
      "step": 870
    },
    {
      "epoch": 0.23070331553622042,
      "grad_norm": 0.029052734375,
      "learning_rate": 0.00017704182609842988,
      "loss": 0.0434,
      "step": 875
    },
    {
      "epoch": 0.23202162019642739,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00017690988257025993,
      "loss": 0.0799,
      "step": 880
    },
    {
      "epoch": 0.23333992485663438,
      "grad_norm": 0.04150390625,
      "learning_rate": 0.00017677793904209,
      "loss": 0.0692,
      "step": 885
    },
    {
      "epoch": 0.23465822951684134,
      "grad_norm": 0.435546875,
      "learning_rate": 0.00017664599551392004,
      "loss": 0.0544,
      "step": 890
    },
    {
      "epoch": 0.2359765341770483,
      "grad_norm": 1.171875,
      "learning_rate": 0.0001765140519857501,
      "loss": 0.0619,
      "step": 895
    },
    {
      "epoch": 0.2372948388372553,
      "grad_norm": 0.01263427734375,
      "learning_rate": 0.00017638210845758018,
      "loss": 0.0418,
      "step": 900
    },
    {
      "epoch": 0.23861314349746227,
      "grad_norm": 0.017578125,
      "learning_rate": 0.00017625016492941022,
      "loss": 0.0195,
      "step": 905
    },
    {
      "epoch": 0.23993144815766923,
      "grad_norm": 0.6171875,
      "learning_rate": 0.0001761182214012403,
      "loss": 0.067,
      "step": 910
    },
    {
      "epoch": 0.24124975281787622,
      "grad_norm": 0.59765625,
      "learning_rate": 0.00017598627787307033,
      "loss": 0.049,
      "step": 915
    },
    {
      "epoch": 0.2425680574780832,
      "grad_norm": 1.2421875,
      "learning_rate": 0.0001758543343449004,
      "loss": 0.0539,
      "step": 920
    },
    {
      "epoch": 0.24388636213829015,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.00017572239081673044,
      "loss": 0.0725,
      "step": 925
    },
    {
      "epoch": 0.24520466679849715,
      "grad_norm": 0.330078125,
      "learning_rate": 0.0001755904472885605,
      "loss": 0.064,
      "step": 930
    },
    {
      "epoch": 0.2465229714587041,
      "grad_norm": 0.220703125,
      "learning_rate": 0.00017545850376039058,
      "loss": 0.0271,
      "step": 935
    },
    {
      "epoch": 0.24784127611891107,
      "grad_norm": 0.01470947265625,
      "learning_rate": 0.00017532656023222062,
      "loss": 0.0247,
      "step": 940
    },
    {
      "epoch": 0.24915958077911807,
      "grad_norm": 0.013427734375,
      "learning_rate": 0.0001751946167040507,
      "loss": 0.017,
      "step": 945
    },
    {
      "epoch": 0.25047788543932503,
      "grad_norm": 0.58984375,
      "learning_rate": 0.00017506267317588073,
      "loss": 0.0254,
      "step": 950
    },
    {
      "epoch": 0.251796190099532,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00017493072964771078,
      "loss": 0.0186,
      "step": 955
    },
    {
      "epoch": 0.25311449475973896,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00017479878611954084,
      "loss": 0.0617,
      "step": 960
    },
    {
      "epoch": 0.25443279941994595,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00017466684259137089,
      "loss": 0.0173,
      "step": 965
    },
    {
      "epoch": 0.25575110408015295,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00017453489906320096,
      "loss": 0.0512,
      "step": 970
    },
    {
      "epoch": 0.2570694087403599,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.000174402955535031,
      "loss": 0.0361,
      "step": 975
    },
    {
      "epoch": 0.2583877134005669,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00017427101200686107,
      "loss": 0.0175,
      "step": 980
    },
    {
      "epoch": 0.25970601806077387,
      "grad_norm": 0.77734375,
      "learning_rate": 0.00017413906847869114,
      "loss": 0.0139,
      "step": 985
    },
    {
      "epoch": 0.2610243227209808,
      "grad_norm": 0.515625,
      "learning_rate": 0.00017400712495052118,
      "loss": 0.0948,
      "step": 990
    },
    {
      "epoch": 0.2623426273811878,
      "grad_norm": 1.421875,
      "learning_rate": 0.00017387518142235125,
      "loss": 0.0406,
      "step": 995
    },
    {
      "epoch": 0.2636609320413948,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.0001737432378941813,
      "loss": 0.1011,
      "step": 1000
    },
    {
      "epoch": 0.2636609320413948,
      "eval_loss": 0.045552924275398254,
      "eval_model_preparation_time": 0.0076,
      "eval_runtime": 457.6113,
      "eval_samples_per_second": 7.369,
      "eval_steps_per_second": 3.684,
      "step": 1000
    },
    {
      "epoch": 0.26497923670160173,
      "grad_norm": 0.380859375,
      "learning_rate": 0.00017361129436601136,
      "loss": 0.0711,
      "step": 1005
    },
    {
      "epoch": 0.2662975413618087,
      "grad_norm": 0.0208740234375,
      "learning_rate": 0.00017347935083784143,
      "loss": 0.0218,
      "step": 1010
    },
    {
      "epoch": 0.2676158460220157,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00017334740730967147,
      "loss": 0.0301,
      "step": 1015
    },
    {
      "epoch": 0.26893415068222265,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00017321546378150154,
      "loss": 0.0721,
      "step": 1020
    },
    {
      "epoch": 0.27025245534242964,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00017308352025333158,
      "loss": 0.0363,
      "step": 1025
    },
    {
      "epoch": 0.27157076000263664,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00017295157672516165,
      "loss": 0.0313,
      "step": 1030
    },
    {
      "epoch": 0.2728890646628436,
      "grad_norm": 0.0211181640625,
      "learning_rate": 0.0001728196331969917,
      "loss": 0.0385,
      "step": 1035
    },
    {
      "epoch": 0.27420736932305056,
      "grad_norm": 0.00787353515625,
      "learning_rate": 0.00017268768966882176,
      "loss": 0.0405,
      "step": 1040
    },
    {
      "epoch": 0.27552567398325756,
      "grad_norm": 0.484375,
      "learning_rate": 0.00017255574614065183,
      "loss": 0.0616,
      "step": 1045
    },
    {
      "epoch": 0.2768439786434645,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.00017242380261248185,
      "loss": 0.0057,
      "step": 1050
    },
    {
      "epoch": 0.2781622833036715,
      "grad_norm": 0.1904296875,
      "learning_rate": 0.00017229185908431192,
      "loss": 0.0417,
      "step": 1055
    },
    {
      "epoch": 0.2794805879638785,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00017215991555614196,
      "loss": 0.0346,
      "step": 1060
    },
    {
      "epoch": 0.2807988926240854,
      "grad_norm": 0.016357421875,
      "learning_rate": 0.00017202797202797203,
      "loss": 0.0295,
      "step": 1065
    },
    {
      "epoch": 0.2821171972842924,
      "grad_norm": 0.490234375,
      "learning_rate": 0.0001718960284998021,
      "loss": 0.0448,
      "step": 1070
    },
    {
      "epoch": 0.28343550194449935,
      "grad_norm": 0.004241943359375,
      "learning_rate": 0.00017176408497163214,
      "loss": 0.0051,
      "step": 1075
    },
    {
      "epoch": 0.28475380660470634,
      "grad_norm": 0.01904296875,
      "learning_rate": 0.0001716321414434622,
      "loss": 0.0894,
      "step": 1080
    },
    {
      "epoch": 0.28607211126491333,
      "grad_norm": 0.83984375,
      "learning_rate": 0.00017150019791529225,
      "loss": 0.0288,
      "step": 1085
    },
    {
      "epoch": 0.28739041592512027,
      "grad_norm": 0.2021484375,
      "learning_rate": 0.00017136825438712232,
      "loss": 0.0222,
      "step": 1090
    },
    {
      "epoch": 0.28870872058532726,
      "grad_norm": 0.322265625,
      "learning_rate": 0.0001712363108589524,
      "loss": 0.0444,
      "step": 1095
    },
    {
      "epoch": 0.29002702524553425,
      "grad_norm": 0.408203125,
      "learning_rate": 0.00017110436733078243,
      "loss": 0.0828,
      "step": 1100
    },
    {
      "epoch": 0.2913453299057412,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0001709724238026125,
      "loss": 0.0725,
      "step": 1105
    },
    {
      "epoch": 0.2926636345659482,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00017084048027444254,
      "loss": 0.0204,
      "step": 1110
    },
    {
      "epoch": 0.2939819392261552,
      "grad_norm": 0.67578125,
      "learning_rate": 0.0001707085367462726,
      "loss": 0.0503,
      "step": 1115
    },
    {
      "epoch": 0.2953002438863621,
      "grad_norm": 0.0059814453125,
      "learning_rate": 0.00017057659321810265,
      "loss": 0.0144,
      "step": 1120
    },
    {
      "epoch": 0.2966185485465691,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.00017044464968993272,
      "loss": 0.0044,
      "step": 1125
    },
    {
      "epoch": 0.2979368532067761,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.0001703127061617628,
      "loss": 0.013,
      "step": 1130
    },
    {
      "epoch": 0.29925515786698303,
      "grad_norm": 0.287109375,
      "learning_rate": 0.00017018076263359283,
      "loss": 0.0245,
      "step": 1135
    },
    {
      "epoch": 0.30057346252719,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0001700488191054229,
      "loss": 0.0247,
      "step": 1140
    },
    {
      "epoch": 0.301891767187397,
      "grad_norm": 0.40625,
      "learning_rate": 0.00016991687557725294,
      "loss": 0.0402,
      "step": 1145
    },
    {
      "epoch": 0.30321007184760396,
      "grad_norm": 1.2578125,
      "learning_rate": 0.000169784932049083,
      "loss": 0.0071,
      "step": 1150
    },
    {
      "epoch": 0.30452837650781095,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00016965298852091306,
      "loss": 0.0177,
      "step": 1155
    },
    {
      "epoch": 0.30584668116801794,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.0001695210449927431,
      "loss": 0.0029,
      "step": 1160
    },
    {
      "epoch": 0.3071649858282249,
      "grad_norm": 0.455078125,
      "learning_rate": 0.00016938910146457317,
      "loss": 0.0262,
      "step": 1165
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 0.002655029296875,
      "learning_rate": 0.0001692571579364032,
      "loss": 0.0346,
      "step": 1170
    },
    {
      "epoch": 0.30980159514863886,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.00016912521440823328,
      "loss": 0.0494,
      "step": 1175
    },
    {
      "epoch": 0.3111198998088458,
      "grad_norm": 1.4609375,
      "learning_rate": 0.00016899327088006335,
      "loss": 0.0603,
      "step": 1180
    },
    {
      "epoch": 0.3124382044690528,
      "grad_norm": 0.1572265625,
      "learning_rate": 0.0001688613273518934,
      "loss": 0.0366,
      "step": 1185
    },
    {
      "epoch": 0.3137565091292598,
      "grad_norm": 0.01422119140625,
      "learning_rate": 0.00016872938382372346,
      "loss": 0.0678,
      "step": 1190
    },
    {
      "epoch": 0.3150748137894667,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.0001685974402955535,
      "loss": 0.0359,
      "step": 1195
    },
    {
      "epoch": 0.3163931184496737,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00016846549676738357,
      "loss": 0.1099,
      "step": 1200
    },
    {
      "epoch": 0.3177114231098807,
      "grad_norm": 0.212890625,
      "learning_rate": 0.00016833355323921364,
      "loss": 0.0343,
      "step": 1205
    },
    {
      "epoch": 0.31902972777008765,
      "grad_norm": 0.0302734375,
      "learning_rate": 0.00016820160971104368,
      "loss": 0.0138,
      "step": 1210
    },
    {
      "epoch": 0.32034803243029464,
      "grad_norm": 0.016845703125,
      "learning_rate": 0.00016806966618287375,
      "loss": 0.0202,
      "step": 1215
    },
    {
      "epoch": 0.32166633709050163,
      "grad_norm": 0.1474609375,
      "learning_rate": 0.0001679377226547038,
      "loss": 0.0442,
      "step": 1220
    },
    {
      "epoch": 0.32298464175070857,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.00016780577912653386,
      "loss": 0.0375,
      "step": 1225
    },
    {
      "epoch": 0.32430294641091556,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.0001676738355983639,
      "loss": 0.01,
      "step": 1230
    },
    {
      "epoch": 0.32562125107112255,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.00016754189207019397,
      "loss": 0.0139,
      "step": 1235
    },
    {
      "epoch": 0.3269395557313295,
      "grad_norm": 0.09228515625,
      "learning_rate": 0.00016740994854202404,
      "loss": 0.014,
      "step": 1240
    },
    {
      "epoch": 0.3282578603915365,
      "grad_norm": 0.47265625,
      "learning_rate": 0.00016727800501385408,
      "loss": 0.1546,
      "step": 1245
    },
    {
      "epoch": 0.3295761650517435,
      "grad_norm": 0.02294921875,
      "learning_rate": 0.00016714606148568413,
      "loss": 0.0803,
      "step": 1250
    },
    {
      "epoch": 0.3308944697119504,
      "grad_norm": 0.185546875,
      "learning_rate": 0.00016701411795751417,
      "loss": 0.0376,
      "step": 1255
    },
    {
      "epoch": 0.3322127743721574,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.00016688217442934424,
      "loss": 0.0375,
      "step": 1260
    },
    {
      "epoch": 0.3335310790323644,
      "grad_norm": 1.03125,
      "learning_rate": 0.0001667502309011743,
      "loss": 0.0442,
      "step": 1265
    },
    {
      "epoch": 0.33484938369257133,
      "grad_norm": 0.0172119140625,
      "learning_rate": 0.00016661828737300435,
      "loss": 0.0261,
      "step": 1270
    },
    {
      "epoch": 0.3361676883527783,
      "grad_norm": 0.42578125,
      "learning_rate": 0.00016648634384483442,
      "loss": 0.0553,
      "step": 1275
    },
    {
      "epoch": 0.3374859930129853,
      "grad_norm": 0.1328125,
      "learning_rate": 0.00016635440031666446,
      "loss": 0.0065,
      "step": 1280
    },
    {
      "epoch": 0.33880429767319226,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00016622245678849453,
      "loss": 0.0527,
      "step": 1285
    },
    {
      "epoch": 0.34012260233339925,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0001660905132603246,
      "loss": 0.0297,
      "step": 1290
    },
    {
      "epoch": 0.34144090699360624,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.00016595856973215464,
      "loss": 0.0477,
      "step": 1295
    },
    {
      "epoch": 0.3427592116538132,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001658266262039847,
      "loss": 0.0298,
      "step": 1300
    },
    {
      "epoch": 0.34407751631402017,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.00016569468267581475,
      "loss": 0.0481,
      "step": 1305
    },
    {
      "epoch": 0.34539582097422716,
      "grad_norm": 0.06640625,
      "learning_rate": 0.00016556273914764482,
      "loss": 0.0153,
      "step": 1310
    },
    {
      "epoch": 0.3467141256344341,
      "grad_norm": 0.00592041015625,
      "learning_rate": 0.00016543079561947486,
      "loss": 0.0111,
      "step": 1315
    },
    {
      "epoch": 0.3480324302946411,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.00016529885209130493,
      "loss": 0.0309,
      "step": 1320
    },
    {
      "epoch": 0.3493507349548481,
      "grad_norm": 0.0198974609375,
      "learning_rate": 0.000165166908563135,
      "loss": 0.0579,
      "step": 1325
    },
    {
      "epoch": 0.350669039615055,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.00016503496503496504,
      "loss": 0.0055,
      "step": 1330
    },
    {
      "epoch": 0.351987344275262,
      "grad_norm": 0.71875,
      "learning_rate": 0.00016490302150679511,
      "loss": 0.0299,
      "step": 1335
    },
    {
      "epoch": 0.353305648935469,
      "grad_norm": 0.01348876953125,
      "learning_rate": 0.00016477107797862516,
      "loss": 0.0943,
      "step": 1340
    },
    {
      "epoch": 0.35462395359567594,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00016463913445045523,
      "loss": 0.0216,
      "step": 1345
    },
    {
      "epoch": 0.35594225825588294,
      "grad_norm": 0.02392578125,
      "learning_rate": 0.00016450719092228527,
      "loss": 0.0265,
      "step": 1350
    },
    {
      "epoch": 0.35726056291608993,
      "grad_norm": 0.453125,
      "learning_rate": 0.0001643752473941153,
      "loss": 0.0539,
      "step": 1355
    },
    {
      "epoch": 0.35857886757629687,
      "grad_norm": 0.00823974609375,
      "learning_rate": 0.00016424330386594538,
      "loss": 0.0139,
      "step": 1360
    },
    {
      "epoch": 0.35989717223650386,
      "grad_norm": 0.55859375,
      "learning_rate": 0.00016411136033777542,
      "loss": 0.0428,
      "step": 1365
    },
    {
      "epoch": 0.36121547689671085,
      "grad_norm": 0.052734375,
      "learning_rate": 0.0001639794168096055,
      "loss": 0.0346,
      "step": 1370
    },
    {
      "epoch": 0.3625337815569178,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.00016384747328143556,
      "loss": 0.0095,
      "step": 1375
    },
    {
      "epoch": 0.3638520862171248,
      "grad_norm": 0.0240478515625,
      "learning_rate": 0.0001637155297532656,
      "loss": 0.0224,
      "step": 1380
    },
    {
      "epoch": 0.3651703908773318,
      "grad_norm": 0.01318359375,
      "learning_rate": 0.00016358358622509567,
      "loss": 0.0316,
      "step": 1385
    },
    {
      "epoch": 0.3664886955375387,
      "grad_norm": 0.011962890625,
      "learning_rate": 0.0001634516426969257,
      "loss": 0.0051,
      "step": 1390
    },
    {
      "epoch": 0.3678070001977457,
      "grad_norm": 0.00396728515625,
      "learning_rate": 0.00016331969916875578,
      "loss": 0.038,
      "step": 1395
    },
    {
      "epoch": 0.3691253048579527,
      "grad_norm": 0.375,
      "learning_rate": 0.00016318775564058585,
      "loss": 0.029,
      "step": 1400
    },
    {
      "epoch": 0.37044360951815963,
      "grad_norm": 0.265625,
      "learning_rate": 0.0001630558121124159,
      "loss": 0.0072,
      "step": 1405
    },
    {
      "epoch": 0.3717619141783666,
      "grad_norm": 0.00127410888671875,
      "learning_rate": 0.00016292386858424596,
      "loss": 0.0381,
      "step": 1410
    },
    {
      "epoch": 0.3730802188385736,
      "grad_norm": 1.15625,
      "learning_rate": 0.000162791925056076,
      "loss": 0.0573,
      "step": 1415
    },
    {
      "epoch": 0.37439852349878056,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.00016265998152790607,
      "loss": 0.051,
      "step": 1420
    },
    {
      "epoch": 0.37571682815898755,
      "grad_norm": 0.0015106201171875,
      "learning_rate": 0.00016252803799973612,
      "loss": 0.0239,
      "step": 1425
    },
    {
      "epoch": 0.37703513281919454,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00016239609447156618,
      "loss": 0.0165,
      "step": 1430
    },
    {
      "epoch": 0.3783534374794015,
      "grad_norm": 0.006134033203125,
      "learning_rate": 0.00016226415094339625,
      "loss": 0.0071,
      "step": 1435
    },
    {
      "epoch": 0.37967174213960847,
      "grad_norm": 2.828125,
      "learning_rate": 0.0001621322074152263,
      "loss": 0.0272,
      "step": 1440
    },
    {
      "epoch": 0.38099004679981546,
      "grad_norm": 0.349609375,
      "learning_rate": 0.00016200026388705637,
      "loss": 0.0647,
      "step": 1445
    },
    {
      "epoch": 0.3823083514600224,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.00016186832035888638,
      "loss": 0.0262,
      "step": 1450
    },
    {
      "epoch": 0.3836266561202294,
      "grad_norm": 0.041015625,
      "learning_rate": 0.00016173637683071645,
      "loss": 0.0576,
      "step": 1455
    },
    {
      "epoch": 0.3849449607804364,
      "grad_norm": 0.033935546875,
      "learning_rate": 0.00016160443330254652,
      "loss": 0.0142,
      "step": 1460
    },
    {
      "epoch": 0.3862632654406433,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.00016147248977437656,
      "loss": 0.0348,
      "step": 1465
    },
    {
      "epoch": 0.3875815701008503,
      "grad_norm": 2.390625,
      "learning_rate": 0.00016134054624620663,
      "loss": 0.0672,
      "step": 1470
    },
    {
      "epoch": 0.3888998747610573,
      "grad_norm": 0.439453125,
      "learning_rate": 0.00016120860271803667,
      "loss": 0.0121,
      "step": 1475
    },
    {
      "epoch": 0.39021817942126424,
      "grad_norm": 0.1298828125,
      "learning_rate": 0.00016107665918986674,
      "loss": 0.0114,
      "step": 1480
    },
    {
      "epoch": 0.39153648408147124,
      "grad_norm": 0.85546875,
      "learning_rate": 0.0001609447156616968,
      "loss": 0.0968,
      "step": 1485
    },
    {
      "epoch": 0.39285478874167823,
      "grad_norm": 0.703125,
      "learning_rate": 0.00016081277213352685,
      "loss": 0.0349,
      "step": 1490
    },
    {
      "epoch": 0.39417309340188517,
      "grad_norm": 0.021728515625,
      "learning_rate": 0.00016068082860535692,
      "loss": 0.0106,
      "step": 1495
    },
    {
      "epoch": 0.39549139806209216,
      "grad_norm": 0.7265625,
      "learning_rate": 0.00016054888507718696,
      "loss": 0.0225,
      "step": 1500
    },
    {
      "epoch": 0.39549139806209216,
      "eval_loss": 0.03515048325061798,
      "eval_model_preparation_time": 0.0076,
      "eval_runtime": 457.3497,
      "eval_samples_per_second": 7.373,
      "eval_steps_per_second": 3.686,
      "step": 1500
    },
    {
      "epoch": 0.3968097027222991,
      "grad_norm": 0.016519820317626,
      "learning_rate": 0.00016041694154901703,
      "loss": 0.0202,
      "step": 1505
    },
    {
      "epoch": 0.3981280073825061,
      "grad_norm": 0.8505942225456238,
      "learning_rate": 0.00016028499802084708,
      "loss": 0.0541,
      "step": 1510
    },
    {
      "epoch": 0.3994463120427131,
      "grad_norm": 0.04163295030593872,
      "learning_rate": 0.00016015305449267714,
      "loss": 0.0037,
      "step": 1515
    },
    {
      "epoch": 0.40076461670292,
      "grad_norm": 0.011332935653626919,
      "learning_rate": 0.00016002111096450721,
      "loss": 0.0459,
      "step": 1520
    },
    {
      "epoch": 0.402082921363127,
      "grad_norm": 0.9360129833221436,
      "learning_rate": 0.00015988916743633726,
      "loss": 0.013,
      "step": 1525
    },
    {
      "epoch": 0.403401226023334,
      "grad_norm": 0.11991436779499054,
      "learning_rate": 0.00015975722390816733,
      "loss": 0.0079,
      "step": 1530
    },
    {
      "epoch": 0.40471953068354094,
      "grad_norm": 0.36911076307296753,
      "learning_rate": 0.00015962528037999737,
      "loss": 0.0638,
      "step": 1535
    },
    {
      "epoch": 0.40603783534374793,
      "grad_norm": 0.020278634503483772,
      "learning_rate": 0.00015949333685182744,
      "loss": 0.0217,
      "step": 1540
    },
    {
      "epoch": 0.4073561400039549,
      "grad_norm": 0.14263059198856354,
      "learning_rate": 0.0001593613933236575,
      "loss": 0.0495,
      "step": 1545
    },
    {
      "epoch": 0.40867444466416186,
      "grad_norm": 0.09494803845882416,
      "learning_rate": 0.00015922944979548752,
      "loss": 0.0248,
      "step": 1550
    },
    {
      "epoch": 0.40999274932436885,
      "grad_norm": 0.23064319789409637,
      "learning_rate": 0.0001590975062673176,
      "loss": 0.0285,
      "step": 1555
    },
    {
      "epoch": 0.41131105398457585,
      "grad_norm": 0.32220256328582764,
      "learning_rate": 0.00015896556273914763,
      "loss": 0.0537,
      "step": 1560
    },
    {
      "epoch": 0.4126293586447828,
      "grad_norm": 0.41208815574645996,
      "learning_rate": 0.0001588336192109777,
      "loss": 0.0453,
      "step": 1565
    },
    {
      "epoch": 0.4139476633049898,
      "grad_norm": 0.03775424137711525,
      "learning_rate": 0.00015870167568280777,
      "loss": 0.0134,
      "step": 1570
    },
    {
      "epoch": 0.41526596796519677,
      "grad_norm": 0.6526333093643188,
      "learning_rate": 0.0001585697321546378,
      "loss": 0.0329,
      "step": 1575
    },
    {
      "epoch": 0.4165842726254037,
      "grad_norm": 1.001305103302002,
      "learning_rate": 0.00015843778862646788,
      "loss": 0.0912,
      "step": 1580
    },
    {
      "epoch": 0.4179025772856107,
      "grad_norm": 0.4055219888687134,
      "learning_rate": 0.00015830584509829792,
      "loss": 0.0519,
      "step": 1585
    },
    {
      "epoch": 0.4192208819458177,
      "grad_norm": 0.035015616565942764,
      "learning_rate": 0.000158173901570128,
      "loss": 0.0191,
      "step": 1590
    },
    {
      "epoch": 0.42053918660602463,
      "grad_norm": 0.09326844662427902,
      "learning_rate": 0.00015804195804195806,
      "loss": 0.0106,
      "step": 1595
    },
    {
      "epoch": 0.4218574912662316,
      "grad_norm": 0.06223440542817116,
      "learning_rate": 0.0001579100145137881,
      "loss": 0.0113,
      "step": 1600
    },
    {
      "epoch": 0.4231757959264386,
      "grad_norm": 0.0625135526061058,
      "learning_rate": 0.00015777807098561817,
      "loss": 0.0191,
      "step": 1605
    },
    {
      "epoch": 0.42449410058664555,
      "grad_norm": 0.2645983099937439,
      "learning_rate": 0.00015764612745744822,
      "loss": 0.0829,
      "step": 1610
    },
    {
      "epoch": 0.42581240524685254,
      "grad_norm": 0.009632415138185024,
      "learning_rate": 0.00015751418392927829,
      "loss": 0.0542,
      "step": 1615
    },
    {
      "epoch": 0.42713070990705954,
      "grad_norm": 0.01979319378733635,
      "learning_rate": 0.00015738224040110833,
      "loss": 0.0517,
      "step": 1620
    },
    {
      "epoch": 0.4284490145672665,
      "grad_norm": 0.3065454065799713,
      "learning_rate": 0.0001572502968729384,
      "loss": 0.0738,
      "step": 1625
    },
    {
      "epoch": 0.42976731922747347,
      "grad_norm": 0.09581473469734192,
      "learning_rate": 0.00015711835334476847,
      "loss": 0.0571,
      "step": 1630
    },
    {
      "epoch": 0.43108562388768046,
      "grad_norm": 0.23746591806411743,
      "learning_rate": 0.0001569864098165985,
      "loss": 0.0128,
      "step": 1635
    },
    {
      "epoch": 0.4324039285478874,
      "grad_norm": 0.936278760433197,
      "learning_rate": 0.00015685446628842858,
      "loss": 0.0665,
      "step": 1640
    },
    {
      "epoch": 0.4337222332080944,
      "grad_norm": 0.18487441539764404,
      "learning_rate": 0.00015672252276025862,
      "loss": 0.0527,
      "step": 1645
    },
    {
      "epoch": 0.4350405378683014,
      "grad_norm": 0.6980624794960022,
      "learning_rate": 0.00015659057923208866,
      "loss": 0.0613,
      "step": 1650
    },
    {
      "epoch": 0.4363588425285083,
      "grad_norm": 0.4696301221847534,
      "learning_rate": 0.00015645863570391873,
      "loss": 0.0569,
      "step": 1655
    },
    {
      "epoch": 0.4376771471887153,
      "grad_norm": 0.15083105862140656,
      "learning_rate": 0.00015632669217574877,
      "loss": 0.0394,
      "step": 1660
    },
    {
      "epoch": 0.4389954518489223,
      "grad_norm": 0.44701239466667175,
      "learning_rate": 0.00015619474864757884,
      "loss": 0.0494,
      "step": 1665
    },
    {
      "epoch": 0.44031375650912924,
      "grad_norm": 0.07418403029441833,
      "learning_rate": 0.00015606280511940888,
      "loss": 0.0291,
      "step": 1670
    },
    {
      "epoch": 0.44163206116933623,
      "grad_norm": 0.02311861515045166,
      "learning_rate": 0.00015593086159123895,
      "loss": 0.0304,
      "step": 1675
    },
    {
      "epoch": 0.4429503658295432,
      "grad_norm": 0.4416038990020752,
      "learning_rate": 0.00015579891806306902,
      "loss": 0.0176,
      "step": 1680
    },
    {
      "epoch": 0.44426867048975016,
      "grad_norm": 0.5124915242195129,
      "learning_rate": 0.00015566697453489906,
      "loss": 0.0454,
      "step": 1685
    },
    {
      "epoch": 0.44558697514995715,
      "grad_norm": 0.3159286081790924,
      "learning_rate": 0.00015553503100672913,
      "loss": 0.047,
      "step": 1690
    },
    {
      "epoch": 0.44690527981016415,
      "grad_norm": 0.032126396894454956,
      "learning_rate": 0.00015540308747855918,
      "loss": 0.0151,
      "step": 1695
    },
    {
      "epoch": 0.4482235844703711,
      "grad_norm": 0.04663548618555069,
      "learning_rate": 0.00015527114395038924,
      "loss": 0.0375,
      "step": 1700
    },
    {
      "epoch": 0.4495418891305781,
      "grad_norm": 0.013753900304436684,
      "learning_rate": 0.0001551392004222193,
      "loss": 0.0485,
      "step": 1705
    },
    {
      "epoch": 0.45086019379078507,
      "grad_norm": 1.9952393770217896,
      "learning_rate": 0.00015500725689404936,
      "loss": 0.0625,
      "step": 1710
    },
    {
      "epoch": 0.452178498450992,
      "grad_norm": 0.014283270575106144,
      "learning_rate": 0.00015487531336587943,
      "loss": 0.0037,
      "step": 1715
    },
    {
      "epoch": 0.453496803111199,
      "grad_norm": 0.3897913098335266,
      "learning_rate": 0.00015474336983770947,
      "loss": 0.0304,
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.454815107771406, | |
| "grad_norm": 0.3730885684490204, | |
| "learning_rate": 0.00015461142630953954, | |
| "loss": 0.0115, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 0.45613341243161293, | |
| "grad_norm": 0.035858724266290665, | |
| "learning_rate": 0.00015447948278136958, | |
| "loss": 0.0021, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.4574517170918199, | |
| "grad_norm": 0.20589517056941986, | |
| "learning_rate": 0.00015434753925319965, | |
| "loss": 0.0132, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 0.4587700217520269, | |
| "grad_norm": 0.004939342383295298, | |
| "learning_rate": 0.00015421559572502972, | |
| "loss": 0.0471, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.46008832641223385, | |
| "grad_norm": 0.03493283689022064, | |
| "learning_rate": 0.00015408365219685976, | |
| "loss": 0.0062, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 0.46140663107244084, | |
| "grad_norm": 0.045927103608846664, | |
| "learning_rate": 0.0001539517086686898, | |
| "loss": 0.0283, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.46272493573264784, | |
| "grad_norm": 0.012629454955458641, | |
| "learning_rate": 0.00015381976514051984, | |
| "loss": 0.0133, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 0.46404324039285477, | |
| "grad_norm": 0.8001697659492493, | |
| "learning_rate": 0.0001536878216123499, | |
| "loss": 0.0224, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.46536154505306176, | |
| "grad_norm": 0.002036362886428833, | |
| "learning_rate": 0.00015355587808417998, | |
| "loss": 0.0066, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 0.46667984971326876, | |
| "grad_norm": 1.0261330604553223, | |
| "learning_rate": 0.00015342393455601002, | |
| "loss": 0.191, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.4679981543734757, | |
| "grad_norm": 0.3033429682254791, | |
| "learning_rate": 0.0001532919910278401, | |
| "loss": 0.0222, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 0.4693164590336827, | |
| "grad_norm": 0.36911338567733765, | |
| "learning_rate": 0.00015316004749967014, | |
| "loss": 0.0363, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.4706347636938897, | |
| "grad_norm": 0.0406811460852623, | |
| "learning_rate": 0.0001530281039715002, | |
| "loss": 0.0283, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 0.4719530683540966, | |
| "grad_norm": 0.23334211111068726, | |
| "learning_rate": 0.00015289616044333027, | |
| "loss": 0.0274, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.4732713730143036, | |
| "grad_norm": 0.013081169687211514, | |
| "learning_rate": 0.00015276421691516032, | |
| "loss": 0.0221, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 0.4745896776745106, | |
| "grad_norm": 0.2480790615081787, | |
| "learning_rate": 0.00015263227338699039, | |
| "loss": 0.019, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.47590798233471754, | |
| "grad_norm": 0.0373196005821228, | |
| "learning_rate": 0.00015250032985882043, | |
| "loss": 0.0292, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 0.47722628699492453, | |
| "grad_norm": 0.004609994124621153, | |
| "learning_rate": 0.0001523683863306505, | |
| "loss": 0.0918, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.4785445916551315, | |
| "grad_norm": 0.02370987832546234, | |
| "learning_rate": 0.00015223644280248054, | |
| "loss": 0.0462, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 0.47986289631533846, | |
| "grad_norm": 0.05842221528291702, | |
| "learning_rate": 0.0001521044992743106, | |
| "loss": 0.0595, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.48118120097554545, | |
| "grad_norm": 0.009685276076197624, | |
| "learning_rate": 0.00015197255574614068, | |
| "loss": 0.0074, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 0.48249950563575245, | |
| "grad_norm": 0.8933250308036804, | |
| "learning_rate": 0.00015184061221797072, | |
| "loss": 0.0757, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.4838178102959594, | |
| "grad_norm": 0.07075401395559311, | |
| "learning_rate": 0.0001517086686898008, | |
| "loss": 0.0226, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 0.4851361149561664, | |
| "grad_norm": 0.732706606388092, | |
| "learning_rate": 0.00015157672516163083, | |
| "loss": 0.0161, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.48645441961637337, | |
| "grad_norm": 1.1897023916244507, | |
| "learning_rate": 0.0001514447816334609, | |
| "loss": 0.0265, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 0.4877727242765803, | |
| "grad_norm": 0.052572328597307205, | |
| "learning_rate": 0.00015131283810529094, | |
| "loss": 0.0094, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.4890910289367873, | |
| "grad_norm": 0.08263898640871048, | |
| "learning_rate": 0.00015118089457712098, | |
| "loss": 0.0631, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 0.4904093335969943, | |
| "grad_norm": 0.03225664421916008, | |
| "learning_rate": 0.00015104895104895105, | |
| "loss": 0.023, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.4917276382572012, | |
| "grad_norm": 0.007935039699077606, | |
| "learning_rate": 0.0001509170075207811, | |
| "loss": 0.0039, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 0.4930459429174082, | |
| "grad_norm": 0.00830796267837286, | |
| "learning_rate": 0.00015078506399261116, | |
| "loss": 0.007, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.4943642475776152, | |
| "grad_norm": 0.08042234182357788, | |
| "learning_rate": 0.00015065312046444123, | |
| "loss": 0.0366, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 0.49568255223782215, | |
| "grad_norm": 0.009092851541936398, | |
| "learning_rate": 0.00015052117693627128, | |
| "loss": 0.0107, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.49700085689802914, | |
| "grad_norm": 0.2674141824245453, | |
| "learning_rate": 0.00015038923340810135, | |
| "loss": 0.0076, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 0.49831916155823613, | |
| "grad_norm": 0.07694366574287415, | |
| "learning_rate": 0.0001502572898799314, | |
| "loss": 0.0252, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.49963746621844307, | |
| "grad_norm": 0.5699467062950134, | |
| "learning_rate": 0.00015012534635176146, | |
| "loss": 0.0487, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 0.5009557708786501, | |
| "grad_norm": 0.18800878524780273, | |
| "learning_rate": 0.0001499934028235915, | |
| "loss": 0.0183, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.5022740755388571, | |
| "grad_norm": 0.019469989463686943, | |
| "learning_rate": 0.00014986145929542157, | |
| "loss": 0.0268, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 0.503592380199064, | |
| "grad_norm": 0.01890506222844124, | |
| "learning_rate": 0.00014972951576725164, | |
| "loss": 0.0449, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.5049106848592709, | |
| "grad_norm": 0.0006314461352303624, | |
| "learning_rate": 0.00014959757223908168, | |
| "loss": 0.0056, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 0.5062289895194779, | |
| "grad_norm": 0.32654041051864624, | |
| "learning_rate": 0.00014946562871091175, | |
| "loss": 0.0256, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.5075472941796849, | |
| "grad_norm": 0.7803483605384827, | |
| "learning_rate": 0.0001493336851827418, | |
| "loss": 0.0374, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 0.5088655988398919, | |
| "grad_norm": 0.028441445901989937, | |
| "learning_rate": 0.00014920174165457186, | |
| "loss": 0.0161, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.5101839035000989, | |
| "grad_norm": 0.028379200026392937, | |
| "learning_rate": 0.00014906979812640193, | |
| "loss": 0.0151, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 0.5115022081603059, | |
| "grad_norm": 0.021159596741199493, | |
| "learning_rate": 0.00014893785459823197, | |
| "loss": 0.0303, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.5128205128205128, | |
| "grad_norm": 0.24903325736522675, | |
| "learning_rate": 0.000148805911070062, | |
| "loss": 0.0076, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 0.5141388174807198, | |
| "grad_norm": 0.007065301761031151, | |
| "learning_rate": 0.00014867396754189206, | |
| "loss": 0.022, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.5154571221409268, | |
| "grad_norm": 0.004032329190522432, | |
| "learning_rate": 0.00014854202401372212, | |
| "loss": 0.0083, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 0.5167754268011338, | |
| "grad_norm": 0.3045775592327118, | |
| "learning_rate": 0.0001484100804855522, | |
| "loss": 0.0113, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.5180937314613407, | |
| "grad_norm": 0.36974939703941345, | |
| "learning_rate": 0.00014827813695738224, | |
| "loss": 0.0267, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 0.5194120361215477, | |
| "grad_norm": 0.009729950688779354, | |
| "learning_rate": 0.0001481461934292123, | |
| "loss": 0.027, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.5207303407817546, | |
| "grad_norm": 0.0013097926275804639, | |
| "learning_rate": 0.00014801424990104235, | |
| "loss": 0.003, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 0.5220486454419616, | |
| "grad_norm": 0.0706263929605484, | |
| "learning_rate": 0.00014788230637287242, | |
| "loss": 0.0193, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.5233669501021686, | |
| "grad_norm": 1.435702919960022, | |
| "learning_rate": 0.00014775036284470249, | |
| "loss": 0.0647, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 0.5246852547623756, | |
| "grad_norm": 0.00661757867783308, | |
| "learning_rate": 0.00014761841931653253, | |
| "loss": 0.0373, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.5260035594225826, | |
| "grad_norm": 0.12014541029930115, | |
| "learning_rate": 0.0001474864757883626, | |
| "loss": 0.0178, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 0.5273218640827896, | |
| "grad_norm": 1.0549248456954956, | |
| "learning_rate": 0.00014735453226019264, | |
| "loss": 0.0191, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5273218640827896, | |
| "eval_loss": 0.037292081862688065, | |
| "eval_runtime": 454.3033, | |
| "eval_samples_per_second": 7.422, | |
| "eval_steps_per_second": 3.711, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.5286401687429965, | |
| "grad_norm": 0.47634151577949524, | |
| "learning_rate": 0.0001472225887320227, | |
| "loss": 0.0404, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 0.5299584734032035, | |
| "grad_norm": 0.006752463988959789, | |
| "learning_rate": 0.00014709064520385275, | |
| "loss": 0.034, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.5312767780634104, | |
| "grad_norm": 0.20780125260353088, | |
| "learning_rate": 0.00014695870167568282, | |
| "loss": 0.0421, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 0.5325950827236174, | |
| "grad_norm": 0.010941066779196262, | |
| "learning_rate": 0.0001468267581475129, | |
| "loss": 0.0086, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.5339133873838244, | |
| "grad_norm": 0.3439581096172333, | |
| "learning_rate": 0.00014669481461934293, | |
| "loss": 0.0187, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 0.5352316920440314, | |
| "grad_norm": 0.14961636066436768, | |
| "learning_rate": 0.000146562871091173, | |
| "loss": 0.0504, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.5365499967042383, | |
| "grad_norm": 0.0044641937129199505, | |
| "learning_rate": 0.00014643092756300304, | |
| "loss": 0.0134, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 0.5378683013644453, | |
| "grad_norm": 0.14088386297225952, | |
| "learning_rate": 0.0001462989840348331, | |
| "loss": 0.0096, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.5391866060246523, | |
| "grad_norm": 0.48116979002952576, | |
| "learning_rate": 0.00014616704050666315, | |
| "loss": 0.0124, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 0.5405049106848593, | |
| "grad_norm": 0.3688766360282898, | |
| "learning_rate": 0.0001460350969784932, | |
| "loss": 0.0226, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5418232153450663, | |
| "grad_norm": 0.002938181860372424, | |
| "learning_rate": 0.00014590315345032326, | |
| "loss": 0.0267, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 0.5431415200052733, | |
| "grad_norm": 0.3335214853286743, | |
| "learning_rate": 0.0001457712099221533, | |
| "loss": 0.0367, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.5444598246654802, | |
| "grad_norm": 0.004644686821848154, | |
| "learning_rate": 0.00014563926639398338, | |
| "loss": 0.0121, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 0.5457781293256871, | |
| "grad_norm": 0.19505545496940613, | |
| "learning_rate": 0.00014550732286581345, | |
| "loss": 0.0591, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.5470964339858941, | |
| "grad_norm": 0.018028756603598595, | |
| "learning_rate": 0.0001453753793376435, | |
| "loss": 0.0131, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 0.5484147386461011, | |
| "grad_norm": 0.045639291405677795, | |
| "learning_rate": 0.00014524343580947356, | |
| "loss": 0.0443, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.5497330433063081, | |
| "grad_norm": 0.727981686592102, | |
| "learning_rate": 0.0001451114922813036, | |
| "loss": 0.0205, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 0.5510513479665151, | |
| "grad_norm": 0.03766491636633873, | |
| "learning_rate": 0.00014497954875313367, | |
| "loss": 0.0067, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.552369652626722, | |
| "grad_norm": 0.1911504715681076, | |
| "learning_rate": 0.0001448476052249637, | |
| "loss": 0.0397, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 0.553687957286929, | |
| "grad_norm": 0.08238353580236435, | |
| "learning_rate": 0.00014471566169679378, | |
| "loss": 0.0513, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.555006261947136, | |
| "grad_norm": 0.06317206472158432, | |
| "learning_rate": 0.00014458371816862385, | |
| "loss": 0.0178, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 0.556324566607343, | |
| "grad_norm": 0.0652734637260437, | |
| "learning_rate": 0.0001444517746404539, | |
| "loss": 0.0184, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 0.55764287126755, | |
| "grad_norm": 0.05471858009696007, | |
| "learning_rate": 0.00014431983111228396, | |
| "loss": 0.0089, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 0.558961175927757, | |
| "grad_norm": 0.005062670446932316, | |
| "learning_rate": 0.000144187887584114, | |
| "loss": 0.0052, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 0.5602794805879638, | |
| "grad_norm": 0.06337414681911469, | |
| "learning_rate": 0.00014405594405594407, | |
| "loss": 0.053, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 0.5615977852481708, | |
| "grad_norm": 0.33745357394218445, | |
| "learning_rate": 0.00014392400052777414, | |
| "loss": 0.0166, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 0.5629160899083778, | |
| "grad_norm": 0.7382741570472717, | |
| "learning_rate": 0.00014379205699960418, | |
| "loss": 0.0191, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 0.5642343945685848, | |
| "grad_norm": 0.007551972754299641, | |
| "learning_rate": 0.00014366011347143425, | |
| "loss": 0.0022, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 0.5655526992287918, | |
| "grad_norm": 0.6260896921157837, | |
| "learning_rate": 0.00014352816994326427, | |
| "loss": 0.0095, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 0.5668710038889987, | |
| "grad_norm": 0.11619322001934052, | |
| "learning_rate": 0.00014339622641509434, | |
| "loss": 0.015, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5681893085492057, | |
| "grad_norm": 1.1440670490264893, | |
| "learning_rate": 0.0001432642828869244, | |
| "loss": 0.1343, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 0.5695076132094127, | |
| "grad_norm": 1.1793878078460693, | |
| "learning_rate": 0.00014313233935875445, | |
| "loss": 0.0968, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 0.5708259178696197, | |
| "grad_norm": 0.6865736842155457, | |
| "learning_rate": 0.00014300039583058452, | |
| "loss": 0.0195, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 0.5721442225298267, | |
| "grad_norm": 0.140816792845726, | |
| "learning_rate": 0.00014286845230241456, | |
| "loss": 0.0761, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 0.5734625271900337, | |
| "grad_norm": 0.04071786254644394, | |
| "learning_rate": 0.00014273650877424463, | |
| "loss": 0.0193, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 0.5747808318502405, | |
| "grad_norm": 0.044617727398872375, | |
| "learning_rate": 0.0001426045652460747, | |
| "loss": 0.0112, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 0.5760991365104475, | |
| "grad_norm": 0.11001799255609512, | |
| "learning_rate": 0.00014247262171790474, | |
| "loss": 0.0039, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 0.5774174411706545, | |
| "grad_norm": 0.0036315324250608683, | |
| "learning_rate": 0.0001423406781897348, | |
| "loss": 0.0038, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 0.5787357458308615, | |
| "grad_norm": 0.9866570830345154, | |
| "learning_rate": 0.00014220873466156485, | |
| "loss": 0.025, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 0.5800540504910685, | |
| "grad_norm": 0.023570384830236435, | |
| "learning_rate": 0.00014207679113339492, | |
| "loss": 0.0468, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.5813723551512755, | |
| "grad_norm": 0.20010559260845184, | |
| "learning_rate": 0.00014194484760522496, | |
| "loss": 0.0198, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 0.5826906598114824, | |
| "grad_norm": 0.06153270602226257, | |
| "learning_rate": 0.00014181290407705503, | |
| "loss": 0.0764, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 0.5840089644716894, | |
| "grad_norm": 0.033162448555231094, | |
| "learning_rate": 0.0001416809605488851, | |
| "loss": 0.028, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 0.5853272691318964, | |
| "grad_norm": 0.428382933139801, | |
| "learning_rate": 0.00014154901702071514, | |
| "loss": 0.0652, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 0.5866455737921034, | |
| "grad_norm": 0.25004762411117554, | |
| "learning_rate": 0.0001414170734925452, | |
| "loss": 0.0411, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 0.5879638784523104, | |
| "grad_norm": 0.22649863362312317, | |
| "learning_rate": 0.00014128512996437525, | |
| "loss": 0.0517, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 0.5892821831125173, | |
| "grad_norm": 0.035932112485170364, | |
| "learning_rate": 0.00014115318643620532, | |
| "loss": 0.015, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 0.5906004877727242, | |
| "grad_norm": 0.3800172507762909, | |
| "learning_rate": 0.00014102124290803536, | |
| "loss": 0.0324, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 0.5919187924329312, | |
| "grad_norm": 0.6974118947982788, | |
| "learning_rate": 0.0001408892993798654, | |
| "loss": 0.0216, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 0.5932370970931382, | |
| "grad_norm": 0.15472032129764557, | |
| "learning_rate": 0.00014075735585169548, | |
| "loss": 0.0164, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.5945554017533452, | |
| "grad_norm": 0.015000814571976662, | |
| "learning_rate": 0.00014062541232352552, | |
| "loss": 0.0395, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 0.5958737064135522, | |
| "grad_norm": 0.052086081355810165, | |
| "learning_rate": 0.0001404934687953556, | |
| "loss": 0.0032, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 0.5971920110737592, | |
| "grad_norm": 0.004600350745022297, | |
| "learning_rate": 0.00014036152526718566, | |
| "loss": 0.0056, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 0.5985103157339661, | |
| "grad_norm": 0.4940958321094513, | |
| "learning_rate": 0.0001402295817390157, | |
| "loss": 0.0206, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 0.5998286203941731, | |
| "grad_norm": 0.09658394008874893, | |
| "learning_rate": 0.00014009763821084577, | |
| "loss": 0.0052, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 0.60114692505438, | |
| "grad_norm": 0.00020539117394946516, | |
| "learning_rate": 0.0001399656946826758, | |
| "loss": 0.087, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 0.602465229714587, | |
| "grad_norm": 0.1871018409729004, | |
| "learning_rate": 0.00013983375115450588, | |
| "loss": 0.0812, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 0.603783534374794, | |
| "grad_norm": 0.02583954855799675, | |
| "learning_rate": 0.00013970180762633592, | |
| "loss": 0.0232, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 0.605101839035001, | |
| "grad_norm": 1.2103784084320068, | |
| "learning_rate": 0.000139569864098166, | |
| "loss": 0.0151, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 0.6064201436952079, | |
| "grad_norm": 0.023514943197369576, | |
| "learning_rate": 0.00013943792056999606, | |
| "loss": 0.0193, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.6077384483554149, | |
| "grad_norm": 0.0076395305804908276, | |
| "learning_rate": 0.0001393059770418261, | |
| "loss": 0.0379, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 0.6090567530156219, | |
| "grad_norm": 0.12412039190530777, | |
| "learning_rate": 0.00013917403351365617, | |
| "loss": 0.0095, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6103750576758289, | |
| "grad_norm": 0.021904783323407173, | |
| "learning_rate": 0.0001390420899854862, | |
| "loss": 0.0166, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 0.6116933623360359, | |
| "grad_norm": 0.004012851510196924, | |
| "learning_rate": 0.00013891014645731628, | |
| "loss": 0.0103, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6130116669962429, | |
| "grad_norm": 0.007267913781106472, | |
| "learning_rate": 0.00013877820292914635, | |
| "loss": 0.0708, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 0.6143299716564498, | |
| "grad_norm": 0.10363642126321793, | |
| "learning_rate": 0.0001386462594009764, | |
| "loss": 0.0473, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6156482763166568, | |
| "grad_norm": 0.04899830371141434, | |
| "learning_rate": 0.00013851431587280646, | |
| "loss": 0.0283, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 0.6169665809768637, | |
| "grad_norm": 0.39460498094558716, | |
| "learning_rate": 0.0001383823723446365, | |
| "loss": 0.0597, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6182848856370707, | |
| "grad_norm": 0.04092290997505188, | |
| "learning_rate": 0.00013825042881646655, | |
| "loss": 0.0167, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 0.6196031902972777, | |
| "grad_norm": 0.2781132161617279, | |
| "learning_rate": 0.00013811848528829662, | |
| "loss": 0.0097, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6209214949574847, | |
| "grad_norm": 0.041443537920713425, | |
| "learning_rate": 0.00013798654176012666, | |
| "loss": 0.0226, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 0.6222397996176916, | |
| "grad_norm": 0.1242462694644928, | |
| "learning_rate": 0.00013785459823195673, | |
| "loss": 0.0055, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6235581042778986, | |
| "grad_norm": 0.4440467357635498, | |
| "learning_rate": 0.00013772265470378677, | |
| "loss": 0.049, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 0.6248764089381056, | |
| "grad_norm": 0.014354427345097065, | |
| "learning_rate": 0.00013759071117561684, | |
| "loss": 0.0327, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6261947135983126, | |
| "grad_norm": 0.011539973318576813, | |
| "learning_rate": 0.0001374587676474469, | |
| "loss": 0.0222, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 0.6275130182585196, | |
| "grad_norm": 0.23539051413536072, | |
| "learning_rate": 0.00013732682411927695, | |
| "loss": 0.0816, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.6288313229187266, | |
| "grad_norm": 0.26793941855430603, | |
| "learning_rate": 0.00013719488059110702, | |
| "loss": 0.0325, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 0.6301496275789334, | |
| "grad_norm": 0.01662217453122139, | |
| "learning_rate": 0.00013706293706293706, | |
| "loss": 0.0221, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6314679322391404, | |
| "grad_norm": 0.30669671297073364, | |
| "learning_rate": 0.00013693099353476713, | |
| "loss": 0.026, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 0.6327862368993474, | |
| "grad_norm": 0.03350894898176193, | |
| "learning_rate": 0.00013679905000659717, | |
| "loss": 0.0072, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6341045415595544, | |
| "grad_norm": 0.014983875676989555, | |
| "learning_rate": 0.00013666710647842724, | |
| "loss": 0.049, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 0.6354228462197614, | |
| "grad_norm": 1.8989384174346924, | |
| "learning_rate": 0.0001365351629502573, | |
| "loss": 0.0335, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6367411508799684, | |
| "grad_norm": 0.030135562643408775, | |
| "learning_rate": 0.00013640321942208735, | |
| "loss": 0.0051, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 0.6380594555401753, | |
| "grad_norm": 0.02079075388610363, | |
| "learning_rate": 0.00013627127589391742, | |
| "loss": 0.0138, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6393777602003823, | |
| "grad_norm": 0.06065403297543526, | |
| "learning_rate": 0.00013613933236574746, | |
| "loss": 0.0357, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 0.6406960648605893, | |
| "grad_norm": 0.2980937659740448, | |
| "learning_rate": 0.00013600738883757753, | |
| "loss": 0.0138, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6420143695207963, | |
| "grad_norm": 0.4820438623428345, | |
| "learning_rate": 0.00013587544530940758, | |
| "loss": 0.01, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 0.6433326741810033, | |
| "grad_norm": 0.005618259310722351, | |
| "learning_rate": 0.00013574350178123765, | |
| "loss": 0.0052, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6446509788412103, | |
| "grad_norm": 0.7173821926116943, | |
| "learning_rate": 0.0001356115582530677, | |
| "loss": 0.0133, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 0.6459692835014171, | |
| "grad_norm": 0.0053142281249165535, | |
| "learning_rate": 0.00013547961472489773, | |
| "loss": 0.0045, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6472875881616241, | |
| "grad_norm": 0.06118829548358917, | |
| "learning_rate": 0.0001353476711967278, | |
| "loss": 0.056, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 0.6486058928218311, | |
| "grad_norm": 3.5878078937530518, | |
| "learning_rate": 0.00013521572766855787, | |
| "loss": 0.0232, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6499241974820381, | |
| "grad_norm": 0.004911276511847973, | |
| "learning_rate": 0.0001350837841403879, | |
| "loss": 0.0074, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 0.6512425021422451, | |
| "grad_norm": 0.0028026222717016935, | |
| "learning_rate": 0.00013495184061221798, | |
| "loss": 0.0782, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.6525608068024521, | |
| "grad_norm": 0.7317615747451782, | |
| "learning_rate": 0.00013481989708404802, | |
| "loss": 0.0222, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 0.653879111462659, | |
| "grad_norm": 0.01835751160979271, | |
| "learning_rate": 0.0001346879535558781, | |
| "loss": 0.0661, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.655197416122866, | |
| "grad_norm": 0.03598962351679802, | |
| "learning_rate": 0.00013455601002770813, | |
| "loss": 0.0395, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 0.656515720783073, | |
| "grad_norm": 0.013886351138353348, | |
| "learning_rate": 0.0001344240664995382, | |
| "loss": 0.0156, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.65783402544328, | |
| "grad_norm": 5.741530895233154, | |
| "learning_rate": 0.00013429212297136827, | |
| "loss": 0.0317, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 0.659152330103487, | |
| "grad_norm": 0.20793496072292328, | |
| "learning_rate": 0.0001341601794431983, | |
| "loss": 0.0072, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.659152330103487, | |
| "eval_loss": 0.0300898440182209, | |
| "eval_runtime": 453.0554, | |
| "eval_samples_per_second": 7.443, | |
| "eval_steps_per_second": 3.721, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6604706347636939, | |
| "grad_norm": 0.03460961952805519, | |
| "learning_rate": 0.00013402823591502838, | |
| "loss": 0.0097, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 0.6617889394239008, | |
| "grad_norm": 0.31785696744918823, | |
| "learning_rate": 0.00013389629238685842, | |
| "loss": 0.0303, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6631072440841078, | |
| "grad_norm": 0.4273851215839386, | |
| "learning_rate": 0.0001337643488586885, | |
| "loss": 0.0499, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 0.6644255487443148, | |
| "grad_norm": 0.02236153744161129, | |
| "learning_rate": 0.00013363240533051856, | |
| "loss": 0.0069, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.6657438534045218, | |
| "grad_norm": 0.1592864990234375, | |
| "learning_rate": 0.0001335004618023486, | |
| "loss": 0.0326, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 0.6670621580647288, | |
| "grad_norm": 0.029961545020341873, | |
| "learning_rate": 0.00013336851827417867, | |
| "loss": 0.0178, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.6683804627249358, | |
| "grad_norm": 0.03120764158666134, | |
| "learning_rate": 0.00013323657474600872, | |
| "loss": 0.115, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 0.6696987673851427, | |
| "grad_norm": 0.01060028001666069, | |
| "learning_rate": 0.00013310463121783879, | |
| "loss": 0.0036, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.6710170720453497, | |
| "grad_norm": 0.053470809012651443, | |
| "learning_rate": 0.00013297268768966883, | |
| "loss": 0.0079, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 0.6723353767055567, | |
| "grad_norm": 0.022777097299695015, | |
| "learning_rate": 0.00013284074416149887, | |
| "loss": 0.0078, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6736536813657636, | |
| "grad_norm": 0.0548521913588047, | |
| "learning_rate": 0.00013270880063332894, | |
| "loss": 0.0503, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 0.6749719860259706, | |
| "grad_norm": 0.02028457075357437, | |
| "learning_rate": 0.00013257685710515898, | |
| "loss": 0.0096, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.6762902906861776, | |
| "grad_norm": 0.01569107361137867, | |
| "learning_rate": 0.00013244491357698905, | |
| "loss": 0.008, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 0.6776085953463845, | |
| "grad_norm": 0.00743742985650897, | |
| "learning_rate": 0.00013231297004881912, | |
| "loss": 0.005, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.6789269000065915, | |
| "grad_norm": 0.025164416059851646, | |
| "learning_rate": 0.00013218102652064916, | |
| "loss": 0.018, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 0.6802452046667985, | |
| "grad_norm": 0.3653188645839691, | |
| "learning_rate": 0.00013204908299247923, | |
| "loss": 0.0295, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.6815635093270055, | |
| "grad_norm": 0.685422956943512, | |
| "learning_rate": 0.00013191713946430927, | |
| "loss": 0.0335, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 0.6828818139872125, | |
| "grad_norm": 0.675740122795105, | |
| "learning_rate": 0.00013178519593613934, | |
| "loss": 0.0592, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.6842001186474194, | |
| "grad_norm": 0.10513252764940262, | |
| "learning_rate": 0.00013165325240796938, | |
| "loss": 0.0353, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 0.6855184233076264, | |
| "grad_norm": 0.43512973189353943, | |
| "learning_rate": 0.00013152130887979945, | |
| "loss": 0.0142, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6868367279678333, | |
| "grad_norm": 0.029436839744448662, | |
| "learning_rate": 0.00013138936535162952, | |
| "loss": 0.0042, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 0.6881550326280403, | |
| "grad_norm": 0.5607122778892517, | |
| "learning_rate": 0.00013125742182345957, | |
| "loss": 0.0184, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.6894733372882473, | |
| "grad_norm": 0.11365406215190887, | |
| "learning_rate": 0.00013112547829528963, | |
| "loss": 0.006, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 0.6907916419484543, | |
| "grad_norm": 0.047227244824171066, | |
| "learning_rate": 0.00013099353476711968, | |
| "loss": 0.008, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.6921099466086612, | |
| "grad_norm": 0.0005877618095837533, | |
| "learning_rate": 0.00013086159123894975, | |
| "loss": 0.0286, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 0.6934282512688682, | |
| "grad_norm": 0.010759112425148487, | |
| "learning_rate": 0.0001307296477107798, | |
| "loss": 0.0062, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.6947465559290752, | |
| "grad_norm": 0.07117745280265808, | |
| "learning_rate": 0.00013059770418260986, | |
| "loss": 0.0891, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 0.6960648605892822, | |
| "grad_norm": 0.0639057606458664, | |
| "learning_rate": 0.00013046576065443993, | |
| "loss": 0.0072, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.6973831652494892, | |
| "grad_norm": 0.027350090444087982, | |
| "learning_rate": 0.00013033381712626994, | |
| "loss": 0.0103, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 0.6987014699096962, | |
| "grad_norm": 0.015336195938289165, | |
| "learning_rate": 0.0001302018735981, | |
| "loss": 0.0041, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.700019774569903, | |
| "grad_norm": 1.0650830268859863, | |
| "learning_rate": 0.00013006993006993008, | |
| "loss": 0.0443, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 0.70133807923011, | |
| "grad_norm": 0.019073212519288063, | |
| "learning_rate": 0.00012993798654176012, | |
| "loss": 0.0331, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.702656383890317, | |
| "grad_norm": 0.10109209269285202, | |
| "learning_rate": 0.0001298060430135902, | |
| "loss": 0.0054, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 0.703974688550524, | |
| "grad_norm": 0.03528957813978195, | |
| "learning_rate": 0.00012967409948542023, | |
| "loss": 0.0427, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.705292993210731, | |
| "grad_norm": 0.03577788919210434, | |
| "learning_rate": 0.0001295421559572503, | |
| "loss": 0.023, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 0.706611297870938, | |
| "grad_norm": 0.5576180815696716, | |
| "learning_rate": 0.00012941021242908034, | |
| "loss": 0.0416, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7079296025311449, | |
| "grad_norm": 0.017131298780441284, | |
| "learning_rate": 0.0001292782689009104, | |
| "loss": 0.0235, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 0.7092479071913519, | |
| "grad_norm": 0.8517888784408569, | |
| "learning_rate": 0.00012914632537274048, | |
| "loss": 0.0168, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7105662118515589, | |
| "grad_norm": 0.23812156915664673, | |
| "learning_rate": 0.00012901438184457052, | |
| "loss": 0.0483, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 0.7118845165117659, | |
| "grad_norm": 0.11746613681316376, | |
| "learning_rate": 0.0001288824383164006, | |
| "loss": 0.0255, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7132028211719729, | |
| "grad_norm": 0.20089928805828094, | |
| "learning_rate": 0.00012875049478823064, | |
| "loss": 0.0267, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 0.7145211258321799, | |
| "grad_norm": 0.8301129937171936, | |
| "learning_rate": 0.0001286185512600607, | |
| "loss": 0.016, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.7158394304923867, | |
| "grad_norm": 0.01838674768805504, | |
| "learning_rate": 0.00012848660773189077, | |
| "loss": 0.0229, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 0.7171577351525937, | |
| "grad_norm": 0.03670337051153183, | |
| "learning_rate": 0.00012835466420372082, | |
| "loss": 0.038, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.7184760398128007, | |
| "grad_norm": 0.0452633760869503, | |
| "learning_rate": 0.00012822272067555089, | |
| "loss": 0.0622, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 0.7197943444730077, | |
| "grad_norm": 0.09503110498189926, | |
| "learning_rate": 0.00012809077714738093, | |
| "loss": 0.0209, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7211126491332147, | |
| "grad_norm": 1.0327308177947998, | |
| "learning_rate": 0.000127958833619211, | |
| "loss": 0.0361, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 0.7224309537934217, | |
| "grad_norm": 1.0049290657043457, | |
| "learning_rate": 0.00012782689009104104, | |
| "loss": 0.0365, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.7237492584536286, | |
| "grad_norm": 0.029774073511362076, | |
| "learning_rate": 0.00012769494656287108, | |
| "loss": 0.0257, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 0.7250675631138356, | |
| "grad_norm": 0.20974040031433105, | |
| "learning_rate": 0.00012756300303470115, | |
| "loss": 0.0542, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7263858677740426, | |
| "grad_norm": 0.8153854608535767, | |
| "learning_rate": 0.0001274310595065312, | |
| "loss": 0.0216, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 0.7277041724342496, | |
| "grad_norm": 0.4393698573112488, | |
| "learning_rate": 0.00012729911597836126, | |
| "loss": 0.0451, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7290224770944566, | |
| "grad_norm": 0.06990349292755127, | |
| "learning_rate": 0.00012716717245019133, | |
| "loss": 0.03, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 0.7303407817546635, | |
| "grad_norm": 0.32689470052719116, | |
| "learning_rate": 0.00012703522892202137, | |
| "loss": 0.0263, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7316590864148704, | |
| "grad_norm": 0.026600876823067665, | |
| "learning_rate": 0.00012690328539385144, | |
| "loss": 0.0404, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 0.7329773910750774, | |
| "grad_norm": 0.11228257417678833, | |
| "learning_rate": 0.00012677134186568148, | |
| "loss": 0.0224, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7342956957352844, | |
| "grad_norm": 0.6469443440437317, | |
| "learning_rate": 0.00012663939833751155, | |
| "loss": 0.0178, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 0.7356140003954914, | |
| "grad_norm": 0.020773250609636307, | |
| "learning_rate": 0.0001265074548093416, | |
| "loss": 0.011, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7369323050556984, | |
| "grad_norm": 0.7378728985786438, | |
| "learning_rate": 0.00012637551128117167, | |
| "loss": 0.0227, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 0.7382506097159054, | |
| "grad_norm": 0.008189595304429531, | |
| "learning_rate": 0.00012624356775300173, | |
| "loss": 0.0892, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7395689143761123, | |
| "grad_norm": 0.031633853912353516, | |
| "learning_rate": 0.00012611162422483178, | |
| "loss": 0.0093, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 0.7408872190363193, | |
| "grad_norm": 0.5078475475311279, | |
| "learning_rate": 0.00012597968069666185, | |
| "loss": 0.0567, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7422055236965263, | |
| "grad_norm": 0.21766887605190277, | |
| "learning_rate": 0.0001258477371684919, | |
| "loss": 0.0485, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 0.7435238283567333, | |
| "grad_norm": 0.3029612898826599, | |
| "learning_rate": 0.00012571579364032196, | |
| "loss": 0.032, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.7448421330169402, | |
| "grad_norm": 1.2135159969329834, | |
| "learning_rate": 0.00012558385011215203, | |
| "loss": 0.0139, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 0.7461604376771472, | |
| "grad_norm": 0.016875172033905983, | |
| "learning_rate": 0.00012545190658398207, | |
| "loss": 0.0323, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7474787423373541, | |
| "grad_norm": 0.08923230320215225, | |
| "learning_rate": 0.00012531996305581214, | |
| "loss": 0.0343, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 0.7487970469975611, | |
| "grad_norm": 0.2958766520023346, | |
| "learning_rate": 0.00012518801952764215, | |
| "loss": 0.0431, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.7501153516577681, | |
| "grad_norm": 0.7344386577606201, | |
| "learning_rate": 0.00012505607599947222, | |
| "loss": 0.0389, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 0.7514336563179751, | |
| "grad_norm": 0.03681635856628418, | |
| "learning_rate": 0.0001249241324713023, | |
| "loss": 0.0258, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7527519609781821, | |
| "grad_norm": 0.22866861522197723, | |
| "learning_rate": 0.00012479218894313233, | |
| "loss": 0.0223, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 0.7540702656383891, | |
| "grad_norm": 0.029770435765385628, | |
| "learning_rate": 0.0001246602454149624, | |
| "loss": 0.0205, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.755388570298596, | |
| "grad_norm": 0.011845707893371582, | |
| "learning_rate": 0.00012452830188679244, | |
| "loss": 0.0252, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 0.756706874958803, | |
| "grad_norm": 0.06696149706840515, | |
| "learning_rate": 0.00012439635835862251, | |
| "loss": 0.0166, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.75802517961901, | |
| "grad_norm": 0.01653144136071205, | |
| "learning_rate": 0.00012426441483045256, | |
| "loss": 0.0487, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 0.7593434842792169, | |
| "grad_norm": 0.031312476843595505, | |
| "learning_rate": 0.00012413247130228263, | |
| "loss": 0.0155, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7606617889394239, | |
| "grad_norm": 0.011625733226537704, | |
| "learning_rate": 0.0001240005277741127, | |
| "loss": 0.0333, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 0.7619800935996309, | |
| "grad_norm": 0.012089414522051811, | |
| "learning_rate": 0.00012386858424594274, | |
| "loss": 0.003, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.7632983982598378, | |
| "grad_norm": 0.3012307584285736, | |
| "learning_rate": 0.0001237366407177728, | |
| "loss": 0.0172, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 0.7646167029200448, | |
| "grad_norm": 0.31575000286102295, | |
| "learning_rate": 0.00012360469718960285, | |
| "loss": 0.0409, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7659350075802518, | |
| "grad_norm": 0.009794364683330059, | |
| "learning_rate": 0.00012347275366143292, | |
| "loss": 0.0214, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 0.7672533122404588, | |
| "grad_norm": 0.5973085165023804, | |
| "learning_rate": 0.00012334081013326299, | |
| "loss": 0.0245, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.7685716169006658, | |
| "grad_norm": 0.019750040024518967, | |
| "learning_rate": 0.00012320886660509303, | |
| "loss": 0.0063, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 0.7698899215608728, | |
| "grad_norm": 0.06402858346700668, | |
| "learning_rate": 0.0001230769230769231, | |
| "loss": 0.0444, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.7712082262210797, | |
| "grad_norm": 0.02876671403646469, | |
| "learning_rate": 0.00012294497954875314, | |
| "loss": 0.0103, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 0.7725265308812866, | |
| "grad_norm": 0.6962207555770874, | |
| "learning_rate": 0.0001228130360205832, | |
| "loss": 0.0318, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.7738448355414936, | |
| "grad_norm": 0.006536522414535284, | |
| "learning_rate": 0.00012268109249241325, | |
| "loss": 0.0096, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 0.7751631402017006, | |
| "grad_norm": 0.07097168266773224, | |
| "learning_rate": 0.0001225491489642433, | |
| "loss": 0.0174, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.7764814448619076, | |
| "grad_norm": 0.042360126972198486, | |
| "learning_rate": 0.00012241720543607336, | |
| "loss": 0.0158, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 0.7777997495221146, | |
| "grad_norm": 0.01159572321921587, | |
| "learning_rate": 0.0001222852619079034, | |
| "loss": 0.0265, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.7791180541823215, | |
| "grad_norm": 0.38408163189888, | |
| "learning_rate": 0.00012215331837973347, | |
| "loss": 0.0233, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 0.7804363588425285, | |
| "grad_norm": 0.15588605403900146, | |
| "learning_rate": 0.00012202137485156353, | |
| "loss": 0.0041, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.7817546635027355, | |
| "grad_norm": 0.006892362609505653, | |
| "learning_rate": 0.00012188943132339358, | |
| "loss": 0.0026, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 0.7830729681629425, | |
| "grad_norm": 0.030915727838873863, | |
| "learning_rate": 0.00012175748779522364, | |
| "loss": 0.0028, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.7843912728231495, | |
| "grad_norm": 0.8151025772094727, | |
| "learning_rate": 0.00012162554426705371, | |
| "loss": 0.0429, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 0.7857095774833565, | |
| "grad_norm": 0.6765475273132324, | |
| "learning_rate": 0.00012149360073888377, | |
| "loss": 0.0319, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.7870278821435633, | |
| "grad_norm": 0.054469238966703415, | |
| "learning_rate": 0.00012136165721071382, | |
| "loss": 0.0413, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 0.7883461868037703, | |
| "grad_norm": 0.045610666275024414, | |
| "learning_rate": 0.00012122971368254388, | |
| "loss": 0.0521, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.7896644914639773, | |
| "grad_norm": 0.4222470223903656, | |
| "learning_rate": 0.00012109777015437393, | |
| "loss": 0.0846, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 0.7909827961241843, | |
| "grad_norm": 0.0272397268563509, | |
| "learning_rate": 0.00012096582662620399, | |
| "loss": 0.0364, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7909827961241843, | |
| "eval_loss": 0.033312585204839706, | |
| "eval_runtime": 452.2552, | |
| "eval_samples_per_second": 7.456, | |
| "eval_steps_per_second": 3.728, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7923011007843913, | |
| "grad_norm": 0.08674059063196182, | |
| "learning_rate": 0.00012083388309803406, | |
| "loss": 0.0081, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 0.7936194054445982, | |
| "grad_norm": 0.21960832178592682, | |
| "learning_rate": 0.00012070193956986411, | |
| "loss": 0.0468, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.7949377101048052, | |
| "grad_norm": 0.11259289085865021, | |
| "learning_rate": 0.00012056999604169417, | |
| "loss": 0.0124, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 0.7962560147650122, | |
| "grad_norm": 0.02945362776517868, | |
| "learning_rate": 0.00012043805251352422, | |
| "loss": 0.0298, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.7975743194252192, | |
| "grad_norm": 0.27889615297317505, | |
| "learning_rate": 0.00012030610898535428, | |
| "loss": 0.0251, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 0.7988926240854262, | |
| "grad_norm": 0.05873241275548935, | |
| "learning_rate": 0.00012017416545718434, | |
| "loss": 0.0132, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.8002109287456332, | |
| "grad_norm": 0.1570046991109848, | |
| "learning_rate": 0.00012004222192901439, | |
| "loss": 0.0228, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 0.80152923340584, | |
| "grad_norm": 0.12575332820415497, | |
| "learning_rate": 0.00011991027840084443, | |
| "loss": 0.0049, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.802847538066047, | |
| "grad_norm": 0.8416435122489929, | |
| "learning_rate": 0.00011977833487267449, | |
| "loss": 0.0542, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 0.804165842726254, | |
| "grad_norm": 0.2605098485946655, | |
| "learning_rate": 0.00011964639134450454, | |
| "loss": 0.0084, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.805484147386461, | |
| "grad_norm": 0.8996294736862183, | |
| "learning_rate": 0.00011951444781633461, | |
| "loss": 0.0442, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 0.806802452046668, | |
| "grad_norm": 2.7525105476379395, | |
| "learning_rate": 0.00011938250428816467, | |
| "loss": 0.0642, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.808120756706875, | |
| "grad_norm": 0.14955930411815643, | |
| "learning_rate": 0.00011925056075999473, | |
| "loss": 0.0384, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 0.8094390613670819, | |
| "grad_norm": 0.018756115809082985, | |
| "learning_rate": 0.00011911861723182478, | |
| "loss": 0.0154, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.8107573660272889, | |
| "grad_norm": 0.23998615145683289, | |
| "learning_rate": 0.00011898667370365484, | |
| "loss": 0.0413, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 0.8120756706874959, | |
| "grad_norm": 0.27253249287605286, | |
| "learning_rate": 0.00011885473017548489, | |
| "loss": 0.0081, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8133939753477029, | |
| "grad_norm": 0.2925993502140045, | |
| "learning_rate": 0.00011872278664731495, | |
| "loss": 0.0332, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 0.8147122800079099, | |
| "grad_norm": 0.5364832878112793, | |
| "learning_rate": 0.00011859084311914502, | |
| "loss": 0.0143, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.8160305846681168, | |
| "grad_norm": 0.32104921340942383, | |
| "learning_rate": 0.00011845889959097507, | |
| "loss": 0.0216, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 0.8173488893283237, | |
| "grad_norm": 0.0205856766551733, | |
| "learning_rate": 0.00011832695606280513, | |
| "loss": 0.0346, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8186671939885307, | |
| "grad_norm": 0.2541547417640686, | |
| "learning_rate": 0.00011819501253463518, | |
| "loss": 0.0793, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 0.8199854986487377, | |
| "grad_norm": 0.08333491533994675, | |
| "learning_rate": 0.00011806306900646524, | |
| "loss": 0.0049, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8213038033089447, | |
| "grad_norm": 0.0355968177318573, | |
| "learning_rate": 0.0001179311254782953, | |
| "loss": 0.0051, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 0.8226221079691517, | |
| "grad_norm": 0.06948401033878326, | |
| "learning_rate": 0.00011779918195012536, | |
| "loss": 0.013, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.8239404126293587, | |
| "grad_norm": 0.03328891843557358, | |
| "learning_rate": 0.00011766723842195542, | |
| "loss": 0.0122, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 0.8252587172895656, | |
| "grad_norm": 0.013782350346446037, | |
| "learning_rate": 0.00011753529489378548, | |
| "loss": 0.0073, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8265770219497726, | |
| "grad_norm": 0.024390392005443573, | |
| "learning_rate": 0.00011740335136561553, | |
| "loss": 0.0143, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 0.8278953266099796, | |
| "grad_norm": 0.002548128366470337, | |
| "learning_rate": 0.00011727140783744557, | |
| "loss": 0.0027, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8292136312701865, | |
| "grad_norm": 0.11674848943948746, | |
| "learning_rate": 0.00011713946430927563, | |
| "loss": 0.0253, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 0.8305319359303935, | |
| "grad_norm": 0.005774884019047022, | |
| "learning_rate": 0.00011700752078110568, | |
| "loss": 0.0018, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8318502405906005, | |
| "grad_norm": 0.5763069987297058, | |
| "learning_rate": 0.00011687557725293574, | |
| "loss": 0.0119, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 0.8331685452508074, | |
| "grad_norm": 0.0027607593219727278, | |
| "learning_rate": 0.0001167436337247658, | |
| "loss": 0.0279, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8344868499110144, | |
| "grad_norm": 1.859642505645752, | |
| "learning_rate": 0.00011661169019659585, | |
| "loss": 0.0228, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 0.8358051545712214, | |
| "grad_norm": 0.16597022116184235, | |
| "learning_rate": 0.00011647974666842592, | |
| "loss": 0.1228, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.8371234592314284, | |
| "grad_norm": 0.33833742141723633, | |
| "learning_rate": 0.00011634780314025598, | |
| "loss": 0.073, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 0.8384417638916354, | |
| "grad_norm": 0.024682912975549698, | |
| "learning_rate": 0.00011621585961208603, | |
| "loss": 0.0042, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8397600685518424, | |
| "grad_norm": 0.05926942452788353, | |
| "learning_rate": 0.00011608391608391609, | |
| "loss": 0.0066, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 0.8410783732120493, | |
| "grad_norm": 0.1414029747247696, | |
| "learning_rate": 0.00011595197255574614, | |
| "loss": 0.0603, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8423966778722562, | |
| "grad_norm": 0.37928736209869385, | |
| "learning_rate": 0.0001158200290275762, | |
| "loss": 0.0266, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 0.8437149825324632, | |
| "grad_norm": 0.018329354003071785, | |
| "learning_rate": 0.00011568808549940627, | |
| "loss": 0.0047, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8450332871926702, | |
| "grad_norm": 0.2993735373020172, | |
| "learning_rate": 0.00011555614197123632, | |
| "loss": 0.0218, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 0.8463515918528772, | |
| "grad_norm": 0.1767728328704834, | |
| "learning_rate": 0.00011542419844306638, | |
| "loss": 0.0363, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8476698965130842, | |
| "grad_norm": 0.39774414896965027, | |
| "learning_rate": 0.00011529225491489644, | |
| "loss": 0.0506, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 0.8489882011732911, | |
| "grad_norm": 0.021896762773394585, | |
| "learning_rate": 0.00011516031138672649, | |
| "loss": 0.0081, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8503065058334981, | |
| "grad_norm": 0.358372300863266, | |
| "learning_rate": 0.00011502836785855655, | |
| "loss": 0.0224, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 0.8516248104937051, | |
| "grad_norm": 0.01605542004108429, | |
| "learning_rate": 0.00011489642433038662, | |
| "loss": 0.0215, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8529431151539121, | |
| "grad_norm": 0.021189266815781593, | |
| "learning_rate": 0.00011476448080221667, | |
| "loss": 0.0051, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 0.8542614198141191, | |
| "grad_norm": 0.013394076377153397, | |
| "learning_rate": 0.0001146325372740467, | |
| "loss": 0.021, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8555797244743261, | |
| "grad_norm": 0.19848507642745972, | |
| "learning_rate": 0.00011450059374587676, | |
| "loss": 0.0285, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 0.856898029134533, | |
| "grad_norm": 0.2463046759366989, | |
| "learning_rate": 0.00011436865021770683, | |
| "loss": 0.0384, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8582163337947399, | |
| "grad_norm": 0.37432390451431274, | |
| "learning_rate": 0.00011423670668953688, | |
| "loss": 0.0098, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 0.8595346384549469, | |
| "grad_norm": 0.060943394899368286, | |
| "learning_rate": 0.00011410476316136694, | |
| "loss": 0.0087, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.8608529431151539, | |
| "grad_norm": 0.2846696674823761, | |
| "learning_rate": 0.00011397281963319699, | |
| "loss": 0.0148, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 0.8621712477753609, | |
| "grad_norm": 0.009311323054134846, | |
| "learning_rate": 0.00011384087610502705, | |
| "loss": 0.0024, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.8634895524355679, | |
| "grad_norm": 0.046277035027742386, | |
| "learning_rate": 0.0001137089325768571, | |
| "loss": 0.0274, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 0.8648078570957748, | |
| "grad_norm": 0.006024620030075312, | |
| "learning_rate": 0.00011357698904868716, | |
| "loss": 0.0286, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.8661261617559818, | |
| "grad_norm": 0.033578380942344666, | |
| "learning_rate": 0.00011344504552051723, | |
| "loss": 0.0153, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 0.8674444664161888, | |
| "grad_norm": 0.8537917137145996, | |
| "learning_rate": 0.00011331310199234728, | |
| "loss": 0.0304, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.8687627710763958, | |
| "grad_norm": 0.013933337293565273, | |
| "learning_rate": 0.00011318115846417734, | |
| "loss": 0.0112, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 0.8700810757366028, | |
| "grad_norm": 0.35437721014022827, | |
| "learning_rate": 0.0001130492149360074, | |
| "loss": 0.0228, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8713993803968098, | |
| "grad_norm": 1.3024121522903442, | |
| "learning_rate": 0.00011291727140783745, | |
| "loss": 0.0203, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 0.8727176850570166, | |
| "grad_norm": 0.5131255984306335, | |
| "learning_rate": 0.00011278532787966751, | |
| "loss": 0.0181, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.8740359897172236, | |
| "grad_norm": 0.039366886019706726, | |
| "learning_rate": 0.00011265338435149758, | |
| "loss": 0.0192, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 0.8753542943774306, | |
| "grad_norm": 0.13679669797420502, | |
| "learning_rate": 0.00011252144082332763, | |
| "loss": 0.004, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.8766725990376376, | |
| "grad_norm": 0.003076886525377631, | |
| "learning_rate": 0.00011238949729515769, | |
| "loss": 0.0405, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 0.8779909036978446, | |
| "grad_norm": 0.019953785464167595, | |
| "learning_rate": 0.00011225755376698774, | |
| "loss": 0.0241, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.8793092083580516, | |
| "grad_norm": 0.007980377413332462, | |
| "learning_rate": 0.0001121256102388178, | |
| "loss": 0.0064, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 0.8806275130182585, | |
| "grad_norm": 0.018761295825242996, | |
| "learning_rate": 0.00011199366671064784, | |
| "loss": 0.0032, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.8819458176784655, | |
| "grad_norm": 0.022511709481477737, | |
| "learning_rate": 0.0001118617231824779, | |
| "loss": 0.0055, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 0.8832641223386725, | |
| "grad_norm": 0.021270718425512314, | |
| "learning_rate": 0.00011172977965430795, | |
| "loss": 0.033, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8845824269988795, | |
| "grad_norm": 0.02710561640560627, | |
| "learning_rate": 0.00011159783612613801, | |
| "loss": 0.0094, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 0.8859007316590864, | |
| "grad_norm": 0.4353378117084503, | |
| "learning_rate": 0.00011146589259796806, | |
| "loss": 0.0089, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.8872190363192934, | |
| "grad_norm": 0.0257766991853714, | |
| "learning_rate": 0.00011133394906979813, | |
| "loss": 0.0059, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 0.8885373409795003, | |
| "grad_norm": 0.80838942527771, | |
| "learning_rate": 0.00011120200554162819, | |
| "loss": 0.0263, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.8898556456397073, | |
| "grad_norm": 0.007799761835485697, | |
| "learning_rate": 0.00011107006201345824, | |
| "loss": 0.0028, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 0.8911739502999143, | |
| "grad_norm": 0.007315775845199823, | |
| "learning_rate": 0.0001109381184852883, | |
| "loss": 0.0127, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.8924922549601213, | |
| "grad_norm": 1.4861233234405518, | |
| "learning_rate": 0.00011080617495711836, | |
| "loss": 0.0562, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 0.8938105596203283, | |
| "grad_norm": 0.010219530202448368, | |
| "learning_rate": 0.00011067423142894841, | |
| "loss": 0.0438, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.8951288642805353, | |
| "grad_norm": 1.0191857814788818, | |
| "learning_rate": 0.00011054228790077848, | |
| "loss": 0.0493, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 0.8964471689407422, | |
| "grad_norm": 0.01459536887705326, | |
| "learning_rate": 0.00011041034437260854, | |
| "loss": 0.0117, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.8977654736009492, | |
| "grad_norm": 0.008682495914399624, | |
| "learning_rate": 0.00011027840084443859, | |
| "loss": 0.02, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 0.8990837782611562, | |
| "grad_norm": 0.02197263017296791, | |
| "learning_rate": 0.00011014645731626865, | |
| "loss": 0.0454, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.9004020829213631, | |
| "grad_norm": 0.01436714269220829, | |
| "learning_rate": 0.0001100145137880987, | |
| "loss": 0.0283, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 0.9017203875815701, | |
| "grad_norm": 0.14327946305274963, | |
| "learning_rate": 0.00010988257025992876, | |
| "loss": 0.0461, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.9030386922417771, | |
| "grad_norm": 1.671773910522461, | |
| "learning_rate": 0.00010975062673175883, | |
| "loss": 0.054, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 0.904356996901984, | |
| "grad_norm": 0.009926804341375828, | |
| "learning_rate": 0.00010961868320358888, | |
| "loss": 0.0429, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.905675301562191, | |
| "grad_norm": 0.554020881652832, | |
| "learning_rate": 0.00010948673967541894, | |
| "loss": 0.0618, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 0.906993606222398, | |
| "grad_norm": 0.1399248093366623, | |
| "learning_rate": 0.00010935479614724897, | |
| "loss": 0.0229, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.908311910882605, | |
| "grad_norm": 0.02739197015762329, | |
| "learning_rate": 0.00010922285261907904, | |
| "loss": 0.0082, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 0.909630215542812, | |
| "grad_norm": 0.33394527435302734, | |
| "learning_rate": 0.00010909090909090909, | |
| "loss": 0.0403, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9109485202030189, | |
| "grad_norm": 0.08083894103765488, | |
| "learning_rate": 0.00010895896556273915, | |
| "loss": 0.0406, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 0.9122668248632259, | |
| "grad_norm": 0.39336663484573364, | |
| "learning_rate": 0.0001088270220345692, | |
| "loss": 0.02, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.9135851295234328, | |
| "grad_norm": 0.20481553673744202, | |
| "learning_rate": 0.00010869507850639926, | |
| "loss": 0.0221, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 0.9149034341836398, | |
| "grad_norm": 1.4507408142089844, | |
| "learning_rate": 0.00010856313497822932, | |
| "loss": 0.0357, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.9162217388438468, | |
| "grad_norm": 0.2678806483745575, | |
| "learning_rate": 0.00010843119145005937, | |
| "loss": 0.0181, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 0.9175400435040538, | |
| "grad_norm": 0.007361674215644598, | |
| "learning_rate": 0.00010829924792188944, | |
| "loss": 0.0978, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9188583481642607, | |
| "grad_norm": 0.773695707321167, | |
| "learning_rate": 0.0001081673043937195, | |
| "loss": 0.0401, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 0.9201766528244677, | |
| "grad_norm": 0.0010772625682875514, | |
| "learning_rate": 0.00010803536086554955, | |
| "loss": 0.0233, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.9214949574846747, | |
| "grad_norm": 0.08971104770898819, | |
| "learning_rate": 0.00010790341733737961, | |
| "loss": 0.0319, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 0.9228132621448817, | |
| "grad_norm": 0.21372731029987335, | |
| "learning_rate": 0.00010777147380920966, | |
| "loss": 0.0315, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9228132621448817, | |
| "eval_loss": 0.02952708676457405, | |
| "eval_runtime": 451.5837, | |
| "eval_samples_per_second": 7.467, | |
| "eval_steps_per_second": 3.734, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9241315668050887, | |
| "grad_norm": 0.016639264300465584, | |
| "learning_rate": 0.00010763953028103972, | |
| "loss": 0.0125, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 0.9254498714652957, | |
| "grad_norm": 0.46340492367744446, | |
| "learning_rate": 0.00010750758675286979, | |
| "loss": 0.0186, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.9267681761255026, | |
| "grad_norm": 0.01847526989877224, | |
| "learning_rate": 0.00010737564322469984, | |
| "loss": 0.0026, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 0.9280864807857095, | |
| "grad_norm": 0.5947860479354858, | |
| "learning_rate": 0.0001072436996965299, | |
| "loss": 0.0259, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.9294047854459165, | |
| "grad_norm": 0.06145291402935982, | |
| "learning_rate": 0.00010711175616835995, | |
| "loss": 0.0057, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 0.9307230901061235, | |
| "grad_norm": 0.0143959429115057, | |
| "learning_rate": 0.00010697981264019001, | |
| "loss": 0.0145, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.9320413947663305, | |
| "grad_norm": 0.21143831312656403, | |
| "learning_rate": 0.00010684786911202007, | |
| "loss": 0.0459, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 0.9333596994265375, | |
| "grad_norm": 0.02548077143728733, | |
| "learning_rate": 0.00010671592558385011, | |
| "loss": 0.0051, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9346780040867444, | |
| "grad_norm": 0.008077048696577549, | |
| "learning_rate": 0.00010658398205568016, | |
| "loss": 0.0306, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 0.9359963087469514, | |
| "grad_norm": 0.0030760422814637423, | |
| "learning_rate": 0.00010645203852751022, | |
| "loss": 0.0575, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9373146134071584, | |
| "grad_norm": 0.18114158511161804, | |
| "learning_rate": 0.00010632009499934027, | |
| "loss": 0.0885, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 0.9386329180673654, | |
| "grad_norm": 0.02450549602508545, | |
| "learning_rate": 0.00010618815147117034, | |
| "loss": 0.0045, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9399512227275724, | |
| "grad_norm": 0.1238626018166542, | |
| "learning_rate": 0.0001060562079430004, | |
| "loss": 0.0166, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 0.9412695273877794, | |
| "grad_norm": 0.1879919469356537, | |
| "learning_rate": 0.00010592426441483046, | |
| "loss": 0.0077, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9425878320479862, | |
| "grad_norm": 0.11323565989732742, | |
| "learning_rate": 0.00010579232088666051, | |
| "loss": 0.0213, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 0.9439061367081932, | |
| "grad_norm": 0.35575854778289795, | |
| "learning_rate": 0.00010566037735849057, | |
| "loss": 0.0336, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9452244413684002, | |
| "grad_norm": 0.14052227139472961, | |
| "learning_rate": 0.00010552843383032062, | |
| "loss": 0.0325, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 0.9465427460286072, | |
| "grad_norm": 0.2643798887729645, | |
| "learning_rate": 0.00010539649030215069, | |
| "loss": 0.0192, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.9478610506888142, | |
| "grad_norm": 0.3207031190395355, | |
| "learning_rate": 0.00010526454677398075, | |
| "loss": 0.0221, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 0.9491793553490212, | |
| "grad_norm": 0.022803861647844315, | |
| "learning_rate": 0.0001051326032458108, | |
| "loss": 0.029, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9504976600092281, | |
| "grad_norm": 0.02511664852499962, | |
| "learning_rate": 0.00010500065971764086, | |
| "loss": 0.0422, | |
| "step": 3605 | |
| }, | |
| { | |
| "epoch": 0.9518159646694351, | |
| "grad_norm": 0.06505445390939713, | |
| "learning_rate": 0.00010486871618947091, | |
| "loss": 0.0092, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9531342693296421, | |
| "grad_norm": 0.09998584538698196, | |
| "learning_rate": 0.00010473677266130097, | |
| "loss": 0.0242, | |
| "step": 3615 | |
| }, | |
| { | |
| "epoch": 0.9544525739898491, | |
| "grad_norm": 0.9645698666572571, | |
| "learning_rate": 0.00010460482913313104, | |
| "loss": 0.0124, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.955770878650056, | |
| "grad_norm": 0.2389964610338211, | |
| "learning_rate": 0.0001044728856049611, | |
| "loss": 0.0169, | |
| "step": 3625 | |
| }, | |
| { | |
| "epoch": 0.957089183310263, | |
| "grad_norm": 2.030608654022217, | |
| "learning_rate": 0.00010434094207679115, | |
| "loss": 0.0518, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.9584074879704699, | |
| "grad_norm": 0.05979987606406212, | |
| "learning_rate": 0.0001042089985486212, | |
| "loss": 0.0081, | |
| "step": 3635 | |
| }, | |
| { | |
| "epoch": 0.9597257926306769, | |
| "grad_norm": 0.15761719644069672, | |
| "learning_rate": 0.00010407705502045125, | |
| "loss": 0.0061, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.9610440972908839, | |
| "grad_norm": 0.6534290909767151, | |
| "learning_rate": 0.0001039451114922813, | |
| "loss": 0.0104, | |
| "step": 3645 | |
| }, | |
| { | |
| "epoch": 0.9623624019510909, | |
| "grad_norm": 1.0324147939682007, | |
| "learning_rate": 0.00010381316796411136, | |
| "loss": 0.0381, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.9636807066112979, | |
| "grad_norm": 0.002968872431665659, | |
| "learning_rate": 0.00010368122443594142, | |
| "loss": 0.0343, | |
| "step": 3655 | |
| }, | |
| { | |
| "epoch": 0.9649990112715049, | |
| "grad_norm": 0.011243184097111225, | |
| "learning_rate": 0.00010354928090777147, | |
| "loss": 0.019, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 0.9663173159317118, | |
| "grad_norm": 0.17663739621639252, | |
| "learning_rate": 0.00010341733737960153, | |
| "loss": 0.0452, | |
| "step": 3665 | |
| }, | |
| { | |
| "epoch": 0.9676356205919188, | |
| "grad_norm": 1.2647719383239746, | |
| "learning_rate": 0.00010328539385143158, | |
| "loss": 0.0154, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 0.9689539252521258, | |
| "grad_norm": 0.3691752552986145, | |
| "learning_rate": 0.00010315345032326165, | |
| "loss": 0.028, | |
| "step": 3675 | |
| }, | |
| { | |
| "epoch": 0.9702722299123328, | |
| "grad_norm": 0.0015879774000495672, | |
| "learning_rate": 0.00010302150679509171, | |
| "loss": 0.0202, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 0.9715905345725397, | |
| "grad_norm": 0.1441984623670578, | |
| "learning_rate": 0.00010288956326692176, | |
| "loss": 0.0221, | |
| "step": 3685 | |
| }, | |
| { | |
| "epoch": 0.9729088392327467, | |
| "grad_norm": 0.20431455969810486, | |
| "learning_rate": 0.00010275761973875182, | |
| "loss": 0.0072, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 0.9742271438929536, | |
| "grad_norm": 0.861625611782074, | |
| "learning_rate": 0.00010262567621058187, | |
| "loss": 0.0523, | |
| "step": 3695 | |
| }, | |
| { | |
| "epoch": 0.9755454485531606, | |
| "grad_norm": 0.005049478262662888, | |
| "learning_rate": 0.00010249373268241193, | |
| "loss": 0.0051, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9768637532133676, | |
| "grad_norm": 0.49685510993003845, | |
| "learning_rate": 0.000102361789154242, | |
| "loss": 0.023, | |
| "step": 3705 | |
| }, | |
| { | |
| "epoch": 0.9781820578735746, | |
| "grad_norm": 0.08789395540952682, | |
| "learning_rate": 0.00010222984562607205, | |
| "loss": 0.0159, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 0.9795003625337816, | |
| "grad_norm": 0.027168691158294678, | |
| "learning_rate": 0.00010209790209790211, | |
| "loss": 0.0083, | |
| "step": 3715 | |
| }, | |
| { | |
| "epoch": 0.9808186671939886, | |
| "grad_norm": 0.0006773864733986557, | |
| "learning_rate": 0.00010196595856973217, | |
| "loss": 0.0048, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 0.9821369718541955, | |
| "grad_norm": 0.01636457070708275, | |
| "learning_rate": 0.00010183401504156222, | |
| "loss": 0.0159, | |
| "step": 3725 | |
| }, | |
| { | |
| "epoch": 0.9834552765144025, | |
| "grad_norm": 0.10160859674215317, | |
| "learning_rate": 0.00010170207151339228, | |
| "loss": 0.0047, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 0.9847735811746094, | |
| "grad_norm": 0.14173269271850586, | |
| "learning_rate": 0.00010157012798522232, | |
| "loss": 0.006, | |
| "step": 3735 | |
| }, | |
| { | |
| "epoch": 0.9860918858348164, | |
| "grad_norm": 0.003458512481302023, | |
| "learning_rate": 0.00010143818445705238, | |
| "loss": 0.0193, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 0.9874101904950234, | |
| "grad_norm": 0.005163820460438728, | |
| "learning_rate": 0.00010130624092888243, | |
| "loss": 0.0039, | |
| "step": 3745 | |
| }, | |
| { | |
| "epoch": 0.9887284951552304, | |
| "grad_norm": 0.005913791712373495, | |
| "learning_rate": 0.00010117429740071249, | |
| "loss": 0.0119, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9900467998154373, | |
| "grad_norm": 0.00800853967666626, | |
| "learning_rate": 0.00010104235387254256, | |
| "loss": 0.044, | |
| "step": 3755 | |
| }, | |
| { | |
| "epoch": 0.9913651044756443, | |
| "grad_norm": 0.18146778643131256, | |
| "learning_rate": 0.00010091041034437261, | |
| "loss": 0.0048, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 0.9926834091358513, | |
| "grad_norm": 0.01235104724764824, | |
| "learning_rate": 0.00010077846681620267, | |
| "loss": 0.0017, | |
| "step": 3765 | |
| }, | |
| { | |
| "epoch": 0.9940017137960583, | |
| "grad_norm": 0.17677897214889526, | |
| "learning_rate": 0.00010064652328803272, | |
| "loss": 0.0339, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 0.9953200184562653, | |
| "grad_norm": 0.0017472271574661136, | |
| "learning_rate": 0.00010051457975986278, | |
| "loss": 0.0494, | |
| "step": 3775 | |
| }, | |
| { | |
| "epoch": 0.9966383231164723, | |
| "grad_norm": 0.10814860463142395, | |
| "learning_rate": 0.00010038263623169283, | |
| "loss": 0.0741, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 0.9979566277766792, | |
| "grad_norm": 0.11329760402441025, | |
| "learning_rate": 0.0001002506927035229, | |
| "loss": 0.0182, | |
| "step": 3785 | |
| }, | |
| { | |
| "epoch": 0.9992749324368861, | |
| "grad_norm": 0.11573276668787003, | |
| "learning_rate": 0.00010011874917535296, | |
| "loss": 0.0068, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 1.000790982796124, | |
| "grad_norm": 0.08449886739253998, | |
| "learning_rate": 9.998680564718301e-05, | |
| "loss": 0.0141, | |
| "step": 3795 | |
| }, | |
| { | |
| "epoch": 1.002109287456331, | |
| "grad_norm": 0.05035184696316719, | |
| "learning_rate": 9.985486211901307e-05, | |
| "loss": 0.0293, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.003427592116538, | |
| "grad_norm": 0.0255444198846817, | |
| "learning_rate": 9.972291859084313e-05, | |
| "loss": 0.0054, | |
| "step": 3805 | |
| }, | |
| { | |
| "epoch": 1.004745896776745, | |
| "grad_norm": 0.0033677336759865284, | |
| "learning_rate": 9.959097506267318e-05, | |
| "loss": 0.0567, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 1.006064201436952, | |
| "grad_norm": 0.09453682601451874, | |
| "learning_rate": 9.945903153450324e-05, | |
| "loss": 0.0589, | |
| "step": 3815 | |
| }, | |
| { | |
| "epoch": 1.007382506097159, | |
| "grad_norm": 0.01592979207634926, | |
| "learning_rate": 9.932708800633329e-05, | |
| "loss": 0.0043, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 1.008700810757366, | |
| "grad_norm": 0.002263693604618311, | |
| "learning_rate": 9.919514447816335e-05, | |
| "loss": 0.0195, | |
| "step": 3825 | |
| }, | |
| { | |
| "epoch": 1.010019115417573, | |
| "grad_norm": 0.013390793465077877, | |
| "learning_rate": 9.90632009499934e-05, | |
| "loss": 0.0152, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 1.01133742007778, | |
| "grad_norm": 0.10473847389221191, | |
| "learning_rate": 9.893125742182346e-05, | |
| "loss": 0.0606, | |
| "step": 3835 | |
| }, | |
| { | |
| "epoch": 1.012655724737987, | |
| "grad_norm": 0.05837221071124077, | |
| "learning_rate": 9.879931389365353e-05, | |
| "loss": 0.0121, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 1.013974029398194, | |
| "grad_norm": 0.3803791105747223, | |
| "learning_rate": 9.866737036548358e-05, | |
| "loss": 0.0386, | |
| "step": 3845 | |
| }, | |
| { | |
| "epoch": 1.0152923340584008, | |
| "grad_norm": 0.4067519009113312, | |
| "learning_rate": 9.853542683731364e-05, | |
| "loss": 0.0115, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 1.0166106387186078, | |
| "grad_norm": 0.02585229091346264, | |
| "learning_rate": 9.84034833091437e-05, | |
| "loss": 0.0214, | |
| "step": 3855 | |
| }, | |
| { | |
| "epoch": 1.0179289433788148, | |
| "grad_norm": 0.03670825809240341, | |
| "learning_rate": 9.827153978097374e-05, | |
| "loss": 0.0059, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 1.0192472480390218, | |
| "grad_norm": 0.014171554706990719, | |
| "learning_rate": 9.81395962528038e-05, | |
| "loss": 0.0145, | |
| "step": 3865 | |
| }, | |
| { | |
| "epoch": 1.0205655526992288, | |
| "grad_norm": 0.027376385405659676, | |
| "learning_rate": 9.800765272463386e-05, | |
| "loss": 0.0089, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 1.0218838573594358, | |
| "grad_norm": 0.03168405964970589, | |
| "learning_rate": 9.787570919646392e-05, | |
| "loss": 0.0132, | |
| "step": 3875 | |
| }, | |
| { | |
| "epoch": 1.0232021620196428, | |
| "grad_norm": 0.03346199914813042, | |
| "learning_rate": 9.774376566829397e-05, | |
| "loss": 0.0246, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 1.0245204666798498, | |
| "grad_norm": 0.00894144270569086, | |
| "learning_rate": 9.761182214012403e-05, | |
| "loss": 0.0105, | |
| "step": 3885 | |
| }, | |
| { | |
| "epoch": 1.0258387713400567, | |
| "grad_norm": 0.3172806203365326, | |
| "learning_rate": 9.747987861195409e-05, | |
| "loss": 0.0103, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 1.0271570760002637, | |
| "grad_norm": 0.009055040776729584, | |
| "learning_rate": 9.734793508378414e-05, | |
| "loss": 0.0103, | |
| "step": 3895 | |
| }, | |
| { | |
| "epoch": 1.0284753806604707, | |
| "grad_norm": 0.014140011742711067, | |
| "learning_rate": 9.721599155561421e-05, | |
| "loss": 0.0037, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.0297936853206777, | |
| "grad_norm": 0.008317383006215096, | |
| "learning_rate": 9.708404802744427e-05, | |
| "loss": 0.002, | |
| "step": 3905 | |
| }, | |
| { | |
| "epoch": 1.0311119899808845, | |
| "grad_norm": 0.005038558971136808, | |
| "learning_rate": 9.695210449927431e-05, | |
| "loss": 0.0017, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 1.0324302946410915, | |
| "grad_norm": 0.40058520436286926, | |
| "learning_rate": 9.682016097110436e-05, | |
| "loss": 0.0065, | |
| "step": 3915 | |
| }, | |
| { | |
| "epoch": 1.0337485993012985, | |
| "grad_norm": 0.005197151098400354, | |
| "learning_rate": 9.668821744293442e-05, | |
| "loss": 0.0031, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 1.0350669039615055, | |
| "grad_norm": 0.014353781007230282, | |
| "learning_rate": 9.655627391476449e-05, | |
| "loss": 0.0009, | |
| "step": 3925 | |
| }, | |
| { | |
| "epoch": 1.0363852086217125, | |
| "grad_norm": 0.13260559737682343, | |
| "learning_rate": 9.642433038659454e-05, | |
| "loss": 0.0323, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 1.0377035132819195, | |
| "grad_norm": 0.006795065477490425, | |
| "learning_rate": 9.62923868584246e-05, | |
| "loss": 0.0022, | |
| "step": 3935 | |
| }, | |
| { | |
| "epoch": 1.0390218179421264, | |
| "grad_norm": 0.2276086062192917, | |
| "learning_rate": 9.616044333025466e-05, | |
| "loss": 0.0221, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 1.0403401226023334, | |
| "grad_norm": 0.06121920794248581, | |
| "learning_rate": 9.602849980208471e-05, | |
| "loss": 0.0037, | |
| "step": 3945 | |
| }, | |
| { | |
| "epoch": 1.0416584272625404, | |
| "grad_norm": 0.9180755019187927, | |
| "learning_rate": 9.589655627391477e-05, | |
| "loss": 0.0589, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 1.0429767319227474, | |
| "grad_norm": 0.07515591382980347, | |
| "learning_rate": 9.576461274574484e-05, | |
| "loss": 0.0653, | |
| "step": 3955 | |
| }, | |
| { | |
| "epoch": 1.0442950365829544, | |
| "grad_norm": 0.018060607835650444, | |
| "learning_rate": 9.563266921757488e-05, | |
| "loss": 0.0178, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 1.0456133412431612, | |
| "grad_norm": 0.02751368284225464, | |
| "learning_rate": 9.550072568940493e-05, | |
| "loss": 0.0076, | |
| "step": 3965 | |
| }, | |
| { | |
| "epoch": 1.0469316459033682, | |
| "grad_norm": 0.653998613357544, | |
| "learning_rate": 9.536878216123499e-05, | |
| "loss": 0.0066, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 1.0482499505635752, | |
| "grad_norm": 0.3117768168449402, | |
| "learning_rate": 9.523683863306505e-05, | |
| "loss": 0.0087, | |
| "step": 3975 | |
| }, | |
| { | |
| "epoch": 1.0495682552237822, | |
| "grad_norm": 0.013952831737697124, | |
| "learning_rate": 9.510489510489511e-05, | |
| "loss": 0.0037, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 1.0508865598839892, | |
| "grad_norm": 0.01806250400841236, | |
| "learning_rate": 9.497295157672517e-05, | |
| "loss": 0.0028, | |
| "step": 3985 | |
| }, | |
| { | |
| "epoch": 1.0522048645441962, | |
| "grad_norm": 0.13678006827831268, | |
| "learning_rate": 9.484100804855523e-05, | |
| "loss": 0.0533, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 1.0535231692044031, | |
| "grad_norm": 0.14869382977485657, | |
| "learning_rate": 9.470906452038528e-05, | |
| "loss": 0.009, | |
| "step": 3995 | |
| }, | |
| { | |
| "epoch": 1.0548414738646101, | |
| "grad_norm": 0.33614659309387207, | |
| "learning_rate": 9.457712099221534e-05, | |
| "loss": 0.0555, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0548414738646101, | |
| "eval_loss": 0.026165226474404335, | |
| "eval_runtime": 452.2482, | |
| "eval_samples_per_second": 7.456, | |
| "eval_steps_per_second": 3.728, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.0561597785248171, | |
| "grad_norm": 0.007546027656644583, | |
| "learning_rate": 9.444517746404539e-05, | |
| "loss": 0.0029, | |
| "step": 4005 | |
| }, | |
| { | |
| "epoch": 1.0574780831850241, | |
| "grad_norm": 0.3720332384109497, | |
| "learning_rate": 9.431323393587545e-05, | |
| "loss": 0.0353, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 1.0587963878452311, | |
| "grad_norm": 1.1335264444351196, | |
| "learning_rate": 9.41812904077055e-05, | |
| "loss": 0.0142, | |
| "step": 4015 | |
| }, | |
| { | |
| "epoch": 1.060114692505438, | |
| "grad_norm": 0.024723488837480545, | |
| "learning_rate": 9.404934687953556e-05, | |
| "loss": 0.006, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 1.0614329971656449, | |
| "grad_norm": 0.040354058146476746, | |
| "learning_rate": 9.391740335136562e-05, | |
| "loss": 0.0107, | |
| "step": 4025 | |
| }, | |
| { | |
| "epoch": 1.0627513018258519, | |
| "grad_norm": 0.222810298204422, | |
| "learning_rate": 9.378545982319567e-05, | |
| "loss": 0.0273, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 1.0640696064860589, | |
| "grad_norm": 0.025684095919132233, | |
| "learning_rate": 9.365351629502574e-05, | |
| "loss": 0.0033, | |
| "step": 4035 | |
| }, | |
| { | |
| "epoch": 1.0653879111462659, | |
| "grad_norm": 0.05338352546095848, | |
| "learning_rate": 9.35215727668558e-05, | |
| "loss": 0.0052, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 1.0667062158064728, | |
| "grad_norm": 0.06182330474257469, | |
| "learning_rate": 9.338962923868585e-05, | |
| "loss": 0.0038, | |
| "step": 4045 | |
| }, | |
| { | |
| "epoch": 1.0680245204666798, | |
| "grad_norm": 0.012170832604169846, | |
| "learning_rate": 9.325768571051591e-05, | |
| "loss": 0.0018, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 1.0693428251268868, | |
| "grad_norm": 0.5424306392669678, | |
| "learning_rate": 9.312574218234596e-05, | |
| "loss": 0.0445, | |
| "step": 4055 | |
| }, | |
| { | |
| "epoch": 1.0706611297870938, | |
| "grad_norm": 0.017939254641532898, | |
| "learning_rate": 9.299379865417602e-05, | |
| "loss": 0.0389, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 1.0719794344473008, | |
| "grad_norm": 0.0060431682504713535, | |
| "learning_rate": 9.286185512600607e-05, | |
| "loss": 0.0025, | |
| "step": 4065 | |
| }, | |
| { | |
| "epoch": 1.0732977391075078, | |
| "grad_norm": 0.0071444883942604065, | |
| "learning_rate": 9.272991159783613e-05, | |
| "loss": 0.0333, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 1.0746160437677148, | |
| "grad_norm": 0.29632750153541565, | |
| "learning_rate": 9.259796806966619e-05, | |
| "loss": 0.0151, | |
| "step": 4075 | |
| }, | |
| { | |
| "epoch": 1.0759343484279218, | |
| "grad_norm": 0.004526323173195124, | |
| "learning_rate": 9.246602454149624e-05, | |
| "loss": 0.006, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 1.0772526530881286, | |
| "grad_norm": 0.023945212364196777, | |
| "learning_rate": 9.23340810133263e-05, | |
| "loss": 0.004, | |
| "step": 4085 | |
| }, | |
| { | |
| "epoch": 1.0785709577483356, | |
| "grad_norm": 0.13235126435756683, | |
| "learning_rate": 9.220213748515635e-05, | |
| "loss": 0.0059, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 1.0798892624085425, | |
| "grad_norm": 0.17592330276966095, | |
| "learning_rate": 9.207019395698642e-05, | |
| "loss": 0.0302, | |
| "step": 4095 | |
| }, | |
| { | |
| "epoch": 1.0812075670687495, | |
| "grad_norm": 0.004582866560667753, | |
| "learning_rate": 9.193825042881648e-05, | |
| "loss": 0.009, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.0825258717289565, | |
| "grad_norm": 0.15214525163173676, | |
| "learning_rate": 9.180630690064653e-05, | |
| "loss": 0.0062, | |
| "step": 4105 | |
| }, | |
| { | |
| "epoch": 1.0838441763891635, | |
| "grad_norm": 0.16535983979701996, | |
| "learning_rate": 9.167436337247658e-05, | |
| "loss": 0.0926, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 1.0851624810493705, | |
| "grad_norm": 0.013285227119922638, | |
| "learning_rate": 9.154241984430663e-05, | |
| "loss": 0.0043, | |
| "step": 4115 | |
| }, | |
| { | |
| "epoch": 1.0864807857095775, | |
| "grad_norm": 0.012116984464228153, | |
| "learning_rate": 9.14104763161367e-05, | |
| "loss": 0.0037, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 1.0877990903697845, | |
| "grad_norm": 0.0373845212161541, | |
| "learning_rate": 9.127853278796676e-05, | |
| "loss": 0.0081, | |
| "step": 4125 | |
| }, | |
| { | |
| "epoch": 1.0891173950299915, | |
| "grad_norm": 0.09324615448713303, | |
| "learning_rate": 9.114658925979681e-05, | |
| "loss": 0.0534, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 1.0904356996901985, | |
| "grad_norm": 0.010992968454957008, | |
| "learning_rate": 9.101464573162687e-05, | |
| "loss": 0.0025, | |
| "step": 4135 | |
| }, | |
| { | |
| "epoch": 1.0917540043504055, | |
| "grad_norm": 0.13710318505764008, | |
| "learning_rate": 9.088270220345692e-05, | |
| "loss": 0.0555, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 1.0930723090106123, | |
| "grad_norm": 0.010403074324131012, | |
| "learning_rate": 9.075075867528698e-05, | |
| "loss": 0.0042, | |
| "step": 4145 | |
| }, | |
| { | |
| "epoch": 1.0943906136708192, | |
| "grad_norm": 0.21544460952281952, | |
| "learning_rate": 9.061881514711705e-05, | |
| "loss": 0.0144, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.0957089183310262, | |
| "grad_norm": 0.04194799065589905, | |
| "learning_rate": 9.04868716189471e-05, | |
| "loss": 0.0106, | |
| "step": 4155 | |
| }, | |
| { | |
| "epoch": 1.0970272229912332, | |
| "grad_norm": 0.029204202815890312, | |
| "learning_rate": 9.035492809077715e-05, | |
| "loss": 0.0085, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 1.0983455276514402, | |
| "grad_norm": 0.006751026958227158, | |
| "learning_rate": 9.02229845626072e-05, | |
| "loss": 0.0049, | |
| "step": 4165 | |
| }, | |
| { | |
| "epoch": 1.0996638323116472, | |
| "grad_norm": 0.008232722990214825, | |
| "learning_rate": 9.009104103443726e-05, | |
| "loss": 0.0172, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 1.1009821369718542, | |
| "grad_norm": 0.05630079656839371, | |
| "learning_rate": 8.995909750626733e-05, | |
| "loss": 0.0112, | |
| "step": 4175 | |
| }, | |
| { | |
| "epoch": 1.1023004416320612, | |
| "grad_norm": 0.0011601662263274193, | |
| "learning_rate": 8.982715397809738e-05, | |
| "loss": 0.0317, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 1.1036187462922682, | |
| "grad_norm": 0.006554402410984039, | |
| "learning_rate": 8.969521044992744e-05, | |
| "loss": 0.0035, | |
| "step": 4185 | |
| }, | |
| { | |
| "epoch": 1.1049370509524752, | |
| "grad_norm": 0.34513652324676514, | |
| "learning_rate": 8.956326692175749e-05, | |
| "loss": 0.0036, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 1.1062553556126822, | |
| "grad_norm": 0.283669650554657, | |
| "learning_rate": 8.943132339358755e-05, | |
| "loss": 0.0182, | |
| "step": 4195 | |
| }, | |
| { | |
| "epoch": 1.1075736602728892, | |
| "grad_norm": 0.5376952290534973, | |
| "learning_rate": 8.92993798654176e-05, | |
| "loss": 0.0293, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.108891964933096, | |
| "grad_norm": 0.01689724065363407, | |
| "learning_rate": 8.916743633724767e-05, | |
| "loss": 0.0206, | |
| "step": 4205 | |
| }, | |
| { | |
| "epoch": 1.110210269593303, | |
| "grad_norm": 0.026538770645856857, | |
| "learning_rate": 8.903549280907772e-05, | |
| "loss": 0.0181, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 1.11152857425351, | |
| "grad_norm": 0.6372873783111572, | |
| "learning_rate": 8.890354928090777e-05, | |
| "loss": 0.021, | |
| "step": 4215 | |
| }, | |
| { | |
| "epoch": 1.112846878913717, | |
| "grad_norm": 0.06177428737282753, | |
| "learning_rate": 8.877160575273783e-05, | |
| "loss": 0.0033, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 1.114165183573924, | |
| "grad_norm": 0.3712109923362732, | |
| "learning_rate": 8.863966222456788e-05, | |
| "loss": 0.0075, | |
| "step": 4225 | |
| }, | |
| { | |
| "epoch": 1.115483488234131, | |
| "grad_norm": 0.030514653772115707, | |
| "learning_rate": 8.850771869639795e-05, | |
| "loss": 0.0183, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 1.116801792894338, | |
| "grad_norm": 0.012861707247793674, | |
| "learning_rate": 8.837577516822801e-05, | |
| "loss": 0.0032, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 1.118120097554545, | |
| "grad_norm": 0.3278522789478302, | |
| "learning_rate": 8.824383164005806e-05, | |
| "loss": 0.0058, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 1.1194384022147519, | |
| "grad_norm": 0.580259382724762, | |
| "learning_rate": 8.811188811188812e-05, | |
| "loss": 0.0068, | |
| "step": 4245 | |
| }, | |
| { | |
| "epoch": 1.1207567068749589, | |
| "grad_norm": 0.007002575788646936, | |
| "learning_rate": 8.797994458371817e-05, | |
| "loss": 0.0063, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.1220750115351659, | |
| "grad_norm": 0.22484643757343292, | |
| "learning_rate": 8.784800105554823e-05, | |
| "loss": 0.0167, | |
| "step": 4255 | |
| }, | |
| { | |
| "epoch": 1.1233933161953726, | |
| "grad_norm": 0.004122686106711626, | |
| "learning_rate": 8.771605752737829e-05, | |
| "loss": 0.002, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 1.1247116208555796, | |
| "grad_norm": 0.009832561016082764, | |
| "learning_rate": 8.758411399920834e-05, | |
| "loss": 0.0029, | |
| "step": 4265 | |
| }, | |
| { | |
| "epoch": 1.1260299255157866, | |
| "grad_norm": 0.04854527860879898, | |
| "learning_rate": 8.74521704710384e-05, | |
| "loss": 0.0068, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 1.1273482301759936, | |
| "grad_norm": 0.12221235036849976, | |
| "learning_rate": 8.732022694286845e-05, | |
| "loss": 0.003, | |
| "step": 4275 | |
| }, | |
| { | |
| "epoch": 1.1286665348362006, | |
| "grad_norm": 0.005857539363205433, | |
| "learning_rate": 8.718828341469851e-05, | |
| "loss": 0.0022, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 1.1299848394964076, | |
| "grad_norm": 0.10582758486270905, | |
| "learning_rate": 8.705633988652856e-05, | |
| "loss": 0.002, | |
| "step": 4285 | |
| }, | |
| { | |
| "epoch": 1.1313031441566146, | |
| "grad_norm": 0.006190940272063017, | |
| "learning_rate": 8.692439635835863e-05, | |
| "loss": 0.0022, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 1.1326214488168216, | |
| "grad_norm": 0.00221514655277133, | |
| "learning_rate": 8.679245283018869e-05, | |
| "loss": 0.0314, | |
| "step": 4295 | |
| }, | |
| { | |
| "epoch": 1.1339397534770286, | |
| "grad_norm": 0.0796755850315094, | |
| "learning_rate": 8.666050930201874e-05, | |
| "loss": 0.0347, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.1352580581372356, | |
| "grad_norm": 0.20088806748390198, | |
| "learning_rate": 8.65285657738488e-05, | |
| "loss": 0.0048, | |
| "step": 4305 | |
| }, | |
| { | |
| "epoch": 1.1365763627974426, | |
| "grad_norm": 0.4018377363681793, | |
| "learning_rate": 8.639662224567884e-05, | |
| "loss": 0.0234, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 1.1378946674576496, | |
| "grad_norm": 0.014961684122681618, | |
| "learning_rate": 8.626467871750891e-05, | |
| "loss": 0.0033, | |
| "step": 4315 | |
| }, | |
| { | |
| "epoch": 1.1392129721178565, | |
| "grad_norm": 0.004534922540187836, | |
| "learning_rate": 8.613273518933897e-05, | |
| "loss": 0.0021, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 1.1405312767780633, | |
| "grad_norm": 0.06340984255075455, | |
| "learning_rate": 8.600079166116902e-05, | |
| "loss": 0.0538, | |
| "step": 4325 | |
| }, | |
| { | |
| "epoch": 1.1418495814382703, | |
| "grad_norm": 0.007374623324722052, | |
| "learning_rate": 8.586884813299908e-05, | |
| "loss": 0.0157, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 1.1431678860984773, | |
| "grad_norm": 0.02313193492591381, | |
| "learning_rate": 8.573690460482913e-05, | |
| "loss": 0.0307, | |
| "step": 4335 | |
| }, | |
| { | |
| "epoch": 1.1444861907586843, | |
| "grad_norm": 0.014071634039282799, | |
| "learning_rate": 8.560496107665919e-05, | |
| "loss": 0.0058, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 1.1458044954188913, | |
| "grad_norm": 1.4664901494979858, | |
| "learning_rate": 8.547301754848926e-05, | |
| "loss": 0.0566, | |
| "step": 4345 | |
| }, | |
| { | |
| "epoch": 1.1471228000790983, | |
| "grad_norm": 0.023680074140429497, | |
| "learning_rate": 8.534107402031931e-05, | |
| "loss": 0.0048, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.1484411047393053, | |
| "grad_norm": 0.012555698864161968, | |
| "learning_rate": 8.520913049214937e-05, | |
| "loss": 0.0076, | |
| "step": 4355 | |
| }, | |
| { | |
| "epoch": 1.1497594093995123, | |
| "grad_norm": 0.013624129816889763, | |
| "learning_rate": 8.507718696397941e-05, | |
| "loss": 0.0373, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 1.1510777140597193, | |
| "grad_norm": 0.015372387133538723, | |
| "learning_rate": 8.494524343580947e-05, | |
| "loss": 0.0147, | |
| "step": 4365 | |
| }, | |
| { | |
| "epoch": 1.1523960187199263, | |
| "grad_norm": 0.3312993347644806, | |
| "learning_rate": 8.481329990763954e-05, | |
| "loss": 0.0299, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 1.1537143233801332, | |
| "grad_norm": 0.023838184773921967, | |
| "learning_rate": 8.468135637946959e-05, | |
| "loss": 0.0226, | |
| "step": 4375 | |
| }, | |
| { | |
| "epoch": 1.15503262804034, | |
| "grad_norm": 0.42516952753067017, | |
| "learning_rate": 8.454941285129965e-05, | |
| "loss": 0.0088, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 1.156350932700547, | |
| "grad_norm": 0.6900278925895691, | |
| "learning_rate": 8.44174693231297e-05, | |
| "loss": 0.0245, | |
| "step": 4385 | |
| }, | |
| { | |
| "epoch": 1.157669237360754, | |
| "grad_norm": 0.2932703197002411, | |
| "learning_rate": 8.428552579495976e-05, | |
| "loss": 0.0207, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 1.158987542020961, | |
| "grad_norm": 0.12942780554294586, | |
| "learning_rate": 8.415358226678982e-05, | |
| "loss": 0.0037, | |
| "step": 4395 | |
| }, | |
| { | |
| "epoch": 1.160305846681168, | |
| "grad_norm": 0.9499046802520752, | |
| "learning_rate": 8.402163873861989e-05, | |
| "loss": 0.0246, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.161624151341375, | |
| "grad_norm": 0.008869118988513947, | |
| "learning_rate": 8.388969521044994e-05, | |
| "loss": 0.0171, | |
| "step": 4405 | |
| }, | |
| { | |
| "epoch": 1.162942456001582, | |
| "grad_norm": 1.7409231662750244, | |
| "learning_rate": 8.375775168227998e-05, | |
| "loss": 0.017, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 1.164260760661789, | |
| "grad_norm": 0.0020101398695260286, | |
| "learning_rate": 8.362580815411004e-05, | |
| "loss": 0.0027, | |
| "step": 4415 | |
| }, | |
| { | |
| "epoch": 1.165579065321996, | |
| "grad_norm": 0.0785067081451416, | |
| "learning_rate": 8.34938646259401e-05, | |
| "loss": 0.0043, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 1.166897369982203, | |
| "grad_norm": 0.0029506285209208727, | |
| "learning_rate": 8.336192109777016e-05, | |
| "loss": 0.0109, | |
| "step": 4425 | |
| }, | |
| { | |
| "epoch": 1.16821567464241, | |
| "grad_norm": 0.02216683328151703, | |
| "learning_rate": 8.322997756960022e-05, | |
| "loss": 0.0026, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 1.1695339793026167, | |
| "grad_norm": 0.02216639369726181, | |
| "learning_rate": 8.309803404143027e-05, | |
| "loss": 0.0045, | |
| "step": 4435 | |
| }, | |
| { | |
| "epoch": 1.170852283962824, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.296609051326033e-05, | |
| "loss": 0.006, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 1.1721705886230307, | |
| "grad_norm": 0.0019736960530281067, | |
| "learning_rate": 8.283414698509039e-05, | |
| "loss": 0.0078, | |
| "step": 4445 | |
| }, | |
| { | |
| "epoch": 1.1734888932832377, | |
| "grad_norm": 0.012957746163010597, | |
| "learning_rate": 8.270220345692044e-05, | |
| "loss": 0.002, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.1748071979434447, | |
| "grad_norm": 0.010877869091928005, | |
| "learning_rate": 8.25702599287505e-05, | |
| "loss": 0.0237, | |
| "step": 4455 | |
| }, | |
| { | |
| "epoch": 1.1761255026036517, | |
| "grad_norm": 0.005947659723460674, | |
| "learning_rate": 8.243831640058055e-05, | |
| "loss": 0.0341, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 1.1774438072638587, | |
| "grad_norm": 0.0005026470171287656, | |
| "learning_rate": 8.230637287241061e-05, | |
| "loss": 0.0033, | |
| "step": 4465 | |
| }, | |
| { | |
| "epoch": 1.1787621119240657, | |
| "grad_norm": 0.022054588422179222, | |
| "learning_rate": 8.217442934424066e-05, | |
| "loss": 0.0042, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 1.1800804165842727, | |
| "grad_norm": 0.7929030656814575, | |
| "learning_rate": 8.204248581607072e-05, | |
| "loss": 0.0076, | |
| "step": 4475 | |
| }, | |
| { | |
| "epoch": 1.1813987212444796, | |
| "grad_norm": 0.39052629470825195, | |
| "learning_rate": 8.191054228790078e-05, | |
| "loss": 0.0228, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 1.1827170259046866, | |
| "grad_norm": 0.007177622988820076, | |
| "learning_rate": 8.177859875973084e-05, | |
| "loss": 0.01, | |
| "step": 4485 | |
| }, | |
| { | |
| "epoch": 1.1840353305648936, | |
| "grad_norm": 0.006175135262310505, | |
| "learning_rate": 8.16466552315609e-05, | |
| "loss": 0.0037, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 1.1853536352251006, | |
| "grad_norm": 0.0356481671333313, | |
| "learning_rate": 8.151471170339096e-05, | |
| "loss": 0.0024, | |
| "step": 4495 | |
| }, | |
| { | |
| "epoch": 1.1866719398853074, | |
| "grad_norm": 0.19069480895996094, | |
| "learning_rate": 8.138276817522101e-05, | |
| "loss": 0.0048, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1866719398853074, | |
| "eval_loss": 0.026386437937617302, | |
| "eval_runtime": 452.2896, | |
| "eval_samples_per_second": 7.455, | |
| "eval_steps_per_second": 3.728, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1879902445455144, | |
| "grad_norm": 0.002254961524158716, | |
| "learning_rate": 8.125082464705107e-05, | |
| "loss": 0.0014, | |
| "step": 4505 | |
| }, | |
| { | |
| "epoch": 1.1893085492057214, | |
| "grad_norm": 0.8026870489120483, | |
| "learning_rate": 8.111888111888112e-05, | |
| "loss": 0.0411, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 1.1906268538659284, | |
| "grad_norm": 0.47328072786331177, | |
| "learning_rate": 8.098693759071118e-05, | |
| "loss": 0.0271, | |
| "step": 4515 | |
| }, | |
| { | |
| "epoch": 1.1919451585261354, | |
| "grad_norm": 0.4888288676738739, | |
| "learning_rate": 8.085499406254123e-05, | |
| "loss": 0.039, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 1.1932634631863424, | |
| "grad_norm": 0.000925812462810427, | |
| "learning_rate": 8.072305053437129e-05, | |
| "loss": 0.0461, | |
| "step": 4525 | |
| }, | |
| { | |
| "epoch": 1.1945817678465493, | |
| "grad_norm": 0.12472371757030487, | |
| "learning_rate": 8.059110700620135e-05, | |
| "loss": 0.0037, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 1.1959000725067563, | |
| "grad_norm": 0.002875336678698659, | |
| "learning_rate": 8.04591634780314e-05, | |
| "loss": 0.0425, | |
| "step": 4535 | |
| }, | |
| { | |
| "epoch": 1.1972183771669633, | |
| "grad_norm": 0.042056187987327576, | |
| "learning_rate": 8.032721994986147e-05, | |
| "loss": 0.0068, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 1.1985366818271703, | |
| "grad_norm": 0.157605841755867, | |
| "learning_rate": 8.019527642169153e-05, | |
| "loss": 0.0179, | |
| "step": 4545 | |
| }, | |
| { | |
| "epoch": 1.1998549864873773, | |
| "grad_norm": 0.005153563339263201, | |
| "learning_rate": 8.006333289352158e-05, | |
| "loss": 0.0045, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.201173291147584, | |
| "grad_norm": 0.02541598491370678, | |
| "learning_rate": 7.993138936535164e-05, | |
| "loss": 0.0041, | |
| "step": 4555 | |
| }, | |
| { | |
| "epoch": 1.2024915958077913, | |
| "grad_norm": 0.04266195371747017, | |
| "learning_rate": 7.979944583718168e-05, | |
| "loss": 0.0121, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 1.203809900467998, | |
| "grad_norm": 0.36108532547950745, | |
| "learning_rate": 7.966750230901175e-05, | |
| "loss": 0.0147, | |
| "step": 4565 | |
| }, | |
| { | |
| "epoch": 1.205128205128205, | |
| "grad_norm": 0.40405452251434326, | |
| "learning_rate": 7.95355587808418e-05, | |
| "loss": 0.0056, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 1.206446509788412, | |
| "grad_norm": 0.030422702431678772, | |
| "learning_rate": 7.940361525267186e-05, | |
| "loss": 0.0055, | |
| "step": 4575 | |
| }, | |
| { | |
| "epoch": 1.207764814448619, | |
| "grad_norm": 0.014555396512150764, | |
| "learning_rate": 7.927167172450192e-05, | |
| "loss": 0.0029, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 1.209083119108826, | |
| "grad_norm": 0.33962950110435486, | |
| "learning_rate": 7.913972819633197e-05, | |
| "loss": 0.0191, | |
| "step": 4585 | |
| }, | |
| { | |
| "epoch": 1.210401423769033, | |
| "grad_norm": 0.040150560438632965, | |
| "learning_rate": 7.900778466816203e-05, | |
| "loss": 0.0096, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 1.21171972842924, | |
| "grad_norm": 0.2968510091304779, | |
| "learning_rate": 7.88758411399921e-05, | |
| "loss": 0.0311, | |
| "step": 4595 | |
| }, | |
| { | |
| "epoch": 1.213038033089447, | |
| "grad_norm": 0.04709814116358757, | |
| "learning_rate": 7.874389761182215e-05, | |
| "loss": 0.0175, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.214356337749654, | |
| "grad_norm": 0.1379537284374237, | |
| "learning_rate": 7.861195408365221e-05, | |
| "loss": 0.02, | |
| "step": 4605 | |
| }, | |
| { | |
| "epoch": 1.215674642409861, | |
| "grad_norm": 0.018291711807250977, | |
| "learning_rate": 7.848001055548225e-05, | |
| "loss": 0.003, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 1.216992947070068, | |
| "grad_norm": 0.041676126420497894, | |
| "learning_rate": 7.83480670273123e-05, | |
| "loss": 0.0054, | |
| "step": 4615 | |
| }, | |
| { | |
| "epoch": 1.2183112517302748, | |
| "grad_norm": 0.0013747498160228133, | |
| "learning_rate": 7.821612349914237e-05, | |
| "loss": 0.0132, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 1.2196295563904818, | |
| "grad_norm": 0.0050489697605371475, | |
| "learning_rate": 7.808417997097243e-05, | |
| "loss": 0.0272, | |
| "step": 4625 | |
| }, | |
| { | |
| "epoch": 1.2209478610506888, | |
| "grad_norm": 0.017974581569433212, | |
| "learning_rate": 7.795223644280249e-05, | |
| "loss": 0.0037, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 1.2222661657108957, | |
| "grad_norm": 0.001916698063723743, | |
| "learning_rate": 7.782029291463254e-05, | |
| "loss": 0.002, | |
| "step": 4635 | |
| }, | |
| { | |
| "epoch": 1.2235844703711027, | |
| "grad_norm": 0.05344574153423309, | |
| "learning_rate": 7.76883493864626e-05, | |
| "loss": 0.0114, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 1.2249027750313097, | |
| "grad_norm": 0.22823786735534668, | |
| "learning_rate": 7.755640585829265e-05, | |
| "loss": 0.0296, | |
| "step": 4645 | |
| }, | |
| { | |
| "epoch": 1.2262210796915167, | |
| "grad_norm": 0.02051074244081974, | |
| "learning_rate": 7.742446233012272e-05, | |
| "loss": 0.0037, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.2275393843517237, | |
| "grad_norm": 0.9797061681747437, | |
| "learning_rate": 7.729251880195276e-05, | |
| "loss": 0.011, | |
| "step": 4655 | |
| }, | |
| { | |
| "epoch": 1.2288576890119307, | |
| "grad_norm": 0.0017285927897319198, | |
| "learning_rate": 7.716057527378282e-05, | |
| "loss": 0.0224, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 1.2301759936721377, | |
| "grad_norm": 0.021783018484711647, | |
| "learning_rate": 7.702863174561288e-05, | |
| "loss": 0.0174, | |
| "step": 4665 | |
| }, | |
| { | |
| "epoch": 1.2314942983323447, | |
| "grad_norm": 0.00763307698071003, | |
| "learning_rate": 7.689668821744293e-05, | |
| "loss": 0.0516, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 1.2328126029925515, | |
| "grad_norm": 0.32605209946632385, | |
| "learning_rate": 7.676474468927299e-05, | |
| "loss": 0.0301, | |
| "step": 4675 | |
| }, | |
| { | |
| "epoch": 1.2341309076527585, | |
| "grad_norm": 1.2027722597122192, | |
| "learning_rate": 7.663280116110306e-05, | |
| "loss": 0.0474, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 1.2354492123129655, | |
| "grad_norm": 0.10201717168092728, | |
| "learning_rate": 7.650085763293311e-05, | |
| "loss": 0.0144, | |
| "step": 4685 | |
| }, | |
| { | |
| "epoch": 1.2367675169731724, | |
| "grad_norm": 0.013835664838552475, | |
| "learning_rate": 7.636891410476317e-05, | |
| "loss": 0.0024, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 1.2380858216333794, | |
| "grad_norm": 0.005699916277080774, | |
| "learning_rate": 7.623697057659322e-05, | |
| "loss": 0.0089, | |
| "step": 4695 | |
| }, | |
| { | |
| "epoch": 1.2394041262935864, | |
| "grad_norm": 0.16583332419395447, | |
| "learning_rate": 7.610502704842328e-05, | |
| "loss": 0.019, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.2407224309537934, | |
| "grad_norm": 0.2734023332595825, | |
| "learning_rate": 7.597308352025333e-05, | |
| "loss": 0.0041, | |
| "step": 4705 | |
| }, | |
| { | |
| "epoch": 1.2420407356140004, | |
| "grad_norm": 0.04209504276514053, | |
| "learning_rate": 7.584113999208339e-05, | |
| "loss": 0.0292, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 1.2433590402742074, | |
| "grad_norm": 0.0303195733577013, | |
| "learning_rate": 7.570919646391345e-05, | |
| "loss": 0.0019, | |
| "step": 4715 | |
| }, | |
| { | |
| "epoch": 1.2446773449344144, | |
| "grad_norm": 0.014011899940669537, | |
| "learning_rate": 7.55772529357435e-05, | |
| "loss": 0.0236, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 1.2459956495946214, | |
| "grad_norm": 0.37838876247406006, | |
| "learning_rate": 7.544530940757356e-05, | |
| "loss": 0.0081, | |
| "step": 4725 | |
| }, | |
| { | |
| "epoch": 1.2473139542548284, | |
| "grad_norm": 0.003717717481777072, | |
| "learning_rate": 7.531336587940361e-05, | |
| "loss": 0.0036, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 1.2486322589150354, | |
| "grad_norm": 1.2284752130508423, | |
| "learning_rate": 7.518142235123368e-05, | |
| "loss": 0.0089, | |
| "step": 4735 | |
| }, | |
| { | |
| "epoch": 1.2499505635752421, | |
| "grad_norm": 0.015356095507740974, | |
| "learning_rate": 7.504947882306374e-05, | |
| "loss": 0.0074, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 1.2512688682354491, | |
| "grad_norm": 0.0020383282098919153, | |
| "learning_rate": 7.49175352948938e-05, | |
| "loss": 0.0444, | |
| "step": 4745 | |
| }, | |
| { | |
| "epoch": 1.2525871728956561, | |
| "grad_norm": 0.006680132355540991, | |
| "learning_rate": 7.478559176672385e-05, | |
| "loss": 0.009, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.2539054775558631, | |
| "grad_norm": 0.01650019735097885, | |
| "learning_rate": 7.465364823855389e-05, | |
| "loss": 0.0022, | |
| "step": 4755 | |
| }, | |
| { | |
| "epoch": 1.2552237822160701, | |
| "grad_norm": 0.009536102414131165, | |
| "learning_rate": 7.452170471038396e-05, | |
| "loss": 0.0026, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 1.256542086876277, | |
| "grad_norm": 0.04677430912852287, | |
| "learning_rate": 7.438976118221402e-05, | |
| "loss": 0.004, | |
| "step": 4765 | |
| }, | |
| { | |
| "epoch": 1.257860391536484, | |
| "grad_norm": 0.007777783088386059, | |
| "learning_rate": 7.425781765404407e-05, | |
| "loss": 0.0112, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 1.259178696196691, | |
| "grad_norm": 0.03724197298288345, | |
| "learning_rate": 7.412587412587413e-05, | |
| "loss": 0.0065, | |
| "step": 4775 | |
| }, | |
| { | |
| "epoch": 1.260497000856898, | |
| "grad_norm": 0.0023958412930369377, | |
| "learning_rate": 7.399393059770418e-05, | |
| "loss": 0.0238, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 1.261815305517105, | |
| "grad_norm": 0.0036889975890517235, | |
| "learning_rate": 7.386198706953424e-05, | |
| "loss": 0.0012, | |
| "step": 4785 | |
| }, | |
| { | |
| "epoch": 1.263133610177312, | |
| "grad_norm": 0.0009220903157256544, | |
| "learning_rate": 7.373004354136431e-05, | |
| "loss": 0.0017, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 1.2644519148375188, | |
| "grad_norm": 0.0033395602367818356, | |
| "learning_rate": 7.359810001319436e-05, | |
| "loss": 0.0474, | |
| "step": 4795 | |
| }, | |
| { | |
| "epoch": 1.265770219497726, | |
| "grad_norm": 0.004093261435627937, | |
| "learning_rate": 7.346615648502442e-05, | |
| "loss": 0.0025, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.2670885241579328, | |
| "grad_norm": 0.004395488649606705, | |
| "learning_rate": 7.333421295685446e-05, | |
| "loss": 0.0011, | |
| "step": 4805 | |
| }, | |
| { | |
| "epoch": 1.2684068288181398, | |
| "grad_norm": 0.024034051224589348, | |
| "learning_rate": 7.320226942868452e-05, | |
| "loss": 0.0027, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 1.2697251334783468, | |
| "grad_norm": 0.9501499533653259, | |
| "learning_rate": 7.307032590051459e-05, | |
| "loss": 0.0279, | |
| "step": 4815 | |
| }, | |
| { | |
| "epoch": 1.2710434381385538, | |
| "grad_norm": 0.008805549703538418, | |
| "learning_rate": 7.293838237234464e-05, | |
| "loss": 0.0403, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 1.2723617427987608, | |
| "grad_norm": 0.01750873774290085, | |
| "learning_rate": 7.28064388441747e-05, | |
| "loss": 0.0571, | |
| "step": 4825 | |
| }, | |
| { | |
| "epoch": 1.2736800474589678, | |
| "grad_norm": 0.004490260500460863, | |
| "learning_rate": 7.267449531600475e-05, | |
| "loss": 0.0269, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 1.2749983521191748, | |
| "grad_norm": 0.07510064542293549, | |
| "learning_rate": 7.254255178783481e-05, | |
| "loss": 0.0123, | |
| "step": 4835 | |
| }, | |
| { | |
| "epoch": 1.2763166567793818, | |
| "grad_norm": 0.039783038198947906, | |
| "learning_rate": 7.241060825966486e-05, | |
| "loss": 0.0137, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 1.2776349614395888, | |
| "grad_norm": 0.019004900008440018, | |
| "learning_rate": 7.227866473149493e-05, | |
| "loss": 0.0047, | |
| "step": 4845 | |
| }, | |
| { | |
| "epoch": 1.2789532660997955, | |
| "grad_norm": 0.04813052713871002, | |
| "learning_rate": 7.214672120332499e-05, | |
| "loss": 0.0021, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.2802715707600028, | |
| "grad_norm": 0.00835048221051693, | |
| "learning_rate": 7.201477767515503e-05, | |
| "loss": 0.0014, | |
| "step": 4855 | |
| }, | |
| { | |
| "epoch": 1.2815898754202095, | |
| "grad_norm": 0.008609198965132236, | |
| "learning_rate": 7.188283414698509e-05, | |
| "loss": 0.0219, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 1.2829081800804165, | |
| "grad_norm": 0.007337458431720734, | |
| "learning_rate": 7.175089061881514e-05, | |
| "loss": 0.0014, | |
| "step": 4865 | |
| }, | |
| { | |
| "epoch": 1.2842264847406235, | |
| "grad_norm": 0.0032645913306623697, | |
| "learning_rate": 7.161894709064521e-05, | |
| "loss": 0.0026, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 1.2855447894008305, | |
| "grad_norm": 0.27384671568870544, | |
| "learning_rate": 7.148700356247527e-05, | |
| "loss": 0.0227, | |
| "step": 4875 | |
| }, | |
| { | |
| "epoch": 1.2868630940610375, | |
| "grad_norm": 0.03584875538945198, | |
| "learning_rate": 7.135506003430532e-05, | |
| "loss": 0.0299, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 1.2881813987212445, | |
| "grad_norm": 0.03482440486550331, | |
| "learning_rate": 7.122311650613538e-05, | |
| "loss": 0.0125, | |
| "step": 4885 | |
| }, | |
| { | |
| "epoch": 1.2894997033814515, | |
| "grad_norm": 0.005974395200610161, | |
| "learning_rate": 7.109117297796543e-05, | |
| "loss": 0.0029, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 1.2908180080416585, | |
| "grad_norm": 0.01820153370499611, | |
| "learning_rate": 7.095922944979549e-05, | |
| "loss": 0.0254, | |
| "step": 4895 | |
| }, | |
| { | |
| "epoch": 1.2921363127018655, | |
| "grad_norm": 0.1733965277671814, | |
| "learning_rate": 7.082728592162555e-05, | |
| "loss": 0.028, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.2934546173620725, | |
| "grad_norm": 1.3017303943634033, | |
| "learning_rate": 7.06953423934556e-05, | |
| "loss": 0.0213, | |
| "step": 4905 | |
| }, | |
| { | |
| "epoch": 1.2947729220222794, | |
| "grad_norm": 0.01360877975821495, | |
| "learning_rate": 7.056339886528566e-05, | |
| "loss": 0.0039, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 1.2960912266824862, | |
| "grad_norm": 0.01503999624401331, | |
| "learning_rate": 7.043145533711571e-05, | |
| "loss": 0.0102, | |
| "step": 4915 | |
| }, | |
| { | |
| "epoch": 1.2974095313426934, | |
| "grad_norm": 0.2200804352760315, | |
| "learning_rate": 7.029951180894577e-05, | |
| "loss": 0.0461, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 1.2987278360029002, | |
| "grad_norm": 0.08512946963310242, | |
| "learning_rate": 7.016756828077582e-05, | |
| "loss": 0.0066, | |
| "step": 4925 | |
| }, | |
| { | |
| "epoch": 1.3000461406631072, | |
| "grad_norm": 0.08296570926904678, | |
| "learning_rate": 7.00356247526059e-05, | |
| "loss": 0.0223, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 1.3013644453233142, | |
| "grad_norm": 0.008866079151630402, | |
| "learning_rate": 6.990368122443595e-05, | |
| "loss": 0.0032, | |
| "step": 4935 | |
| }, | |
| { | |
| "epoch": 1.3026827499835212, | |
| "grad_norm": 0.024493014439940453, | |
| "learning_rate": 6.9771737696266e-05, | |
| "loss": 0.0128, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 1.3040010546437282, | |
| "grad_norm": 0.08965341746807098, | |
| "learning_rate": 6.963979416809606e-05, | |
| "loss": 0.028, | |
| "step": 4945 | |
| }, | |
| { | |
| "epoch": 1.3053193593039352, | |
| "grad_norm": 0.023156631737947464, | |
| "learning_rate": 6.950785063992612e-05, | |
| "loss": 0.0187, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.3066376639641422, | |
| "grad_norm": 0.18552155792713165, | |
| "learning_rate": 6.937590711175617e-05, | |
| "loss": 0.0424, | |
| "step": 4955 | |
| }, | |
| { | |
| "epoch": 1.3079559686243492, | |
| "grad_norm": 0.02200198918581009, | |
| "learning_rate": 6.924396358358623e-05, | |
| "loss": 0.0148, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 1.3092742732845561, | |
| "grad_norm": 0.00568364467471838, | |
| "learning_rate": 6.911202005541628e-05, | |
| "loss": 0.0199, | |
| "step": 4965 | |
| }, | |
| { | |
| "epoch": 1.310592577944763, | |
| "grad_norm": 0.021591177210211754, | |
| "learning_rate": 6.898007652724634e-05, | |
| "loss": 0.0092, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 1.3119108826049701, | |
| "grad_norm": 0.327177494764328, | |
| "learning_rate": 6.88481329990764e-05, | |
| "loss": 0.0047, | |
| "step": 4975 | |
| }, | |
| { | |
| "epoch": 1.313229187265177, | |
| "grad_norm": 0.024512887001037598, | |
| "learning_rate": 6.871618947090645e-05, | |
| "loss": 0.0046, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 1.314547491925384, | |
| "grad_norm": 0.05725006014108658, | |
| "learning_rate": 6.858424594273652e-05, | |
| "loss": 0.0227, | |
| "step": 4985 | |
| }, | |
| { | |
| "epoch": 1.3158657965855909, | |
| "grad_norm": 0.011280277743935585, | |
| "learning_rate": 6.845230241456658e-05, | |
| "loss": 0.0056, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 1.3171841012457979, | |
| "grad_norm": 0.022504402324557304, | |
| "learning_rate": 6.832035888639663e-05, | |
| "loss": 0.0029, | |
| "step": 4995 | |
| }, | |
| { | |
| "epoch": 1.3185024059060049, | |
| "grad_norm": 0.02168826013803482, | |
| "learning_rate": 6.818841535822669e-05, | |
| "loss": 0.0198, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.3185024059060049, | |
| "eval_loss": 0.025039294734597206, | |
| "eval_runtime": 452.1097, | |
| "eval_samples_per_second": 7.458, | |
| "eval_steps_per_second": 3.729, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.3198207105662119, | |
| "grad_norm": 0.0064329709857702255, | |
| "learning_rate": 6.805647183005673e-05, | |
| "loss": 0.0299, | |
| "step": 5005 | |
| }, | |
| { | |
| "epoch": 1.3211390152264189, | |
| "grad_norm": 0.00267885928042233, | |
| "learning_rate": 6.79245283018868e-05, | |
| "loss": 0.0065, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 1.3224573198866258, | |
| "grad_norm": 0.6842889189720154, | |
| "learning_rate": 6.779258477371685e-05, | |
| "loss": 0.008, | |
| "step": 5015 | |
| }, | |
| { | |
| "epoch": 1.3237756245468328, | |
| "grad_norm": 0.002985635306686163, | |
| "learning_rate": 6.766064124554691e-05, | |
| "loss": 0.0119, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 1.3250939292070396, | |
| "grad_norm": 0.019304940477013588, | |
| "learning_rate": 6.752869771737696e-05, | |
| "loss": 0.0041, | |
| "step": 5025 | |
| }, | |
| { | |
| "epoch": 1.3264122338672468, | |
| "grad_norm": 0.011305035091936588, | |
| "learning_rate": 6.739675418920702e-05, | |
| "loss": 0.0031, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 1.3277305385274536, | |
| "grad_norm": 0.006184784695506096, | |
| "learning_rate": 6.726481066103708e-05, | |
| "loss": 0.0081, | |
| "step": 5035 | |
| }, | |
| { | |
| "epoch": 1.3290488431876606, | |
| "grad_norm": 0.0073184361681342125, | |
| "learning_rate": 6.713286713286715e-05, | |
| "loss": 0.0202, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 1.3303671478478676, | |
| "grad_norm": 0.006566181313246489, | |
| "learning_rate": 6.70009236046972e-05, | |
| "loss": 0.0052, | |
| "step": 5045 | |
| }, | |
| { | |
| "epoch": 1.3316854525080746, | |
| "grad_norm": 0.31427526473999023, | |
| "learning_rate": 6.686898007652726e-05, | |
| "loss": 0.017, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.3330037571682816, | |
| "grad_norm": 0.005085447803139687, | |
| "learning_rate": 6.67370365483573e-05, | |
| "loss": 0.009, | |
| "step": 5055 | |
| }, | |
| { | |
| "epoch": 1.3343220618284886, | |
| "grad_norm": 0.2745366096496582, | |
| "learning_rate": 6.660509302018735e-05, | |
| "loss": 0.0119, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 1.3356403664886956, | |
| "grad_norm": 0.2871796786785126, | |
| "learning_rate": 6.647314949201742e-05, | |
| "loss": 0.0158, | |
| "step": 5065 | |
| }, | |
| { | |
| "epoch": 1.3369586711489025, | |
| "grad_norm": 0.2774186134338379, | |
| "learning_rate": 6.634120596384748e-05, | |
| "loss": 0.0084, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 1.3382769758091095, | |
| "grad_norm": 0.013278775848448277, | |
| "learning_rate": 6.620926243567753e-05, | |
| "loss": 0.0111, | |
| "step": 5075 | |
| }, | |
| { | |
| "epoch": 1.3395952804693165, | |
| "grad_norm": 0.01614517532289028, | |
| "learning_rate": 6.607731890750759e-05, | |
| "loss": 0.0066, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 1.3409135851295235, | |
| "grad_norm": 0.0037789656780660152, | |
| "learning_rate": 6.594537537933765e-05, | |
| "loss": 0.0142, | |
| "step": 5085 | |
| }, | |
| { | |
| "epoch": 1.3422318897897303, | |
| "grad_norm": 0.03221861273050308, | |
| "learning_rate": 6.58134318511677e-05, | |
| "loss": 0.0155, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 1.3435501944499375, | |
| "grad_norm": 0.005637989845126867, | |
| "learning_rate": 6.568148832299776e-05, | |
| "loss": 0.0022, | |
| "step": 5095 | |
| }, | |
| { | |
| "epoch": 1.3448684991101443, | |
| "grad_norm": 0.0017844432732090354, | |
| "learning_rate": 6.554954479482783e-05, | |
| "loss": 0.0217, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.3461868037703513, | |
| "grad_norm": 0.08099021762609482, | |
| "learning_rate": 6.541760126665787e-05, | |
| "loss": 0.0222, | |
| "step": 5105 | |
| }, | |
| { | |
| "epoch": 1.3475051084305583, | |
| "grad_norm": 0.011909045279026031, | |
| "learning_rate": 6.528565773848792e-05, | |
| "loss": 0.0058, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 1.3488234130907653, | |
| "grad_norm": 0.7332578301429749, | |
| "learning_rate": 6.515371421031798e-05, | |
| "loss": 0.0286, | |
| "step": 5115 | |
| }, | |
| { | |
| "epoch": 1.3501417177509722, | |
| "grad_norm": 0.3415885865688324, | |
| "learning_rate": 6.502177068214804e-05, | |
| "loss": 0.1191, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 1.3514600224111792, | |
| "grad_norm": 0.00904211588203907, | |
| "learning_rate": 6.48898271539781e-05, | |
| "loss": 0.0043, | |
| "step": 5125 | |
| }, | |
| { | |
| "epoch": 1.3527783270713862, | |
| "grad_norm": 0.1978830248117447, | |
| "learning_rate": 6.475788362580816e-05, | |
| "loss": 0.0316, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 1.3540966317315932, | |
| "grad_norm": 0.10229042172431946, | |
| "learning_rate": 6.462594009763822e-05, | |
| "loss": 0.0194, | |
| "step": 5135 | |
| }, | |
| { | |
| "epoch": 1.3554149363918002, | |
| "grad_norm": 0.4457210600376129, | |
| "learning_rate": 6.449399656946827e-05, | |
| "loss": 0.0276, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 1.356733241052007, | |
| "grad_norm": 0.023706572130322456, | |
| "learning_rate": 6.436205304129833e-05, | |
| "loss": 0.0163, | |
| "step": 5145 | |
| }, | |
| { | |
| "epoch": 1.3580515457122142, | |
| "grad_norm": 1.166896939277649, | |
| "learning_rate": 6.423010951312838e-05, | |
| "loss": 0.0189, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.359369850372421, | |
| "grad_norm": 0.0016115796752274036, | |
| "learning_rate": 6.409816598495844e-05, | |
| "loss": 0.0191, | |
| "step": 5155 | |
| }, | |
| { | |
| "epoch": 1.360688155032628, | |
| "grad_norm": 0.00786682777106762, | |
| "learning_rate": 6.39662224567885e-05, | |
| "loss": 0.0119, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 1.362006459692835, | |
| "grad_norm": 1.042732834815979, | |
| "learning_rate": 6.383427892861855e-05, | |
| "loss": 0.0497, | |
| "step": 5165 | |
| }, | |
| { | |
| "epoch": 1.363324764353042, | |
| "grad_norm": 0.007983304560184479, | |
| "learning_rate": 6.37023354004486e-05, | |
| "loss": 0.044, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 1.364643069013249, | |
| "grad_norm": 0.009767642244696617, | |
| "learning_rate": 6.357039187227866e-05, | |
| "loss": 0.0405, | |
| "step": 5175 | |
| }, | |
| { | |
| "epoch": 1.365961373673456, | |
| "grad_norm": 0.03164628520607948, | |
| "learning_rate": 6.343844834410873e-05, | |
| "loss": 0.0138, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 1.367279678333663, | |
| "grad_norm": 0.004159921780228615, | |
| "learning_rate": 6.330650481593879e-05, | |
| "loss": 0.0045, | |
| "step": 5185 | |
| }, | |
| { | |
| "epoch": 1.36859798299387, | |
| "grad_norm": 0.004395391326397657, | |
| "learning_rate": 6.317456128776884e-05, | |
| "loss": 0.0046, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 1.369916287654077, | |
| "grad_norm": 0.011886746622622013, | |
| "learning_rate": 6.30426177595989e-05, | |
| "loss": 0.0064, | |
| "step": 5195 | |
| }, | |
| { | |
| "epoch": 1.371234592314284, | |
| "grad_norm": 0.2259266972541809, | |
| "learning_rate": 6.291067423142895e-05, | |
| "loss": 0.0076, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.372552896974491, | |
| "grad_norm": 0.01407301053404808, | |
| "learning_rate": 6.277873070325901e-05, | |
| "loss": 0.0201, | |
| "step": 5205 | |
| }, | |
| { | |
| "epoch": 1.3738712016346977, | |
| "grad_norm": 0.00911578256636858, | |
| "learning_rate": 6.264678717508906e-05, | |
| "loss": 0.0164, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 1.3751895062949049, | |
| "grad_norm": 0.20968014001846313, | |
| "learning_rate": 6.251484364691912e-05, | |
| "loss": 0.0075, | |
| "step": 5215 | |
| }, | |
| { | |
| "epoch": 1.3765078109551117, | |
| "grad_norm": 0.008801166899502277, | |
| "learning_rate": 6.238290011874918e-05, | |
| "loss": 0.0068, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 1.3778261156153186, | |
| "grad_norm": 0.007181806955486536, | |
| "learning_rate": 6.225095659057923e-05, | |
| "loss": 0.0136, | |
| "step": 5225 | |
| }, | |
| { | |
| "epoch": 1.3791444202755256, | |
| "grad_norm": 0.7527109980583191, | |
| "learning_rate": 6.211901306240929e-05, | |
| "loss": 0.0287, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 1.3804627249357326, | |
| "grad_norm": 0.039015207439661026, | |
| "learning_rate": 6.198706953423936e-05, | |
| "loss": 0.0326, | |
| "step": 5235 | |
| }, | |
| { | |
| "epoch": 1.3817810295959396, | |
| "grad_norm": 0.021076606586575508, | |
| "learning_rate": 6.185512600606941e-05, | |
| "loss": 0.0191, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 1.3830993342561466, | |
| "grad_norm": 0.016630731523036957, | |
| "learning_rate": 6.172318247789947e-05, | |
| "loss": 0.0131, | |
| "step": 5245 | |
| }, | |
| { | |
| "epoch": 1.3844176389163536, | |
| "grad_norm": 0.011133644729852676, | |
| "learning_rate": 6.159123894972952e-05, | |
| "loss": 0.0029, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.3857359435765606, | |
| "grad_norm": 0.6434677243232727, | |
| "learning_rate": 6.145929542155957e-05, | |
| "loss": 0.0091, | |
| "step": 5255 | |
| }, | |
| { | |
| "epoch": 1.3870542482367676, | |
| "grad_norm": 0.051020298153162, | |
| "learning_rate": 6.132735189338964e-05, | |
| "loss": 0.0086, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 1.3883725528969744, | |
| "grad_norm": 0.016413932666182518, | |
| "learning_rate": 6.119540836521969e-05, | |
| "loss": 0.0061, | |
| "step": 5265 | |
| }, | |
| { | |
| "epoch": 1.3896908575571816, | |
| "grad_norm": 0.005769540090113878, | |
| "learning_rate": 6.106346483704975e-05, | |
| "loss": 0.0027, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 1.3910091622173884, | |
| "grad_norm": 0.06687796860933304, | |
| "learning_rate": 6.09315213088798e-05, | |
| "loss": 0.0423, | |
| "step": 5275 | |
| }, | |
| { | |
| "epoch": 1.3923274668775953, | |
| "grad_norm": 0.005641553085297346, | |
| "learning_rate": 6.079957778070986e-05, | |
| "loss": 0.0353, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 1.3936457715378023, | |
| "grad_norm": 0.04460568353533745, | |
| "learning_rate": 6.066763425253992e-05, | |
| "loss": 0.0041, | |
| "step": 5285 | |
| }, | |
| { | |
| "epoch": 1.3949640761980093, | |
| "grad_norm": 0.0387534461915493, | |
| "learning_rate": 6.0535690724369976e-05, | |
| "loss": 0.006, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 1.3962823808582163, | |
| "grad_norm": 0.010292598977684975, | |
| "learning_rate": 6.040374719620003e-05, | |
| "loss": 0.0038, | |
| "step": 5295 | |
| }, | |
| { | |
| "epoch": 1.3976006855184233, | |
| "grad_norm": 0.3646155297756195, | |
| "learning_rate": 6.0271803668030094e-05, | |
| "loss": 0.0111, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.3989189901786303, | |
| "grad_norm": 0.022035539150238037, | |
| "learning_rate": 6.0139860139860136e-05, | |
| "loss": 0.0507, | |
| "step": 5305 | |
| }, | |
| { | |
| "epoch": 1.4002372948388373, | |
| "grad_norm": 0.003314939560368657, | |
| "learning_rate": 6.00079166116902e-05, | |
| "loss": 0.0132, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 1.4015555994990443, | |
| "grad_norm": 0.0838267058134079, | |
| "learning_rate": 5.9875973083520254e-05, | |
| "loss": 0.0105, | |
| "step": 5315 | |
| }, | |
| { | |
| "epoch": 1.4028739041592513, | |
| "grad_norm": 0.009368584491312504, | |
| "learning_rate": 5.974402955535031e-05, | |
| "loss": 0.0026, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 1.4041922088194583, | |
| "grad_norm": 0.031248098239302635, | |
| "learning_rate": 5.961208602718037e-05, | |
| "loss": 0.0151, | |
| "step": 5325 | |
| }, | |
| { | |
| "epoch": 1.405510513479665, | |
| "grad_norm": 0.06447605788707733, | |
| "learning_rate": 5.948014249901043e-05, | |
| "loss": 0.0219, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 1.4068288181398723, | |
| "grad_norm": 0.010814374312758446, | |
| "learning_rate": 5.9348198970840484e-05, | |
| "loss": 0.0038, | |
| "step": 5335 | |
| }, | |
| { | |
| "epoch": 1.408147122800079, | |
| "grad_norm": 0.6235967874526978, | |
| "learning_rate": 5.9216255442670546e-05, | |
| "loss": 0.0354, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 1.409465427460286, | |
| "grad_norm": 0.026741521432995796, | |
| "learning_rate": 5.90843119145006e-05, | |
| "loss": 0.0032, | |
| "step": 5345 | |
| }, | |
| { | |
| "epoch": 1.410783732120493, | |
| "grad_norm": 0.019413433969020844, | |
| "learning_rate": 5.895236838633066e-05, | |
| "loss": 0.0216, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.4121020367807, | |
| "grad_norm": 0.0735543966293335, | |
| "learning_rate": 5.8820424858160706e-05, | |
| "loss": 0.0033, | |
| "step": 5355 | |
| }, | |
| { | |
| "epoch": 1.413420341440907, | |
| "grad_norm": 0.005189546383917332, | |
| "learning_rate": 5.868848132999076e-05, | |
| "loss": 0.021, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 1.414738646101114, | |
| "grad_norm": 0.21240335702896118, | |
| "learning_rate": 5.8556537801820824e-05, | |
| "loss": 0.0294, | |
| "step": 5365 | |
| }, | |
| { | |
| "epoch": 1.416056950761321, | |
| "grad_norm": 0.010165920481085777, | |
| "learning_rate": 5.842459427365088e-05, | |
| "loss": 0.0021, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 1.417375255421528, | |
| "grad_norm": 0.026774069294333458, | |
| "learning_rate": 5.8292650745480936e-05, | |
| "loss": 0.0299, | |
| "step": 5375 | |
| }, | |
| { | |
| "epoch": 1.418693560081735, | |
| "grad_norm": 0.0019810455851256847, | |
| "learning_rate": 5.816070721731099e-05, | |
| "loss": 0.0029, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 1.4200118647419417, | |
| "grad_norm": 0.038888879120349884, | |
| "learning_rate": 5.8028763689141054e-05, | |
| "loss": 0.0069, | |
| "step": 5385 | |
| }, | |
| { | |
| "epoch": 1.421330169402149, | |
| "grad_norm": 0.016180936247110367, | |
| "learning_rate": 5.789682016097111e-05, | |
| "loss": 0.0032, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 1.4226484740623557, | |
| "grad_norm": 0.01119404286146164, | |
| "learning_rate": 5.7764876632801165e-05, | |
| "loss": 0.0024, | |
| "step": 5395 | |
| }, | |
| { | |
| "epoch": 1.4239667787225627, | |
| "grad_norm": 0.010486694052815437, | |
| "learning_rate": 5.763293310463123e-05, | |
| "loss": 0.0324, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.4252850833827697, | |
| "grad_norm": 0.005453066434711218, | |
| "learning_rate": 5.750098957646127e-05, | |
| "loss": 0.0038, | |
| "step": 5405 | |
| }, | |
| { | |
| "epoch": 1.4266033880429767, | |
| "grad_norm": 0.17556461691856384, | |
| "learning_rate": 5.736904604829133e-05, | |
| "loss": 0.0305, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 1.4279216927031837, | |
| "grad_norm": 0.03074715845286846, | |
| "learning_rate": 5.723710252012139e-05, | |
| "loss": 0.003, | |
| "step": 5415 | |
| }, | |
| { | |
| "epoch": 1.4292399973633907, | |
| "grad_norm": 1.7238941192626953, | |
| "learning_rate": 5.710515899195144e-05, | |
| "loss": 0.0254, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 1.4305583020235977, | |
| "grad_norm": 0.012462320737540722, | |
| "learning_rate": 5.6973215463781506e-05, | |
| "loss": 0.0018, | |
| "step": 5425 | |
| }, | |
| { | |
| "epoch": 1.4318766066838047, | |
| "grad_norm": 0.021576853469014168, | |
| "learning_rate": 5.684127193561156e-05, | |
| "loss": 0.0472, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 1.4331949113440117, | |
| "grad_norm": 0.2862134575843811, | |
| "learning_rate": 5.670932840744162e-05, | |
| "loss": 0.0258, | |
| "step": 5435 | |
| }, | |
| { | |
| "epoch": 1.4345132160042184, | |
| "grad_norm": 0.28419312834739685, | |
| "learning_rate": 5.657738487927168e-05, | |
| "loss": 0.0053, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 1.4358315206644257, | |
| "grad_norm": 0.013650139793753624, | |
| "learning_rate": 5.6445441351101735e-05, | |
| "loss": 0.0126, | |
| "step": 5445 | |
| }, | |
| { | |
| "epoch": 1.4371498253246324, | |
| "grad_norm": 0.01203097216784954, | |
| "learning_rate": 5.631349782293179e-05, | |
| "loss": 0.0076, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.4384681299848394, | |
| "grad_norm": 0.0881054624915123, | |
| "learning_rate": 5.618155429476184e-05, | |
| "loss": 0.0178, | |
| "step": 5455 | |
| }, | |
| { | |
| "epoch": 1.4397864346450464, | |
| "grad_norm": 0.5258516669273376, | |
| "learning_rate": 5.6049610766591895e-05, | |
| "loss": 0.0112, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 1.4411047393052534, | |
| "grad_norm": 0.001202153041958809, | |
| "learning_rate": 5.591766723842196e-05, | |
| "loss": 0.0089, | |
| "step": 5465 | |
| }, | |
| { | |
| "epoch": 1.4424230439654604, | |
| "grad_norm": 0.4498993456363678, | |
| "learning_rate": 5.5785723710252014e-05, | |
| "loss": 0.0252, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 1.4437413486256674, | |
| "grad_norm": 0.17477644979953766, | |
| "learning_rate": 5.565378018208207e-05, | |
| "loss": 0.0169, | |
| "step": 5475 | |
| }, | |
| { | |
| "epoch": 1.4450596532858744, | |
| "grad_norm": 0.019443338736891747, | |
| "learning_rate": 5.552183665391213e-05, | |
| "loss": 0.0019, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 1.4463779579460814, | |
| "grad_norm": 0.005653039086610079, | |
| "learning_rate": 5.538989312574219e-05, | |
| "loss": 0.0231, | |
| "step": 5485 | |
| }, | |
| { | |
| "epoch": 1.4476962626062884, | |
| "grad_norm": 0.01554112322628498, | |
| "learning_rate": 5.525794959757224e-05, | |
| "loss": 0.0167, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 1.4490145672664954, | |
| "grad_norm": 0.044272180646657944, | |
| "learning_rate": 5.5126006069402305e-05, | |
| "loss": 0.007, | |
| "step": 5495 | |
| }, | |
| { | |
| "epoch": 1.4503328719267023, | |
| "grad_norm": 0.014857172966003418, | |
| "learning_rate": 5.499406254123236e-05, | |
| "loss": 0.0045, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.4503328719267023, | |
| "eval_loss": 0.02392147295176983, | |
| "eval_runtime": 452.468, | |
| "eval_samples_per_second": 7.452, | |
| "eval_steps_per_second": 3.726, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.4516511765869091, | |
| "grad_norm": 0.007390835788100958, | |
| "learning_rate": 5.486211901306241e-05, | |
| "loss": 0.0171, | |
| "step": 5505 | |
| }, | |
| { | |
| "epoch": 1.4529694812471163, | |
| "grad_norm": 0.0050474610179662704, | |
| "learning_rate": 5.4730175484892466e-05, | |
| "loss": 0.004, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 1.454287785907323, | |
| "grad_norm": 0.08066163957118988, | |
| "learning_rate": 5.459823195672252e-05, | |
| "loss": 0.0103, | |
| "step": 5515 | |
| }, | |
| { | |
| "epoch": 1.45560609056753, | |
| "grad_norm": 0.0062376330606639385, | |
| "learning_rate": 5.4466288428552584e-05, | |
| "loss": 0.0066, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 1.456924395227737, | |
| "grad_norm": 0.00711809890344739, | |
| "learning_rate": 5.433434490038264e-05, | |
| "loss": 0.003, | |
| "step": 5525 | |
| }, | |
| { | |
| "epoch": 1.458242699887944, | |
| "grad_norm": 0.004010149277746677, | |
| "learning_rate": 5.4202401372212695e-05, | |
| "loss": 0.0231, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 1.459561004548151, | |
| "grad_norm": 0.4791967272758484, | |
| "learning_rate": 5.407045784404276e-05, | |
| "loss": 0.0277, | |
| "step": 5535 | |
| }, | |
| { | |
| "epoch": 1.460879309208358, | |
| "grad_norm": 0.03979189693927765, | |
| "learning_rate": 5.393851431587281e-05, | |
| "loss": 0.0033, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 1.462197613868565, | |
| "grad_norm": 0.03331119939684868, | |
| "learning_rate": 5.380657078770287e-05, | |
| "loss": 0.0187, | |
| "step": 5545 | |
| }, | |
| { | |
| "epoch": 1.463515918528772, | |
| "grad_norm": 0.0042802803218364716, | |
| "learning_rate": 5.367462725953293e-05, | |
| "loss": 0.0032, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.464834223188979, | |
| "grad_norm": 0.05439918115735054, | |
| "learning_rate": 5.354268373136297e-05, | |
| "loss": 0.0043, | |
| "step": 5555 | |
| }, | |
| { | |
| "epoch": 1.4661525278491858, | |
| "grad_norm": 0.042643506079912186, | |
| "learning_rate": 5.3410740203193036e-05, | |
| "loss": 0.0059, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 1.467470832509393, | |
| "grad_norm": 0.023453116416931152, | |
| "learning_rate": 5.327879667502309e-05, | |
| "loss": 0.0043, | |
| "step": 5565 | |
| }, | |
| { | |
| "epoch": 1.4687891371695998, | |
| "grad_norm": 0.037712760269641876, | |
| "learning_rate": 5.314685314685315e-05, | |
| "loss": 0.0033, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 1.4701074418298068, | |
| "grad_norm": 1.0485608577728271, | |
| "learning_rate": 5.301490961868321e-05, | |
| "loss": 0.0489, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 1.4714257464900138, | |
| "grad_norm": 0.004728829488158226, | |
| "learning_rate": 5.2882966090513265e-05, | |
| "loss": 0.0067, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 1.4727440511502208, | |
| "grad_norm": 0.027893677353858948, | |
| "learning_rate": 5.275102256234332e-05, | |
| "loss": 0.0208, | |
| "step": 5585 | |
| }, | |
| { | |
| "epoch": 1.4740623558104278, | |
| "grad_norm": 0.02256879396736622, | |
| "learning_rate": 5.2619079034173377e-05, | |
| "loss": 0.0036, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 1.4753806604706348, | |
| "grad_norm": 0.12636558711528778, | |
| "learning_rate": 5.248713550600344e-05, | |
| "loss": 0.0046, | |
| "step": 5595 | |
| }, | |
| { | |
| "epoch": 1.4766989651308418, | |
| "grad_norm": 0.000997041119262576, | |
| "learning_rate": 5.235519197783348e-05, | |
| "loss": 0.0101, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.4780172697910487, | |
| "grad_norm": 0.023494020104408264, | |
| "learning_rate": 5.2223248449663543e-05, | |
| "loss": 0.0039, | |
| "step": 5605 | |
| }, | |
| { | |
| "epoch": 1.4793355744512557, | |
| "grad_norm": 0.01525307446718216, | |
| "learning_rate": 5.20913049214936e-05, | |
| "loss": 0.021, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 1.4806538791114627, | |
| "grad_norm": 0.0024215306621044874, | |
| "learning_rate": 5.1959361393323655e-05, | |
| "loss": 0.0017, | |
| "step": 5615 | |
| }, | |
| { | |
| "epoch": 1.4819721837716697, | |
| "grad_norm": 1.4708061218261719, | |
| "learning_rate": 5.182741786515372e-05, | |
| "loss": 0.04, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 1.4832904884318765, | |
| "grad_norm": 0.015033531002700329, | |
| "learning_rate": 5.169547433698377e-05, | |
| "loss": 0.0042, | |
| "step": 5625 | |
| }, | |
| { | |
| "epoch": 1.4846087930920837, | |
| "grad_norm": 0.0035444959066808224, | |
| "learning_rate": 5.156353080881383e-05, | |
| "loss": 0.0087, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 1.4859270977522905, | |
| "grad_norm": 0.010087919421494007, | |
| "learning_rate": 5.143158728064389e-05, | |
| "loss": 0.0158, | |
| "step": 5635 | |
| }, | |
| { | |
| "epoch": 1.4872454024124975, | |
| "grad_norm": 0.05779251083731651, | |
| "learning_rate": 5.129964375247395e-05, | |
| "loss": 0.0157, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 1.4885637070727045, | |
| "grad_norm": 0.14927980303764343, | |
| "learning_rate": 5.1167700224304e-05, | |
| "loss": 0.0257, | |
| "step": 5645 | |
| }, | |
| { | |
| "epoch": 1.4898820117329115, | |
| "grad_norm": 0.004252352751791477, | |
| "learning_rate": 5.103575669613405e-05, | |
| "loss": 0.0198, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.4912003163931185, | |
| "grad_norm": 0.0029206848703324795, | |
| "learning_rate": 5.090381316796411e-05, | |
| "loss": 0.0016, | |
| "step": 5655 | |
| }, | |
| { | |
| "epoch": 1.4925186210533254, | |
| "grad_norm": 0.005047530401498079, | |
| "learning_rate": 5.077186963979417e-05, | |
| "loss": 0.0023, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 1.4938369257135324, | |
| "grad_norm": 0.003732564626261592, | |
| "learning_rate": 5.0639926111624225e-05, | |
| "loss": 0.0336, | |
| "step": 5665 | |
| }, | |
| { | |
| "epoch": 1.4951552303737394, | |
| "grad_norm": 0.3832889497280121, | |
| "learning_rate": 5.050798258345428e-05, | |
| "loss": 0.0476, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 1.4964735350339464, | |
| "grad_norm": 0.06733009219169617, | |
| "learning_rate": 5.037603905528434e-05, | |
| "loss": 0.0044, | |
| "step": 5675 | |
| }, | |
| { | |
| "epoch": 1.4977918396941532, | |
| "grad_norm": 0.008067069575190544, | |
| "learning_rate": 5.02440955271144e-05, | |
| "loss": 0.0035, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 1.4991101443543604, | |
| "grad_norm": 0.01706300489604473, | |
| "learning_rate": 5.0112151998944454e-05, | |
| "loss": 0.0031, | |
| "step": 5685 | |
| }, | |
| { | |
| "epoch": 1.5004284490145672, | |
| "grad_norm": 0.009932024404406548, | |
| "learning_rate": 4.998020847077451e-05, | |
| "loss": 0.0587, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 1.5017467536747744, | |
| "grad_norm": 0.006488936021924019, | |
| "learning_rate": 4.9848264942604566e-05, | |
| "loss": 0.002, | |
| "step": 5695 | |
| }, | |
| { | |
| "epoch": 1.5030650583349812, | |
| "grad_norm": 0.17488756775856018, | |
| "learning_rate": 4.971632141443462e-05, | |
| "loss": 0.0245, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.5043833629951882, | |
| "grad_norm": 0.3327178359031677, | |
| "learning_rate": 4.9584377886264684e-05, | |
| "loss": 0.0404, | |
| "step": 5705 | |
| }, | |
| { | |
| "epoch": 1.5057016676553951, | |
| "grad_norm": 0.18467263877391815, | |
| "learning_rate": 4.945243435809474e-05, | |
| "loss": 0.0248, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 1.5070199723156021, | |
| "grad_norm": 0.020061776041984558, | |
| "learning_rate": 4.9320490829924795e-05, | |
| "loss": 0.0034, | |
| "step": 5715 | |
| }, | |
| { | |
| "epoch": 1.5083382769758091, | |
| "grad_norm": 0.0005288647953420877, | |
| "learning_rate": 4.918854730175485e-05, | |
| "loss": 0.0076, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 1.5096565816360161, | |
| "grad_norm": 0.007515576668083668, | |
| "learning_rate": 4.9056603773584906e-05, | |
| "loss": 0.004, | |
| "step": 5725 | |
| }, | |
| { | |
| "epoch": 1.5109748862962231, | |
| "grad_norm": 0.05365758761763573, | |
| "learning_rate": 4.892466024541497e-05, | |
| "loss": 0.0222, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 1.51229319095643, | |
| "grad_norm": 0.00572391040623188, | |
| "learning_rate": 4.8792716717245025e-05, | |
| "loss": 0.0132, | |
| "step": 5735 | |
| }, | |
| { | |
| "epoch": 1.513611495616637, | |
| "grad_norm": 0.21178627014160156, | |
| "learning_rate": 4.8660773189075073e-05, | |
| "loss": 0.0417, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 1.5149298002768439, | |
| "grad_norm": 0.0641486868262291, | |
| "learning_rate": 4.8528829660905136e-05, | |
| "loss": 0.011, | |
| "step": 5745 | |
| }, | |
| { | |
| "epoch": 1.516248104937051, | |
| "grad_norm": 0.04451924189925194, | |
| "learning_rate": 4.839688613273519e-05, | |
| "loss": 0.012, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.5175664095972579, | |
| "grad_norm": 0.019951259717345238, | |
| "learning_rate": 4.826494260456525e-05, | |
| "loss": 0.009, | |
| "step": 5755 | |
| }, | |
| { | |
| "epoch": 1.5188847142574649, | |
| "grad_norm": 0.021919893100857735, | |
| "learning_rate": 4.813299907639531e-05, | |
| "loss": 0.0081, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 1.5202030189176718, | |
| "grad_norm": 0.5730367302894592, | |
| "learning_rate": 4.800105554822536e-05, | |
| "loss": 0.0254, | |
| "step": 5765 | |
| }, | |
| { | |
| "epoch": 1.5215213235778788, | |
| "grad_norm": 0.02501523122191429, | |
| "learning_rate": 4.786911202005542e-05, | |
| "loss": 0.0045, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 1.5228396282380858, | |
| "grad_norm": 0.01574208028614521, | |
| "learning_rate": 4.773716849188548e-05, | |
| "loss": 0.0081, | |
| "step": 5775 | |
| }, | |
| { | |
| "epoch": 1.5241579328982928, | |
| "grad_norm": 0.009626791812479496, | |
| "learning_rate": 4.760522496371553e-05, | |
| "loss": 0.0037, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 1.5254762375584998, | |
| "grad_norm": 0.535539448261261, | |
| "learning_rate": 4.747328143554559e-05, | |
| "loss": 0.0149, | |
| "step": 5785 | |
| }, | |
| { | |
| "epoch": 1.5267945422187066, | |
| "grad_norm": 0.004934845492243767, | |
| "learning_rate": 4.7341337907375644e-05, | |
| "loss": 0.0048, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 1.5281128468789138, | |
| "grad_norm": 0.009070080704987049, | |
| "learning_rate": 4.72093943792057e-05, | |
| "loss": 0.0028, | |
| "step": 5795 | |
| }, | |
| { | |
| "epoch": 1.5294311515391206, | |
| "grad_norm": 0.0040720063261687756, | |
| "learning_rate": 4.707745085103576e-05, | |
| "loss": 0.0016, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.5307494561993278, | |
| "grad_norm": 0.45212000608444214, | |
| "learning_rate": 4.694550732286582e-05, | |
| "loss": 0.0111, | |
| "step": 5805 | |
| }, | |
| { | |
| "epoch": 1.5320677608595346, | |
| "grad_norm": 0.024048497900366783, | |
| "learning_rate": 4.681356379469587e-05, | |
| "loss": 0.0149, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 1.5333860655197418, | |
| "grad_norm": 0.11899136006832123, | |
| "learning_rate": 4.668162026652593e-05, | |
| "loss": 0.0034, | |
| "step": 5815 | |
| }, | |
| { | |
| "epoch": 1.5347043701799485, | |
| "grad_norm": 0.011249657720327377, | |
| "learning_rate": 4.6549676738355984e-05, | |
| "loss": 0.0052, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 1.5360226748401555, | |
| "grad_norm": 0.051634710282087326, | |
| "learning_rate": 4.641773321018604e-05, | |
| "loss": 0.0031, | |
| "step": 5825 | |
| }, | |
| { | |
| "epoch": 1.5373409795003625, | |
| "grad_norm": 0.3726826012134552, | |
| "learning_rate": 4.62857896820161e-05, | |
| "loss": 0.0582, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 1.5386592841605695, | |
| "grad_norm": 0.5827310681343079, | |
| "learning_rate": 4.615384615384616e-05, | |
| "loss": 0.0652, | |
| "step": 5835 | |
| }, | |
| { | |
| "epoch": 1.5399775888207765, | |
| "grad_norm": 0.006390869617462158, | |
| "learning_rate": 4.6021902625676214e-05, | |
| "loss": 0.0022, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 1.5412958934809835, | |
| "grad_norm": 0.022760871797800064, | |
| "learning_rate": 4.588995909750627e-05, | |
| "loss": 0.0311, | |
| "step": 5845 | |
| }, | |
| { | |
| "epoch": 1.5426141981411905, | |
| "grad_norm": 0.22773241996765137, | |
| "learning_rate": 4.5758015569336325e-05, | |
| "loss": 0.0051, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.5439325028013973, | |
| "grad_norm": 0.015375247225165367, | |
| "learning_rate": 4.562607204116639e-05, | |
| "loss": 0.0023, | |
| "step": 5855 | |
| }, | |
| { | |
| "epoch": 1.5452508074616045, | |
| "grad_norm": 0.007347101345658302, | |
| "learning_rate": 4.549412851299644e-05, | |
| "loss": 0.0437, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 1.5465691121218113, | |
| "grad_norm": 0.012344900518655777, | |
| "learning_rate": 4.536218498482649e-05, | |
| "loss": 0.004, | |
| "step": 5865 | |
| }, | |
| { | |
| "epoch": 1.5478874167820185, | |
| "grad_norm": 0.27038896083831787, | |
| "learning_rate": 4.5230241456656555e-05, | |
| "loss": 0.0047, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 1.5492057214422252, | |
| "grad_norm": 0.016395213082432747, | |
| "learning_rate": 4.509829792848661e-05, | |
| "loss": 0.0026, | |
| "step": 5875 | |
| }, | |
| { | |
| "epoch": 1.5505240261024322, | |
| "grad_norm": 0.4217267632484436, | |
| "learning_rate": 4.4966354400316666e-05, | |
| "loss": 0.0364, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 1.5518423307626392, | |
| "grad_norm": 0.20046105980873108, | |
| "learning_rate": 4.483441087214673e-05, | |
| "loss": 0.0243, | |
| "step": 5885 | |
| }, | |
| { | |
| "epoch": 1.5531606354228462, | |
| "grad_norm": 0.004307698458433151, | |
| "learning_rate": 4.470246734397678e-05, | |
| "loss": 0.0064, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 1.5544789400830532, | |
| "grad_norm": 0.46102187037467957, | |
| "learning_rate": 4.457052381580683e-05, | |
| "loss": 0.0115, | |
| "step": 5895 | |
| }, | |
| { | |
| "epoch": 1.5557972447432602, | |
| "grad_norm": 0.0689118504524231, | |
| "learning_rate": 4.4438580287636895e-05, | |
| "loss": 0.0334, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.5571155494034672, | |
| "grad_norm": 0.003091114340350032, | |
| "learning_rate": 4.430663675946695e-05, | |
| "loss": 0.0246, | |
| "step": 5905 | |
| }, | |
| { | |
| "epoch": 1.558433854063674, | |
| "grad_norm": 0.003877349430695176, | |
| "learning_rate": 4.417469323129701e-05, | |
| "loss": 0.0032, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 1.5597521587238812, | |
| "grad_norm": 0.30713143944740295, | |
| "learning_rate": 4.404274970312706e-05, | |
| "loss": 0.0229, | |
| "step": 5915 | |
| }, | |
| { | |
| "epoch": 1.561070463384088, | |
| "grad_norm": 0.07344445586204529, | |
| "learning_rate": 4.391080617495712e-05, | |
| "loss": 0.0078, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 1.5623887680442952, | |
| "grad_norm": 0.01774723082780838, | |
| "learning_rate": 4.377886264678718e-05, | |
| "loss": 0.0034, | |
| "step": 5925 | |
| }, | |
| { | |
| "epoch": 1.563707072704502, | |
| "grad_norm": 0.476324200630188, | |
| "learning_rate": 4.3646919118617236e-05, | |
| "loss": 0.0071, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 1.5650253773647091, | |
| "grad_norm": 0.11624465882778168, | |
| "learning_rate": 4.351497559044729e-05, | |
| "loss": 0.0236, | |
| "step": 5935 | |
| }, | |
| { | |
| "epoch": 1.566343682024916, | |
| "grad_norm": 0.190691277384758, | |
| "learning_rate": 4.338303206227735e-05, | |
| "loss": 0.006, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 1.567661986685123, | |
| "grad_norm": 0.20517045259475708, | |
| "learning_rate": 4.32510885341074e-05, | |
| "loss": 0.009, | |
| "step": 5945 | |
| }, | |
| { | |
| "epoch": 1.56898029134533, | |
| "grad_norm": 0.008122317492961884, | |
| "learning_rate": 4.311914500593746e-05, | |
| "loss": 0.0041, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.570298596005537, | |
| "grad_norm": 0.01982291042804718, | |
| "learning_rate": 4.298720147776752e-05, | |
| "loss": 0.0258, | |
| "step": 5955 | |
| }, | |
| { | |
| "epoch": 1.5716169006657439, | |
| "grad_norm": 0.000996922142803669, | |
| "learning_rate": 4.285525794959758e-05, | |
| "loss": 0.0233, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 1.5729352053259509, | |
| "grad_norm": 0.09725592285394669, | |
| "learning_rate": 4.272331442142763e-05, | |
| "loss": 0.0218, | |
| "step": 5965 | |
| }, | |
| { | |
| "epoch": 1.5742535099861579, | |
| "grad_norm": 0.0672350749373436, | |
| "learning_rate": 4.259137089325769e-05, | |
| "loss": 0.0194, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 1.5755718146463646, | |
| "grad_norm": 0.014844833873212337, | |
| "learning_rate": 4.2459427365087744e-05, | |
| "loss": 0.0298, | |
| "step": 5975 | |
| }, | |
| { | |
| "epoch": 1.5768901193065719, | |
| "grad_norm": 0.030519040301442146, | |
| "learning_rate": 4.2327483836917806e-05, | |
| "loss": 0.0178, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 1.5782084239667786, | |
| "grad_norm": 0.018561460077762604, | |
| "learning_rate": 4.219554030874786e-05, | |
| "loss": 0.0154, | |
| "step": 5985 | |
| }, | |
| { | |
| "epoch": 1.5795267286269858, | |
| "grad_norm": 0.02470085583627224, | |
| "learning_rate": 4.206359678057791e-05, | |
| "loss": 0.0361, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 1.5808450332871926, | |
| "grad_norm": 0.055412422865629196, | |
| "learning_rate": 4.193165325240797e-05, | |
| "loss": 0.0162, | |
| "step": 5995 | |
| }, | |
| { | |
| "epoch": 1.5821633379473996, | |
| "grad_norm": 0.0034158769994974136, | |
| "learning_rate": 4.179970972423803e-05, | |
| "loss": 0.0068, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.5821633379473996, | |
| "eval_loss": 0.024797894060611725, | |
| "eval_runtime": 452.1611, | |
| "eval_samples_per_second": 7.458, | |
| "eval_steps_per_second": 3.729, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.5834816426076066, | |
| "grad_norm": 0.01284120511263609, | |
| "learning_rate": 4.1667766196068085e-05, | |
| "loss": 0.0036, | |
| "step": 6005 | |
| }, | |
| { | |
| "epoch": 1.5847999472678136, | |
| "grad_norm": 0.01274865586310625, | |
| "learning_rate": 4.153582266789815e-05, | |
| "loss": 0.0447, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 1.5861182519280206, | |
| "grad_norm": 0.03555435314774513, | |
| "learning_rate": 4.1403879139728196e-05, | |
| "loss": 0.0078, | |
| "step": 6015 | |
| }, | |
| { | |
| "epoch": 1.5874365565882276, | |
| "grad_norm": 0.0011938117677345872, | |
| "learning_rate": 4.127193561155825e-05, | |
| "loss": 0.0136, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 1.5887548612484346, | |
| "grad_norm": 0.9741255640983582, | |
| "learning_rate": 4.1139992083388314e-05, | |
| "loss": 0.0153, | |
| "step": 6025 | |
| }, | |
| { | |
| "epoch": 1.5900731659086413, | |
| "grad_norm": 0.011220674030482769, | |
| "learning_rate": 4.100804855521837e-05, | |
| "loss": 0.0262, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 1.5913914705688486, | |
| "grad_norm": 0.021556466817855835, | |
| "learning_rate": 4.0876105027048425e-05, | |
| "loss": 0.0044, | |
| "step": 6035 | |
| }, | |
| { | |
| "epoch": 1.5927097752290553, | |
| "grad_norm": 0.2725502848625183, | |
| "learning_rate": 4.074416149887848e-05, | |
| "loss": 0.0558, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 1.5940280798892625, | |
| "grad_norm": 0.6407182216644287, | |
| "learning_rate": 4.0612217970708537e-05, | |
| "loss": 0.0261, | |
| "step": 6045 | |
| }, | |
| { | |
| "epoch": 1.5953463845494693, | |
| "grad_norm": 0.0024960115551948547, | |
| "learning_rate": 4.04802744425386e-05, | |
| "loss": 0.0128, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.5966646892096763, | |
| "grad_norm": 0.11380109190940857, | |
| "learning_rate": 4.0348330914368655e-05, | |
| "loss": 0.0199, | |
| "step": 6055 | |
| }, | |
| { | |
| "epoch": 1.5979829938698833, | |
| "grad_norm": 0.18358005583286285, | |
| "learning_rate": 4.0216387386198704e-05, | |
| "loss": 0.0083, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 1.5993012985300903, | |
| "grad_norm": 0.06412303447723389, | |
| "learning_rate": 4.0084443858028766e-05, | |
| "loss": 0.0548, | |
| "step": 6065 | |
| }, | |
| { | |
| "epoch": 1.6006196031902973, | |
| "grad_norm": 0.6999421119689941, | |
| "learning_rate": 3.995250032985882e-05, | |
| "loss": 0.0074, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 1.6019379078505043, | |
| "grad_norm": 0.18698133528232574, | |
| "learning_rate": 3.982055680168888e-05, | |
| "loss": 0.0542, | |
| "step": 6075 | |
| }, | |
| { | |
| "epoch": 1.6032562125107113, | |
| "grad_norm": 0.014717207290232182, | |
| "learning_rate": 3.968861327351894e-05, | |
| "loss": 0.0071, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 1.604574517170918, | |
| "grad_norm": 0.0765385851264, | |
| "learning_rate": 3.955666974534899e-05, | |
| "loss": 0.0063, | |
| "step": 6085 | |
| }, | |
| { | |
| "epoch": 1.6058928218311253, | |
| "grad_norm": 0.4332450330257416, | |
| "learning_rate": 3.9424726217179044e-05, | |
| "loss": 0.0071, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 1.607211126491332, | |
| "grad_norm": 0.003700035158544779, | |
| "learning_rate": 3.929278268900911e-05, | |
| "loss": 0.0052, | |
| "step": 6095 | |
| }, | |
| { | |
| "epoch": 1.6085294311515392, | |
| "grad_norm": 0.02500278130173683, | |
| "learning_rate": 3.916083916083916e-05, | |
| "loss": 0.0387, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.609847735811746, | |
| "grad_norm": 0.023568281903862953, | |
| "learning_rate": 3.902889563266922e-05, | |
| "loss": 0.0594, | |
| "step": 6105 | |
| }, | |
| { | |
| "epoch": 1.6111660404719532, | |
| "grad_norm": 0.02687825821340084, | |
| "learning_rate": 3.8896952104499274e-05, | |
| "loss": 0.0229, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 1.61248434513216, | |
| "grad_norm": 0.005178579594939947, | |
| "learning_rate": 3.876500857632933e-05, | |
| "loss": 0.0293, | |
| "step": 6115 | |
| }, | |
| { | |
| "epoch": 1.613802649792367, | |
| "grad_norm": 0.3987988531589508, | |
| "learning_rate": 3.863306504815939e-05, | |
| "loss": 0.015, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 1.615120954452574, | |
| "grad_norm": 0.18915466964244843, | |
| "learning_rate": 3.850112151998945e-05, | |
| "loss": 0.023, | |
| "step": 6125 | |
| }, | |
| { | |
| "epoch": 1.616439259112781, | |
| "grad_norm": 0.015252528712153435, | |
| "learning_rate": 3.83691779918195e-05, | |
| "loss": 0.0185, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 1.617757563772988, | |
| "grad_norm": 0.04947187379002571, | |
| "learning_rate": 3.823723446364956e-05, | |
| "loss": 0.0131, | |
| "step": 6135 | |
| }, | |
| { | |
| "epoch": 1.619075868433195, | |
| "grad_norm": 0.017095958814024925, | |
| "learning_rate": 3.8105290935479615e-05, | |
| "loss": 0.0071, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 1.620394173093402, | |
| "grad_norm": 0.013050337322056293, | |
| "learning_rate": 3.797334740730967e-05, | |
| "loss": 0.0038, | |
| "step": 6145 | |
| }, | |
| { | |
| "epoch": 1.6217124777536087, | |
| "grad_norm": 0.08132806420326233, | |
| "learning_rate": 3.784140387913973e-05, | |
| "loss": 0.0043, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.623030782413816, | |
| "grad_norm": 0.020741304382681847, | |
| "learning_rate": 3.770946035096979e-05, | |
| "loss": 0.006, | |
| "step": 6155 | |
| }, | |
| { | |
| "epoch": 1.6243490870740227, | |
| "grad_norm": 0.0576217919588089, | |
| "learning_rate": 3.7577516822799844e-05, | |
| "loss": 0.0033, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 1.62566739173423, | |
| "grad_norm": 0.03032900020480156, | |
| "learning_rate": 3.74455732946299e-05, | |
| "loss": 0.0318, | |
| "step": 6165 | |
| }, | |
| { | |
| "epoch": 1.6269856963944367, | |
| "grad_norm": 0.8868799209594727, | |
| "learning_rate": 3.7313629766459955e-05, | |
| "loss": 0.0304, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 1.6283040010546437, | |
| "grad_norm": 0.003816834883764386, | |
| "learning_rate": 3.718168623829002e-05, | |
| "loss": 0.003, | |
| "step": 6175 | |
| }, | |
| { | |
| "epoch": 1.6296223057148507, | |
| "grad_norm": 0.05368296429514885, | |
| "learning_rate": 3.704974271012007e-05, | |
| "loss": 0.0064, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 1.6309406103750577, | |
| "grad_norm": 0.09963366389274597, | |
| "learning_rate": 3.691779918195012e-05, | |
| "loss": 0.0097, | |
| "step": 6185 | |
| }, | |
| { | |
| "epoch": 1.6322589150352647, | |
| "grad_norm": 0.006273225415498018, | |
| "learning_rate": 3.6785855653780185e-05, | |
| "loss": 0.0071, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 1.6335772196954716, | |
| "grad_norm": 0.15079188346862793, | |
| "learning_rate": 3.665391212561024e-05, | |
| "loss": 0.0058, | |
| "step": 6195 | |
| }, | |
| { | |
| "epoch": 1.6348955243556786, | |
| "grad_norm": 0.004980973433703184, | |
| "learning_rate": 3.6521968597440296e-05, | |
| "loss": 0.0051, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.6362138290158854, | |
| "grad_norm": 0.004235363099724054, | |
| "learning_rate": 3.639002506927036e-05, | |
| "loss": 0.0028, | |
| "step": 6205 | |
| }, | |
| { | |
| "epoch": 1.6375321336760926, | |
| "grad_norm": 0.003829963505268097, | |
| "learning_rate": 3.625808154110041e-05, | |
| "loss": 0.0347, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 1.6388504383362994, | |
| "grad_norm": 0.021650686860084534, | |
| "learning_rate": 3.612613801293046e-05, | |
| "loss": 0.0036, | |
| "step": 6215 | |
| }, | |
| { | |
| "epoch": 1.6401687429965066, | |
| "grad_norm": 0.06326934695243835, | |
| "learning_rate": 3.5994194484760525e-05, | |
| "loss": 0.0228, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 1.6414870476567134, | |
| "grad_norm": 0.017276322469115257, | |
| "learning_rate": 3.586225095659058e-05, | |
| "loss": 0.0025, | |
| "step": 6225 | |
| }, | |
| { | |
| "epoch": 1.6428053523169206, | |
| "grad_norm": 0.005066063720732927, | |
| "learning_rate": 3.573030742842064e-05, | |
| "loss": 0.0047, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 1.6441236569771274, | |
| "grad_norm": 0.003512267954647541, | |
| "learning_rate": 3.559836390025069e-05, | |
| "loss": 0.0018, | |
| "step": 6235 | |
| }, | |
| { | |
| "epoch": 1.6454419616373344, | |
| "grad_norm": 0.004347699694335461, | |
| "learning_rate": 3.546642037208075e-05, | |
| "loss": 0.0045, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 1.6467602662975414, | |
| "grad_norm": 0.008277533575892448, | |
| "learning_rate": 3.533447684391081e-05, | |
| "loss": 0.0456, | |
| "step": 6245 | |
| }, | |
| { | |
| "epoch": 1.6480785709577483, | |
| "grad_norm": 0.00973033718764782, | |
| "learning_rate": 3.5202533315740866e-05, | |
| "loss": 0.0215, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.6493968756179553, | |
| "grad_norm": 1.9432978630065918, | |
| "learning_rate": 3.507058978757092e-05, | |
| "loss": 0.0132, | |
| "step": 6255 | |
| }, | |
| { | |
| "epoch": 1.6507151802781623, | |
| "grad_norm": 0.2693535387516022, | |
| "learning_rate": 3.493864625940098e-05, | |
| "loss": 0.0037, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 1.6520334849383693, | |
| "grad_norm": 0.02107766456902027, | |
| "learning_rate": 3.480670273123103e-05, | |
| "loss": 0.0031, | |
| "step": 6265 | |
| }, | |
| { | |
| "epoch": 1.653351789598576, | |
| "grad_norm": 0.07168436795473099, | |
| "learning_rate": 3.467475920306109e-05, | |
| "loss": 0.0101, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 1.6546700942587833, | |
| "grad_norm": 0.06479799002408981, | |
| "learning_rate": 3.454281567489115e-05, | |
| "loss": 0.0032, | |
| "step": 6275 | |
| }, | |
| { | |
| "epoch": 1.65598839891899, | |
| "grad_norm": 0.0013557536294683814, | |
| "learning_rate": 3.441087214672121e-05, | |
| "loss": 0.0037, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 1.6573067035791973, | |
| "grad_norm": 0.07330150157213211, | |
| "learning_rate": 3.427892861855126e-05, | |
| "loss": 0.0031, | |
| "step": 6285 | |
| }, | |
| { | |
| "epoch": 1.658625008239404, | |
| "grad_norm": 0.08246012777090073, | |
| "learning_rate": 3.414698509038132e-05, | |
| "loss": 0.0028, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 1.659943312899611, | |
| "grad_norm": 0.6232367157936096, | |
| "learning_rate": 3.4015041562211374e-05, | |
| "loss": 0.0042, | |
| "step": 6295 | |
| }, | |
| { | |
| "epoch": 1.661261617559818, | |
| "grad_norm": 0.007676729932427406, | |
| "learning_rate": 3.388309803404143e-05, | |
| "loss": 0.0501, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.662579922220025, | |
| "grad_norm": 0.02081216312944889, | |
| "learning_rate": 3.375115450587149e-05, | |
| "loss": 0.0047, | |
| "step": 6305 | |
| }, | |
| { | |
| "epoch": 1.663898226880232, | |
| "grad_norm": 0.008829087018966675, | |
| "learning_rate": 3.361921097770154e-05, | |
| "loss": 0.0298, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 1.665216531540439, | |
| "grad_norm": 0.4426127076148987, | |
| "learning_rate": 3.34872674495316e-05, | |
| "loss": 0.0045, | |
| "step": 6315 | |
| }, | |
| { | |
| "epoch": 1.666534836200646, | |
| "grad_norm": 0.025818035006523132, | |
| "learning_rate": 3.335532392136166e-05, | |
| "loss": 0.0028, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 1.6678531408608528, | |
| "grad_norm": 0.6068133115768433, | |
| "learning_rate": 3.3223380393191715e-05, | |
| "loss": 0.0202, | |
| "step": 6325 | |
| }, | |
| { | |
| "epoch": 1.66917144552106, | |
| "grad_norm": 0.02740122564136982, | |
| "learning_rate": 3.309143686502178e-05, | |
| "loss": 0.0025, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 1.6704897501812668, | |
| "grad_norm": 0.15878735482692719, | |
| "learning_rate": 3.2959493336851826e-05, | |
| "loss": 0.004, | |
| "step": 6335 | |
| }, | |
| { | |
| "epoch": 1.671808054841474, | |
| "grad_norm": 0.006827466655522585, | |
| "learning_rate": 3.282754980868188e-05, | |
| "loss": 0.0048, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 1.6731263595016808, | |
| "grad_norm": 0.19508551061153412, | |
| "learning_rate": 3.2695606280511944e-05, | |
| "loss": 0.0025, | |
| "step": 6345 | |
| }, | |
| { | |
| "epoch": 1.674444664161888, | |
| "grad_norm": 0.8176754713058472, | |
| "learning_rate": 3.2563662752342e-05, | |
| "loss": 0.0151, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.6757629688220947, | |
| "grad_norm": 0.011672024615108967, | |
| "learning_rate": 3.2431719224172055e-05, | |
| "loss": 0.0452, | |
| "step": 6355 | |
| }, | |
| { | |
| "epoch": 1.6770812734823017, | |
| "grad_norm": 0.015824951231479645, | |
| "learning_rate": 3.229977569600211e-05, | |
| "loss": 0.0236, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 1.6783995781425087, | |
| "grad_norm": 0.1358737051486969, | |
| "learning_rate": 3.216783216783217e-05, | |
| "loss": 0.0078, | |
| "step": 6365 | |
| }, | |
| { | |
| "epoch": 1.6797178828027157, | |
| "grad_norm": 0.004896901547908783, | |
| "learning_rate": 3.203588863966223e-05, | |
| "loss": 0.0042, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 1.6810361874629227, | |
| "grad_norm": 0.22593103349208832, | |
| "learning_rate": 3.1903945111492285e-05, | |
| "loss": 0.0053, | |
| "step": 6375 | |
| }, | |
| { | |
| "epoch": 1.6823544921231297, | |
| "grad_norm": 0.0073196059092879295, | |
| "learning_rate": 3.177200158332234e-05, | |
| "loss": 0.0287, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 1.6836727967833367, | |
| "grad_norm": 0.018524926155805588, | |
| "learning_rate": 3.1640058055152396e-05, | |
| "loss": 0.0122, | |
| "step": 6385 | |
| }, | |
| { | |
| "epoch": 1.6849911014435435, | |
| "grad_norm": 0.7453815937042236, | |
| "learning_rate": 3.150811452698245e-05, | |
| "loss": 0.0378, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 1.6863094061037507, | |
| "grad_norm": 0.22409795224666595, | |
| "learning_rate": 3.137617099881251e-05, | |
| "loss": 0.0282, | |
| "step": 6395 | |
| }, | |
| { | |
| "epoch": 1.6876277107639575, | |
| "grad_norm": 0.005432693753391504, | |
| "learning_rate": 3.124422747064257e-05, | |
| "loss": 0.0162, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.6889460154241647, | |
| "grad_norm": 0.1493055820465088, | |
| "learning_rate": 3.1112283942472626e-05, | |
| "loss": 0.0123, | |
| "step": 6405 | |
| }, | |
| { | |
| "epoch": 1.6902643200843714, | |
| "grad_norm": 0.1638440042734146, | |
| "learning_rate": 3.0980340414302674e-05, | |
| "loss": 0.0058, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 1.6915826247445784, | |
| "grad_norm": 0.015779908746480942, | |
| "learning_rate": 3.084839688613274e-05, | |
| "loss": 0.0157, | |
| "step": 6415 | |
| }, | |
| { | |
| "epoch": 1.6929009294047854, | |
| "grad_norm": 0.0012348912423476577, | |
| "learning_rate": 3.071645335796279e-05, | |
| "loss": 0.0016, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 1.6942192340649924, | |
| "grad_norm": 0.05294624716043472, | |
| "learning_rate": 3.058450982979285e-05, | |
| "loss": 0.0037, | |
| "step": 6425 | |
| }, | |
| { | |
| "epoch": 1.6955375387251994, | |
| "grad_norm": 0.01926981844007969, | |
| "learning_rate": 3.045256630162291e-05, | |
| "loss": 0.0053, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 1.6968558433854064, | |
| "grad_norm": 0.005958891473710537, | |
| "learning_rate": 3.0320622773452963e-05, | |
| "loss": 0.0025, | |
| "step": 6435 | |
| }, | |
| { | |
| "epoch": 1.6981741480456134, | |
| "grad_norm": 0.001902201445773244, | |
| "learning_rate": 3.018867924528302e-05, | |
| "loss": 0.0027, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 1.6994924527058202, | |
| "grad_norm": 0.036614127457141876, | |
| "learning_rate": 3.0056735717113078e-05, | |
| "loss": 0.0026, | |
| "step": 6445 | |
| }, | |
| { | |
| "epoch": 1.7008107573660274, | |
| "grad_norm": 0.07294526696205139, | |
| "learning_rate": 2.9924792188943133e-05, | |
| "loss": 0.0042, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.7021290620262342, | |
| "grad_norm": 0.42822372913360596, | |
| "learning_rate": 2.9792848660773192e-05, | |
| "loss": 0.013, | |
| "step": 6455 | |
| }, | |
| { | |
| "epoch": 1.7034473666864414, | |
| "grad_norm": 0.036622967571020126, | |
| "learning_rate": 2.9660905132603245e-05, | |
| "loss": 0.0029, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 1.7047656713466481, | |
| "grad_norm": 0.08314034342765808, | |
| "learning_rate": 2.9528961604433304e-05, | |
| "loss": 0.0043, | |
| "step": 6465 | |
| }, | |
| { | |
| "epoch": 1.7060839760068551, | |
| "grad_norm": 0.0005654952838085592, | |
| "learning_rate": 2.939701807626336e-05, | |
| "loss": 0.0595, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 1.7074022806670621, | |
| "grad_norm": 0.004545385017991066, | |
| "learning_rate": 2.926507454809342e-05, | |
| "loss": 0.0044, | |
| "step": 6475 | |
| }, | |
| { | |
| "epoch": 1.7087205853272691, | |
| "grad_norm": 0.00033831383916549385, | |
| "learning_rate": 2.9133131019923477e-05, | |
| "loss": 0.0046, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 1.710038889987476, | |
| "grad_norm": 0.0019903562497347593, | |
| "learning_rate": 2.900118749175353e-05, | |
| "loss": 0.0026, | |
| "step": 6485 | |
| }, | |
| { | |
| "epoch": 1.711357194647683, | |
| "grad_norm": 0.10188104957342148, | |
| "learning_rate": 2.8869243963583585e-05, | |
| "loss": 0.0069, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 1.71267549930789, | |
| "grad_norm": 0.2123432606458664, | |
| "learning_rate": 2.8737300435413644e-05, | |
| "loss": 0.0199, | |
| "step": 6495 | |
| }, | |
| { | |
| "epoch": 1.7139938039680969, | |
| "grad_norm": 0.43209517002105713, | |
| "learning_rate": 2.8605356907243703e-05, | |
| "loss": 0.0099, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.7139938039680969, | |
| "eval_loss": 0.024327505379915237, | |
| "eval_runtime": 452.0052, | |
| "eval_samples_per_second": 7.46, | |
| "eval_steps_per_second": 3.73, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.715312108628304, | |
| "grad_norm": 0.009868285618722439, | |
| "learning_rate": 2.847341337907376e-05, | |
| "loss": 0.0025, | |
| "step": 6505 | |
| }, | |
| { | |
| "epoch": 1.7166304132885108, | |
| "grad_norm": 0.00778606254607439, | |
| "learning_rate": 2.834146985090381e-05, | |
| "loss": 0.0028, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 1.717948717948718, | |
| "grad_norm": 0.02987460047006607, | |
| "learning_rate": 2.820952632273387e-05, | |
| "loss": 0.0068, | |
| "step": 6515 | |
| }, | |
| { | |
| "epoch": 1.7192670226089248, | |
| "grad_norm": 0.04475142061710358, | |
| "learning_rate": 2.807758279456393e-05, | |
| "loss": 0.0022, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 1.720585327269132, | |
| "grad_norm": 0.12720516324043274, | |
| "learning_rate": 2.7945639266393985e-05, | |
| "loss": 0.0488, | |
| "step": 6525 | |
| }, | |
| { | |
| "epoch": 1.7219036319293388, | |
| "grad_norm": 0.0011463731061667204, | |
| "learning_rate": 2.7813695738224044e-05, | |
| "loss": 0.0023, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 1.7232219365895458, | |
| "grad_norm": 0.008907752111554146, | |
| "learning_rate": 2.7681752210054096e-05, | |
| "loss": 0.0039, | |
| "step": 6535 | |
| }, | |
| { | |
| "epoch": 1.7245402412497528, | |
| "grad_norm": 0.008416680619120598, | |
| "learning_rate": 2.7549808681884156e-05, | |
| "loss": 0.0055, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 1.7258585459099598, | |
| "grad_norm": 0.26278871297836304, | |
| "learning_rate": 2.741786515371421e-05, | |
| "loss": 0.0386, | |
| "step": 6545 | |
| }, | |
| { | |
| "epoch": 1.7271768505701668, | |
| "grad_norm": 0.01750275492668152, | |
| "learning_rate": 2.728592162554427e-05, | |
| "loss": 0.0048, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.7284951552303738, | |
| "grad_norm": 0.009483959525823593, | |
| "learning_rate": 2.7153978097374326e-05, | |
| "loss": 0.0061, | |
| "step": 6555 | |
| }, | |
| { | |
| "epoch": 1.7298134598905808, | |
| "grad_norm": 0.016591722145676613, | |
| "learning_rate": 2.7022034569204378e-05, | |
| "loss": 0.0058, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 1.7311317645507875, | |
| "grad_norm": 0.5120682716369629, | |
| "learning_rate": 2.6890091041034437e-05, | |
| "loss": 0.0229, | |
| "step": 6565 | |
| }, | |
| { | |
| "epoch": 1.7324500692109948, | |
| "grad_norm": 0.03748248517513275, | |
| "learning_rate": 2.6758147512864496e-05, | |
| "loss": 0.0026, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 1.7337683738712015, | |
| "grad_norm": 0.08328749984502792, | |
| "learning_rate": 2.6626203984694552e-05, | |
| "loss": 0.0052, | |
| "step": 6575 | |
| }, | |
| { | |
| "epoch": 1.7350866785314087, | |
| "grad_norm": 0.012284482829272747, | |
| "learning_rate": 2.649426045652461e-05, | |
| "loss": 0.0353, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 1.7364049831916155, | |
| "grad_norm": 0.06362583488225937, | |
| "learning_rate": 2.6362316928354663e-05, | |
| "loss": 0.0309, | |
| "step": 6585 | |
| }, | |
| { | |
| "epoch": 1.7377232878518225, | |
| "grad_norm": 0.01475360058248043, | |
| "learning_rate": 2.6230373400184722e-05, | |
| "loss": 0.0034, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 1.7390415925120295, | |
| "grad_norm": 0.002241638721898198, | |
| "learning_rate": 2.6098429872014778e-05, | |
| "loss": 0.0365, | |
| "step": 6595 | |
| }, | |
| { | |
| "epoch": 1.7403598971722365, | |
| "grad_norm": 0.11375941336154938, | |
| "learning_rate": 2.5966486343844837e-05, | |
| "loss": 0.0241, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.7416782018324435, | |
| "grad_norm": 0.009631779976189137, | |
| "learning_rate": 2.5834542815674896e-05, | |
| "loss": 0.0026, | |
| "step": 6605 | |
| }, | |
| { | |
| "epoch": 1.7429965064926505, | |
| "grad_norm": 0.12113262712955475, | |
| "learning_rate": 2.570259928750495e-05, | |
| "loss": 0.0207, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 1.7443148111528575, | |
| "grad_norm": 0.006536155007779598, | |
| "learning_rate": 2.5570655759335004e-05, | |
| "loss": 0.0022, | |
| "step": 6615 | |
| }, | |
| { | |
| "epoch": 1.7456331158130642, | |
| "grad_norm": 0.043030887842178345, | |
| "learning_rate": 2.5438712231165063e-05, | |
| "loss": 0.003, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 1.7469514204732715, | |
| "grad_norm": 0.00860620103776455, | |
| "learning_rate": 2.5306768702995122e-05, | |
| "loss": 0.027, | |
| "step": 6625 | |
| }, | |
| { | |
| "epoch": 1.7482697251334782, | |
| "grad_norm": 0.014589210972189903, | |
| "learning_rate": 2.5174825174825178e-05, | |
| "loss": 0.0224, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 1.7495880297936854, | |
| "grad_norm": 0.01215316355228424, | |
| "learning_rate": 2.504288164665523e-05, | |
| "loss": 0.011, | |
| "step": 6635 | |
| }, | |
| { | |
| "epoch": 1.7509063344538922, | |
| "grad_norm": 0.10951556265354156, | |
| "learning_rate": 2.491093811848529e-05, | |
| "loss": 0.0384, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 1.7522246391140994, | |
| "grad_norm": 0.30859875679016113, | |
| "learning_rate": 2.4778994590315345e-05, | |
| "loss": 0.0031, | |
| "step": 6645 | |
| }, | |
| { | |
| "epoch": 1.7535429437743062, | |
| "grad_norm": 0.025427229702472687, | |
| "learning_rate": 2.4647051062145404e-05, | |
| "loss": 0.0171, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.7548612484345132, | |
| "grad_norm": 0.03334197774529457, | |
| "learning_rate": 2.451510753397546e-05, | |
| "loss": 0.0473, | |
| "step": 6655 | |
| }, | |
| { | |
| "epoch": 1.7561795530947202, | |
| "grad_norm": 0.013445639982819557, | |
| "learning_rate": 2.438316400580552e-05, | |
| "loss": 0.0056, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 1.7574978577549272, | |
| "grad_norm": 0.008306960575282574, | |
| "learning_rate": 2.425122047763557e-05, | |
| "loss": 0.0104, | |
| "step": 6665 | |
| }, | |
| { | |
| "epoch": 1.7588161624151342, | |
| "grad_norm": 0.012615012936294079, | |
| "learning_rate": 2.411927694946563e-05, | |
| "loss": 0.0097, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 1.7601344670753412, | |
| "grad_norm": 0.006827410310506821, | |
| "learning_rate": 2.398733342129569e-05, | |
| "loss": 0.0057, | |
| "step": 6675 | |
| }, | |
| { | |
| "epoch": 1.7614527717355482, | |
| "grad_norm": 0.017035294324159622, | |
| "learning_rate": 2.3855389893125745e-05, | |
| "loss": 0.0035, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 1.762771076395755, | |
| "grad_norm": 0.036102693527936935, | |
| "learning_rate": 2.37234463649558e-05, | |
| "loss": 0.0031, | |
| "step": 6685 | |
| }, | |
| { | |
| "epoch": 1.7640893810559621, | |
| "grad_norm": 0.5004498958587646, | |
| "learning_rate": 2.3591502836785856e-05, | |
| "loss": 0.0217, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 1.765407685716169, | |
| "grad_norm": 0.017726672813296318, | |
| "learning_rate": 2.3459559308615915e-05, | |
| "loss": 0.0112, | |
| "step": 6695 | |
| }, | |
| { | |
| "epoch": 1.7667259903763761, | |
| "grad_norm": 0.00940331444144249, | |
| "learning_rate": 2.332761578044597e-05, | |
| "loss": 0.0107, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.768044295036583, | |
| "grad_norm": 0.007495497819036245, | |
| "learning_rate": 2.3195672252276026e-05, | |
| "loss": 0.0032, | |
| "step": 6705 | |
| }, | |
| { | |
| "epoch": 1.7693625996967899, | |
| "grad_norm": 0.6863199472427368, | |
| "learning_rate": 2.3063728724106085e-05, | |
| "loss": 0.034, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 1.7706809043569969, | |
| "grad_norm": 0.004587489180266857, | |
| "learning_rate": 2.293178519593614e-05, | |
| "loss": 0.0032, | |
| "step": 6715 | |
| }, | |
| { | |
| "epoch": 1.7719992090172039, | |
| "grad_norm": 0.017706016078591347, | |
| "learning_rate": 2.2799841667766197e-05, | |
| "loss": 0.0036, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 1.7733175136774109, | |
| "grad_norm": 0.012740216217935085, | |
| "learning_rate": 2.2667898139596252e-05, | |
| "loss": 0.0147, | |
| "step": 6725 | |
| }, | |
| { | |
| "epoch": 1.7746358183376179, | |
| "grad_norm": 0.010391579940915108, | |
| "learning_rate": 2.253595461142631e-05, | |
| "loss": 0.0041, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 1.7759541229978248, | |
| "grad_norm": 0.021570540964603424, | |
| "learning_rate": 2.2404011083256367e-05, | |
| "loss": 0.0363, | |
| "step": 6735 | |
| }, | |
| { | |
| "epoch": 1.7772724276580316, | |
| "grad_norm": 0.005778402555733919, | |
| "learning_rate": 2.2272067555086423e-05, | |
| "loss": 0.002, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 1.7785907323182388, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.2140124026916482e-05, | |
| "loss": 0.0058, | |
| "step": 6745 | |
| }, | |
| { | |
| "epoch": 1.7799090369784456, | |
| "grad_norm": 0.010869967751204967, | |
| "learning_rate": 2.2008180498746537e-05, | |
| "loss": 0.0036, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.7812273416386528, | |
| "grad_norm": 0.04336518794298172, | |
| "learning_rate": 2.1876236970576593e-05, | |
| "loss": 0.0074, | |
| "step": 6755 | |
| }, | |
| { | |
| "epoch": 1.7825456462988596, | |
| "grad_norm": 0.008664094842970371, | |
| "learning_rate": 2.1744293442406652e-05, | |
| "loss": 0.0027, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 1.7838639509590668, | |
| "grad_norm": 0.9408183097839355, | |
| "learning_rate": 2.1612349914236708e-05, | |
| "loss": 0.0371, | |
| "step": 6765 | |
| }, | |
| { | |
| "epoch": 1.7851822556192736, | |
| "grad_norm": 0.016822539269924164, | |
| "learning_rate": 2.1480406386066763e-05, | |
| "loss": 0.0137, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 1.7865005602794806, | |
| "grad_norm": 0.00829544197767973, | |
| "learning_rate": 2.134846285789682e-05, | |
| "loss": 0.0134, | |
| "step": 6775 | |
| }, | |
| { | |
| "epoch": 1.7878188649396876, | |
| "grad_norm": 0.0035508016590029, | |
| "learning_rate": 2.1216519329726878e-05, | |
| "loss": 0.0231, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 1.7891371695998946, | |
| "grad_norm": 0.13871321082115173, | |
| "learning_rate": 2.1084575801556937e-05, | |
| "loss": 0.0296, | |
| "step": 6785 | |
| }, | |
| { | |
| "epoch": 1.7904554742601015, | |
| "grad_norm": 0.002578354673460126, | |
| "learning_rate": 2.095263227338699e-05, | |
| "loss": 0.0178, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 1.7917737789203085, | |
| "grad_norm": 0.5279458165168762, | |
| "learning_rate": 2.082068874521705e-05, | |
| "loss": 0.0336, | |
| "step": 6795 | |
| }, | |
| { | |
| "epoch": 1.7930920835805155, | |
| "grad_norm": 0.0017439400544390082, | |
| "learning_rate": 2.0688745217047104e-05, | |
| "loss": 0.0031, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.7944103882407223, | |
| "grad_norm": 0.007989778183400631, | |
| "learning_rate": 2.055680168887716e-05, | |
| "loss": 0.0081, | |
| "step": 6805 | |
| }, | |
| { | |
| "epoch": 1.7957286929009295, | |
| "grad_norm": 0.015163813717663288, | |
| "learning_rate": 2.042485816070722e-05, | |
| "loss": 0.0234, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 1.7970469975611363, | |
| "grad_norm": 0.10615389794111252, | |
| "learning_rate": 2.0292914632537275e-05, | |
| "loss": 0.0144, | |
| "step": 6815 | |
| }, | |
| { | |
| "epoch": 1.7983653022213435, | |
| "grad_norm": 0.03466172143816948, | |
| "learning_rate": 2.0160971104367334e-05, | |
| "loss": 0.0036, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 1.7996836068815503, | |
| "grad_norm": 0.047511328011751175, | |
| "learning_rate": 2.0029027576197386e-05, | |
| "loss": 0.002, | |
| "step": 6825 | |
| }, | |
| { | |
| "epoch": 1.8010019115417573, | |
| "grad_norm": 0.019772246479988098, | |
| "learning_rate": 1.9897084048027445e-05, | |
| "loss": 0.0049, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 1.8023202162019643, | |
| "grad_norm": 0.1156701073050499, | |
| "learning_rate": 1.9765140519857504e-05, | |
| "loss": 0.0033, | |
| "step": 6835 | |
| }, | |
| { | |
| "epoch": 1.8036385208621712, | |
| "grad_norm": 0.010991690680384636, | |
| "learning_rate": 1.963319699168756e-05, | |
| "loss": 0.0036, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 1.8049568255223782, | |
| "grad_norm": 0.29658815264701843, | |
| "learning_rate": 1.9501253463517615e-05, | |
| "loss": 0.0042, | |
| "step": 6845 | |
| }, | |
| { | |
| "epoch": 1.8062751301825852, | |
| "grad_norm": 0.056147243827581406, | |
| "learning_rate": 1.936930993534767e-05, | |
| "loss": 0.0052, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.8075934348427922, | |
| "grad_norm": 0.010382590815424919, | |
| "learning_rate": 1.923736640717773e-05, | |
| "loss": 0.0033, | |
| "step": 6855 | |
| }, | |
| { | |
| "epoch": 1.808911739502999, | |
| "grad_norm": 1.1247020959854126, | |
| "learning_rate": 1.9105422879007786e-05, | |
| "loss": 0.0112, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 1.8102300441632062, | |
| "grad_norm": 1.4515737295150757, | |
| "learning_rate": 1.897347935083784e-05, | |
| "loss": 0.0202, | |
| "step": 6865 | |
| }, | |
| { | |
| "epoch": 1.811548348823413, | |
| "grad_norm": 0.016307830810546875, | |
| "learning_rate": 1.88415358226679e-05, | |
| "loss": 0.0148, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 1.8128666534836202, | |
| "grad_norm": 0.0745878592133522, | |
| "learning_rate": 1.8709592294497956e-05, | |
| "loss": 0.0062, | |
| "step": 6875 | |
| }, | |
| { | |
| "epoch": 1.814184958143827, | |
| "grad_norm": 0.02554013952612877, | |
| "learning_rate": 1.8577648766328012e-05, | |
| "loss": 0.003, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 1.815503262804034, | |
| "grad_norm": 0.45748665928840637, | |
| "learning_rate": 1.844570523815807e-05, | |
| "loss": 0.0386, | |
| "step": 6885 | |
| }, | |
| { | |
| "epoch": 1.816821567464241, | |
| "grad_norm": 0.013801589608192444, | |
| "learning_rate": 1.8313761709988126e-05, | |
| "loss": 0.0342, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 1.818139872124448, | |
| "grad_norm": 0.6251696944236755, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 0.0101, | |
| "step": 6895 | |
| }, | |
| { | |
| "epoch": 1.819458176784655, | |
| "grad_norm": 0.28203102946281433, | |
| "learning_rate": 1.8049874653648238e-05, | |
| "loss": 0.0032, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.820776481444862, | |
| "grad_norm": 0.28511062264442444, | |
| "learning_rate": 1.7917931125478297e-05, | |
| "loss": 0.0343, | |
| "step": 6905 | |
| }, | |
| { | |
| "epoch": 1.822094786105069, | |
| "grad_norm": 0.004940215498209, | |
| "learning_rate": 1.7785987597308352e-05, | |
| "loss": 0.0265, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 1.8234130907652757, | |
| "grad_norm": 0.002903093583881855, | |
| "learning_rate": 1.7654044069138408e-05, | |
| "loss": 0.0025, | |
| "step": 6915 | |
| }, | |
| { | |
| "epoch": 1.824731395425483, | |
| "grad_norm": 0.008801674470305443, | |
| "learning_rate": 1.7522100540968467e-05, | |
| "loss": 0.0246, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 1.8260497000856897, | |
| "grad_norm": 0.13823826611042023, | |
| "learning_rate": 1.7390157012798523e-05, | |
| "loss": 0.0058, | |
| "step": 6925 | |
| }, | |
| { | |
| "epoch": 1.827368004745897, | |
| "grad_norm": 0.020868878811597824, | |
| "learning_rate": 1.725821348462858e-05, | |
| "loss": 0.0014, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 1.8286863094061037, | |
| "grad_norm": 0.0027356524951756, | |
| "learning_rate": 1.7126269956458638e-05, | |
| "loss": 0.0035, | |
| "step": 6935 | |
| }, | |
| { | |
| "epoch": 1.8300046140663109, | |
| "grad_norm": 0.06023023650050163, | |
| "learning_rate": 1.6994326428288693e-05, | |
| "loss": 0.0212, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 1.8313229187265176, | |
| "grad_norm": 0.0009826788445934653, | |
| "learning_rate": 1.686238290011875e-05, | |
| "loss": 0.0034, | |
| "step": 6945 | |
| }, | |
| { | |
| "epoch": 1.8326412233867246, | |
| "grad_norm": 0.2867647707462311, | |
| "learning_rate": 1.6730439371948805e-05, | |
| "loss": 0.0146, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.8339595280469316, | |
| "grad_norm": 0.004501632414758205, | |
| "learning_rate": 1.6598495843778864e-05, | |
| "loss": 0.0026, | |
| "step": 6955 | |
| }, | |
| { | |
| "epoch": 1.8352778327071386, | |
| "grad_norm": 0.01251616608351469, | |
| "learning_rate": 1.6466552315608923e-05, | |
| "loss": 0.0107, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 1.8365961373673456, | |
| "grad_norm": 0.054781850427389145, | |
| "learning_rate": 1.6334608787438975e-05, | |
| "loss": 0.0044, | |
| "step": 6965 | |
| }, | |
| { | |
| "epoch": 1.8379144420275526, | |
| "grad_norm": 0.1120501235127449, | |
| "learning_rate": 1.6202665259269034e-05, | |
| "loss": 0.0284, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 1.8392327466877596, | |
| "grad_norm": 0.001668553682975471, | |
| "learning_rate": 1.607072173109909e-05, | |
| "loss": 0.0169, | |
| "step": 6975 | |
| }, | |
| { | |
| "epoch": 1.8405510513479664, | |
| "grad_norm": 1.6374458074569702, | |
| "learning_rate": 1.593877820292915e-05, | |
| "loss": 0.031, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 1.8418693560081736, | |
| "grad_norm": 0.012474550865590572, | |
| "learning_rate": 1.5806834674759204e-05, | |
| "loss": 0.0037, | |
| "step": 6985 | |
| }, | |
| { | |
| "epoch": 1.8431876606683804, | |
| "grad_norm": 0.014898869208991528, | |
| "learning_rate": 1.567489114658926e-05, | |
| "loss": 0.003, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 1.8445059653285876, | |
| "grad_norm": 0.035570453852415085, | |
| "learning_rate": 1.554294761841932e-05, | |
| "loss": 0.0038, | |
| "step": 6995 | |
| }, | |
| { | |
| "epoch": 1.8458242699887943, | |
| "grad_norm": 0.9279152750968933, | |
| "learning_rate": 1.541100409024937e-05, | |
| "loss": 0.0235, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.8458242699887943, | |
| "eval_loss": 0.022339830175042152, | |
| "eval_runtime": 451.9068, | |
| "eval_samples_per_second": 7.462, | |
| "eval_steps_per_second": 3.731, | |
| "step": 7000 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 7584, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.6496806486741606e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |