| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 20205, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014847809948032665, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 9.950507300173225e-05, | |
| "loss": 0.3437, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.02969561989606533, | |
| "grad_norm": 0.10986328125, | |
| "learning_rate": 9.90101460034645e-05, | |
| "loss": 0.305, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.044543429844097995, | |
| "grad_norm": 0.1630859375, | |
| "learning_rate": 9.851521900519674e-05, | |
| "loss": 0.2885, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.05939123979213066, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 9.802029200692899e-05, | |
| "loss": 0.2781, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.07423904974016332, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 9.752536500866123e-05, | |
| "loss": 0.2746, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.08908685968819599, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 9.703043801039347e-05, | |
| "loss": 0.2658, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.10393466963622866, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 9.653551101212572e-05, | |
| "loss": 0.2709, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.11878247958426132, | |
| "grad_norm": 0.17578125, | |
| "learning_rate": 9.604058401385795e-05, | |
| "loss": 0.2634, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.133630289532294, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 9.55456570155902e-05, | |
| "loss": 0.2688, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.14847809948032664, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 9.505073001732245e-05, | |
| "loss": 0.261, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1633259094283593, | |
| "grad_norm": 0.12158203125, | |
| "learning_rate": 9.455580301905469e-05, | |
| "loss": 0.2537, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.17817371937639198, | |
| "grad_norm": 0.1845703125, | |
| "learning_rate": 9.406087602078693e-05, | |
| "loss": 0.2592, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.19302152932442465, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 9.356594902251918e-05, | |
| "loss": 0.2529, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.20786933927245732, | |
| "grad_norm": 0.1064453125, | |
| "learning_rate": 9.307102202425143e-05, | |
| "loss": 0.2489, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.22271714922049, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 9.257609502598367e-05, | |
| "loss": 0.2539, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.23756495916852263, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 9.208116802771592e-05, | |
| "loss": 0.2449, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.2524127691165553, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 9.158624102944816e-05, | |
| "loss": 0.2457, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.267260579064588, | |
| "grad_norm": 0.1904296875, | |
| "learning_rate": 9.109131403118041e-05, | |
| "loss": 0.2424, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.28210838901262064, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 9.059638703291266e-05, | |
| "loss": 0.2485, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.2969561989606533, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 9.01014600346449e-05, | |
| "loss": 0.2534, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.311804008908686, | |
| "grad_norm": 0.17578125, | |
| "learning_rate": 8.960653303637714e-05, | |
| "loss": 0.238, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.3266518188567186, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 8.911160603810938e-05, | |
| "loss": 0.2409, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.3414996288047513, | |
| "grad_norm": 0.177734375, | |
| "learning_rate": 8.861667903984163e-05, | |
| "loss": 0.2484, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.35634743875278396, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 8.812175204157386e-05, | |
| "loss": 0.2401, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.3711952487008166, | |
| "grad_norm": 0.15625, | |
| "learning_rate": 8.762682504330611e-05, | |
| "loss": 0.2372, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.3860430586488493, | |
| "grad_norm": 0.193359375, | |
| "learning_rate": 8.713189804503836e-05, | |
| "loss": 0.2346, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.40089086859688194, | |
| "grad_norm": 0.19140625, | |
| "learning_rate": 8.663697104677061e-05, | |
| "loss": 0.2307, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.41573867854491464, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 8.614204404850284e-05, | |
| "loss": 0.2276, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.4305864884929473, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 8.56471170502351e-05, | |
| "loss": 0.2386, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.44543429844098, | |
| "grad_norm": 0.189453125, | |
| "learning_rate": 8.515219005196734e-05, | |
| "loss": 0.2313, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.4602821083890126, | |
| "grad_norm": 0.208984375, | |
| "learning_rate": 8.465726305369959e-05, | |
| "loss": 0.245, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.47512991833704527, | |
| "grad_norm": 0.1943359375, | |
| "learning_rate": 8.416233605543183e-05, | |
| "loss": 0.2278, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.48997772828507796, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 8.366740905716407e-05, | |
| "loss": 0.2346, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.5048255382331106, | |
| "grad_norm": 0.173828125, | |
| "learning_rate": 8.317248205889632e-05, | |
| "loss": 0.2364, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.5196733481811433, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 8.267755506062856e-05, | |
| "loss": 0.2238, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.534521158129176, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 8.21826280623608e-05, | |
| "loss": 0.2396, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.5493689680772086, | |
| "grad_norm": 0.2255859375, | |
| "learning_rate": 8.168770106409304e-05, | |
| "loss": 0.2254, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.5642167780252413, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 8.119277406582529e-05, | |
| "loss": 0.2259, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.579064587973274, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 8.069784706755754e-05, | |
| "loss": 0.2273, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5939123979213066, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 8.020292006928979e-05, | |
| "loss": 0.2274, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6087602078693393, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 7.970799307102202e-05, | |
| "loss": 0.2322, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.623608017817372, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 7.921306607275427e-05, | |
| "loss": 0.2274, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.6384558277654045, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 7.871813907448652e-05, | |
| "loss": 0.2274, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.6533036377134372, | |
| "grad_norm": 0.2060546875, | |
| "learning_rate": 7.822321207621877e-05, | |
| "loss": 0.2362, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.6681514476614699, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 7.7728285077951e-05, | |
| "loss": 0.2367, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6829992576095026, | |
| "grad_norm": 0.1650390625, | |
| "learning_rate": 7.723335807968325e-05, | |
| "loss": 0.2287, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6978470675575352, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 7.67384310814155e-05, | |
| "loss": 0.233, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.7126948775055679, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 7.624350408314775e-05, | |
| "loss": 0.2243, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.7275426874536006, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 7.574857708487998e-05, | |
| "loss": 0.2311, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.7423904974016332, | |
| "grad_norm": 0.251953125, | |
| "learning_rate": 7.525365008661222e-05, | |
| "loss": 0.229, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.7572383073496659, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 7.475872308834447e-05, | |
| "loss": 0.2275, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.7720861172976986, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 7.426379609007672e-05, | |
| "loss": 0.2235, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.7869339272457313, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 7.376886909180895e-05, | |
| "loss": 0.2234, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.8017817371937639, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 7.32739420935412e-05, | |
| "loss": 0.2236, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.8166295471417966, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 7.277901509527345e-05, | |
| "loss": 0.2265, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.8314773570898293, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 7.22840880970057e-05, | |
| "loss": 0.2121, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.8463251670378619, | |
| "grad_norm": 0.1953125, | |
| "learning_rate": 7.178916109873795e-05, | |
| "loss": 0.2225, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.8611729769858946, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 7.129423410047018e-05, | |
| "loss": 0.2196, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.8760207869339273, | |
| "grad_norm": 0.171875, | |
| "learning_rate": 7.079930710220243e-05, | |
| "loss": 0.2235, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.89086859688196, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 7.030438010393468e-05, | |
| "loss": 0.2256, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9057164068299925, | |
| "grad_norm": 0.279296875, | |
| "learning_rate": 6.980945310566693e-05, | |
| "loss": 0.2217, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.9205642167780252, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 6.931452610739916e-05, | |
| "loss": 0.223, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.9354120267260579, | |
| "grad_norm": 0.287109375, | |
| "learning_rate": 6.881959910913141e-05, | |
| "loss": 0.2165, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.9502598366740905, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 6.832467211086365e-05, | |
| "loss": 0.2222, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.9651076466221232, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 6.78297451125959e-05, | |
| "loss": 0.2243, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.9799554565701559, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 6.733481811432813e-05, | |
| "loss": 0.2261, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.9948032665181886, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 6.683989111606038e-05, | |
| "loss": 0.2195, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.0096510764662212, | |
| "grad_norm": 0.2138671875, | |
| "learning_rate": 6.634496411779263e-05, | |
| "loss": 0.2005, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.024498886414254, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 6.585003711952488e-05, | |
| "loss": 0.1892, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.0393466963622866, | |
| "grad_norm": 0.21484375, | |
| "learning_rate": 6.535511012125711e-05, | |
| "loss": 0.1842, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.0541945063103193, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 6.486018312298936e-05, | |
| "loss": 0.1897, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.069042316258352, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 6.436525612472161e-05, | |
| "loss": 0.1831, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.0838901262063845, | |
| "grad_norm": 0.2578125, | |
| "learning_rate": 6.387032912645386e-05, | |
| "loss": 0.1891, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.0987379361544172, | |
| "grad_norm": 0.17578125, | |
| "learning_rate": 6.33754021281861e-05, | |
| "loss": 0.1884, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.1135857461024499, | |
| "grad_norm": 0.2451171875, | |
| "learning_rate": 6.288047512991834e-05, | |
| "loss": 0.1904, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.1284335560504826, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 6.238554813165059e-05, | |
| "loss": 0.1899, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.1432813659985153, | |
| "grad_norm": 0.30859375, | |
| "learning_rate": 6.189062113338282e-05, | |
| "loss": 0.1884, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.158129175946548, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 6.139569413511507e-05, | |
| "loss": 0.1905, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.1729769858945804, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 6.0900767136847315e-05, | |
| "loss": 0.183, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.1878247958426131, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 6.0405840138579564e-05, | |
| "loss": 0.1841, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.2026726057906458, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 5.9910913140311805e-05, | |
| "loss": 0.1873, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.2175204157386785, | |
| "grad_norm": 0.267578125, | |
| "learning_rate": 5.9415986142044054e-05, | |
| "loss": 0.1817, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.2323682256867112, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 5.892105914377629e-05, | |
| "loss": 0.1824, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.247216035634744, | |
| "grad_norm": 0.26953125, | |
| "learning_rate": 5.842613214550854e-05, | |
| "loss": 0.19, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.2620638455827766, | |
| "grad_norm": 0.236328125, | |
| "learning_rate": 5.7931205147240786e-05, | |
| "loss": 0.1859, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.2769116555308093, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 5.7436278148973035e-05, | |
| "loss": 0.1877, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.2917594654788418, | |
| "grad_norm": 0.1826171875, | |
| "learning_rate": 5.694135115070527e-05, | |
| "loss": 0.1813, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.3066072754268745, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 5.644642415243752e-05, | |
| "loss": 0.186, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.3214550853749072, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 5.595149715416976e-05, | |
| "loss": 0.1861, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.3363028953229399, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 5.545657015590201e-05, | |
| "loss": 0.1858, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3511507052709726, | |
| "grad_norm": 0.2021484375, | |
| "learning_rate": 5.4961643157634244e-05, | |
| "loss": 0.1827, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.365998515219005, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 5.446671615936649e-05, | |
| "loss": 0.1778, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.3808463251670378, | |
| "grad_norm": 0.26171875, | |
| "learning_rate": 5.397178916109874e-05, | |
| "loss": 0.1891, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.3956941351150705, | |
| "grad_norm": 0.294921875, | |
| "learning_rate": 5.347686216283099e-05, | |
| "loss": 0.1842, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.4105419450631032, | |
| "grad_norm": 0.2392578125, | |
| "learning_rate": 5.298193516456323e-05, | |
| "loss": 0.187, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.4253897550111359, | |
| "grad_norm": 0.2216796875, | |
| "learning_rate": 5.248700816629547e-05, | |
| "loss": 0.1897, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.4402375649591685, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 5.1992081168027716e-05, | |
| "loss": 0.1873, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.4550853749072012, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 5.1497154169759965e-05, | |
| "loss": 0.188, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.469933184855234, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 5.100222717149221e-05, | |
| "loss": 0.1824, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.4847809948032666, | |
| "grad_norm": 0.185546875, | |
| "learning_rate": 5.050730017322445e-05, | |
| "loss": 0.1827, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.4996288047512991, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 5.00123731749567e-05, | |
| "loss": 0.185, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.5144766146993318, | |
| "grad_norm": 0.1884765625, | |
| "learning_rate": 4.951744617668894e-05, | |
| "loss": 0.1813, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.5293244246473645, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 4.902251917842118e-05, | |
| "loss": 0.1796, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.5441722345953972, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 4.852759218015343e-05, | |
| "loss": 0.1887, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.5590200445434297, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 4.803266518188567e-05, | |
| "loss": 0.1845, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.5738678544914624, | |
| "grad_norm": 0.326171875, | |
| "learning_rate": 4.753773818361792e-05, | |
| "loss": 0.1784, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.588715664439495, | |
| "grad_norm": 0.1982421875, | |
| "learning_rate": 4.704281118535016e-05, | |
| "loss": 0.1842, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.6035634743875278, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 4.654788418708241e-05, | |
| "loss": 0.188, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.6184112843355605, | |
| "grad_norm": 0.25, | |
| "learning_rate": 4.605295718881465e-05, | |
| "loss": 0.1844, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.6332590942835932, | |
| "grad_norm": 0.380859375, | |
| "learning_rate": 4.5558030190546894e-05, | |
| "loss": 0.1802, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6481069042316259, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 4.506310319227914e-05, | |
| "loss": 0.1881, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.6629547141796586, | |
| "grad_norm": 0.37109375, | |
| "learning_rate": 4.4568176194011384e-05, | |
| "loss": 0.185, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.6778025241276913, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 4.407324919574363e-05, | |
| "loss": 0.1818, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.692650334075724, | |
| "grad_norm": 0.25, | |
| "learning_rate": 4.3578322197475875e-05, | |
| "loss": 0.182, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.7074981440237567, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 4.3083395199208124e-05, | |
| "loss": 0.184, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.7223459539717891, | |
| "grad_norm": 0.28515625, | |
| "learning_rate": 4.2588468200940365e-05, | |
| "loss": 0.1923, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.7371937639198218, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 4.209354120267261e-05, | |
| "loss": 0.1812, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.7520415738678545, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 4.159861420440485e-05, | |
| "loss": 0.1756, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.766889383815887, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 4.11036872061371e-05, | |
| "loss": 0.1878, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.7817371937639197, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 4.060876020786934e-05, | |
| "loss": 0.18, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7965850037119524, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 4.011383320960159e-05, | |
| "loss": 0.1802, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.811432813659985, | |
| "grad_norm": 0.234375, | |
| "learning_rate": 3.961890621133383e-05, | |
| "loss": 0.1802, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.8262806236080178, | |
| "grad_norm": 0.31640625, | |
| "learning_rate": 3.912397921306608e-05, | |
| "loss": 0.1842, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.8411284335560505, | |
| "grad_norm": 0.2109375, | |
| "learning_rate": 3.8629052214798314e-05, | |
| "loss": 0.1855, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.8559762435040832, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 3.813412521653056e-05, | |
| "loss": 0.1846, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.8708240534521159, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 3.7639198218262804e-05, | |
| "loss": 0.1829, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.8856718634001486, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 3.714427121999505e-05, | |
| "loss": 0.1845, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.9005196733481813, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 3.66493442217273e-05, | |
| "loss": 0.1849, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.9153674832962138, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 3.6154417223459543e-05, | |
| "loss": 0.1849, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.9302152932442465, | |
| "grad_norm": 0.33203125, | |
| "learning_rate": 3.5659490225191785e-05, | |
| "loss": 0.1782, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.9450631031922792, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 3.516456322692403e-05, | |
| "loss": 0.1871, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.9599109131403119, | |
| "grad_norm": 0.23828125, | |
| "learning_rate": 3.4669636228656276e-05, | |
| "loss": 0.1775, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.9747587230883443, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 3.417470923038852e-05, | |
| "loss": 0.1815, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.989606533036377, | |
| "grad_norm": 0.30078125, | |
| "learning_rate": 3.3679782232120766e-05, | |
| "loss": 0.1831, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 2.0044543429844097, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 3.318485523385301e-05, | |
| "loss": 0.1752, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 2.0193021529324424, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 3.268992823558526e-05, | |
| "loss": 0.172, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 2.034149962880475, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 3.21950012373175e-05, | |
| "loss": 0.1695, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 2.048997772828508, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 3.170007423904974e-05, | |
| "loss": 0.169, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 2.0638455827765405, | |
| "grad_norm": 0.2333984375, | |
| "learning_rate": 3.120514724078198e-05, | |
| "loss": 0.1688, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 2.078693392724573, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 3.071022024251423e-05, | |
| "loss": 0.1696, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.093541202672606, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 3.0215293244246473e-05, | |
| "loss": 0.1683, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 2.1083890126206386, | |
| "grad_norm": 0.365234375, | |
| "learning_rate": 2.972036624597872e-05, | |
| "loss": 0.1647, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 2.1232368225686713, | |
| "grad_norm": 0.310546875, | |
| "learning_rate": 2.9225439247710963e-05, | |
| "loss": 0.1698, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 2.138084632516704, | |
| "grad_norm": 0.20703125, | |
| "learning_rate": 2.873051224944321e-05, | |
| "loss": 0.1648, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 2.1529324424647363, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 2.823558525117545e-05, | |
| "loss": 0.1676, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 2.167780252412769, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 2.77406582529077e-05, | |
| "loss": 0.1676, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 2.1826280623608016, | |
| "grad_norm": 0.240234375, | |
| "learning_rate": 2.7245731254639944e-05, | |
| "loss": 0.168, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 2.1974758723088343, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.6750804256372186e-05, | |
| "loss": 0.1662, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 2.212323682256867, | |
| "grad_norm": 0.2119140625, | |
| "learning_rate": 2.6255877258104435e-05, | |
| "loss": 0.163, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 2.2271714922048997, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 2.5760950259836673e-05, | |
| "loss": 0.1658, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.2420193021529324, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 2.5266023261568922e-05, | |
| "loss": 0.1645, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 2.256867112100965, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 2.4771096263301164e-05, | |
| "loss": 0.1563, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.271714922048998, | |
| "grad_norm": 0.34375, | |
| "learning_rate": 2.427616926503341e-05, | |
| "loss": 0.1677, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.2865627319970305, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.378124226676565e-05, | |
| "loss": 0.1635, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.3014105419450632, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 2.3286315268497896e-05, | |
| "loss": 0.1693, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.316258351893096, | |
| "grad_norm": 0.2470703125, | |
| "learning_rate": 2.2791388270230145e-05, | |
| "loss": 0.1682, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.3311061618411286, | |
| "grad_norm": 0.271484375, | |
| "learning_rate": 2.2296461271962387e-05, | |
| "loss": 0.1637, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.345953971789161, | |
| "grad_norm": 0.337890625, | |
| "learning_rate": 2.1801534273694632e-05, | |
| "loss": 0.1702, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.3608017817371936, | |
| "grad_norm": 0.3125, | |
| "learning_rate": 2.1306607275426877e-05, | |
| "loss": 0.1648, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.3756495916852263, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 2.0811680277159122e-05, | |
| "loss": 0.1586, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.390497401633259, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 2.0316753278891364e-05, | |
| "loss": 0.1628, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.4053452115812917, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.982182628062361e-05, | |
| "loss": 0.1621, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.4201930215293244, | |
| "grad_norm": 0.296875, | |
| "learning_rate": 1.9326899282355855e-05, | |
| "loss": 0.1628, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.435040831477357, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 1.8831972284088097e-05, | |
| "loss": 0.1642, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.4498886414253898, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.8337045285820342e-05, | |
| "loss": 0.1659, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.4647364513734225, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.7842118287552587e-05, | |
| "loss": 0.1683, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.479584261321455, | |
| "grad_norm": 0.322265625, | |
| "learning_rate": 1.7347191289284832e-05, | |
| "loss": 0.1655, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.494432071269488, | |
| "grad_norm": 0.2734375, | |
| "learning_rate": 1.6852264291017074e-05, | |
| "loss": 0.1655, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.5092798812175205, | |
| "grad_norm": 0.3046875, | |
| "learning_rate": 1.635733729274932e-05, | |
| "loss": 0.164, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.5241276911655532, | |
| "grad_norm": 0.28125, | |
| "learning_rate": 1.5862410294481565e-05, | |
| "loss": 0.1745, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.538975501113586, | |
| "grad_norm": 0.2490234375, | |
| "learning_rate": 1.5367483296213807e-05, | |
| "loss": 0.169, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.5538233110616186, | |
| "grad_norm": 0.314453125, | |
| "learning_rate": 1.4872556297946052e-05, | |
| "loss": 0.1695, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.5686711210096513, | |
| "grad_norm": 0.306640625, | |
| "learning_rate": 1.4377629299678297e-05, | |
| "loss": 0.1657, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.5835189309576836, | |
| "grad_norm": 0.302734375, | |
| "learning_rate": 1.3882702301410544e-05, | |
| "loss": 0.1615, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.5983667409057163, | |
| "grad_norm": 0.263671875, | |
| "learning_rate": 1.3387775303142788e-05, | |
| "loss": 0.1675, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.613214550853749, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.2892848304875033e-05, | |
| "loss": 0.1646, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.6280623608017817, | |
| "grad_norm": 0.197265625, | |
| "learning_rate": 1.2397921306607275e-05, | |
| "loss": 0.1631, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.6429101707498144, | |
| "grad_norm": 0.2890625, | |
| "learning_rate": 1.190299430833952e-05, | |
| "loss": 0.1621, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.657757980697847, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.1408067310071765e-05, | |
| "loss": 0.1682, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.6726057906458798, | |
| "grad_norm": 0.29296875, | |
| "learning_rate": 1.091314031180401e-05, | |
| "loss": 0.1664, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.6874536005939125, | |
| "grad_norm": 0.2431640625, | |
| "learning_rate": 1.0418213313536254e-05, | |
| "loss": 0.1686, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.702301410541945, | |
| "grad_norm": 0.427734375, | |
| "learning_rate": 9.9232863152685e-06, | |
| "loss": 0.1653, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.717149220489978, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 9.428359317000743e-06, | |
| "loss": 0.1704, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.73199703043801, | |
| "grad_norm": 0.2314453125, | |
| "learning_rate": 8.933432318732986e-06, | |
| "loss": 0.1645, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.746844840386043, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 8.438505320465232e-06, | |
| "loss": 0.1647, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.7616926503340755, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 7.943578322197475e-06, | |
| "loss": 0.1721, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.776540460282108, | |
| "grad_norm": 0.330078125, | |
| "learning_rate": 7.44865132392972e-06, | |
| "loss": 0.1652, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.791388270230141, | |
| "grad_norm": 0.333984375, | |
| "learning_rate": 6.953724325661966e-06, | |
| "loss": 0.1634, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.8062360801781736, | |
| "grad_norm": 0.283203125, | |
| "learning_rate": 6.45879732739421e-06, | |
| "loss": 0.1668, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.8210838901262063, | |
| "grad_norm": 0.248046875, | |
| "learning_rate": 5.9638703291264544e-06, | |
| "loss": 0.1642, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.835931700074239, | |
| "grad_norm": 0.2373046875, | |
| "learning_rate": 5.468943330858699e-06, | |
| "loss": 0.1594, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.8507795100222717, | |
| "grad_norm": 0.224609375, | |
| "learning_rate": 4.974016332590943e-06, | |
| "loss": 0.1648, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.8656273199703044, | |
| "grad_norm": 0.2353515625, | |
| "learning_rate": 4.479089334323188e-06, | |
| "loss": 0.1637, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.880475129918337, | |
| "grad_norm": 0.2294921875, | |
| "learning_rate": 3.984162336055432e-06, | |
| "loss": 0.1728, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.89532293986637, | |
| "grad_norm": 0.25390625, | |
| "learning_rate": 3.489235337787677e-06, | |
| "loss": 0.1646, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.9101707498144025, | |
| "grad_norm": 0.25, | |
| "learning_rate": 2.9943083395199213e-06, | |
| "loss": 0.1692, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.925018559762435, | |
| "grad_norm": 0.328125, | |
| "learning_rate": 2.4993813412521652e-06, | |
| "loss": 0.1708, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.939866369710468, | |
| "grad_norm": 0.265625, | |
| "learning_rate": 2.00445434298441e-06, | |
| "loss": 0.1689, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.9547141796585006, | |
| "grad_norm": 0.25, | |
| "learning_rate": 1.5095273447166542e-06, | |
| "loss": 0.1622, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.9695619896065333, | |
| "grad_norm": 0.298828125, | |
| "learning_rate": 1.0146003464488989e-06, | |
| "loss": 0.1649, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.984409799554566, | |
| "grad_norm": 0.291015625, | |
| "learning_rate": 5.196733481811434e-07, | |
| "loss": 0.163, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.9992576095025982, | |
| "grad_norm": 0.359375, | |
| "learning_rate": 2.4746349913387775e-08, | |
| "loss": 0.1617, | |
| "step": 20200 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 20205, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1772177365591194e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |