{ "best_global_step": 121200, "best_metric": 0.3653090298175812, "best_model_checkpoint": "/data/alamparan/mattext_ckpt_2/results_2m_no_earlystop/2026-02-15/04-45-09/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-69000", "epoch": 50.0, "eval_steps": 50, "global_step": 129750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019267822736030827, "grad_norm": 8.877785682678223, "learning_rate": 0.00019992447013487478, "loss": 35.2349951171875, "step": 50 }, { "epoch": 0.019267822736030827, "eval_loss": 23.86328887939453, "eval_runtime": 11.7233, "eval_samples_per_second": 1621.043, "eval_steps_per_second": 33.779, "step": 50 }, { "epoch": 0.038535645472061654, "grad_norm": 2.8108115196228027, "learning_rate": 0.00019984739884393063, "loss": 22.29526123046875, "step": 100 }, { "epoch": 0.038535645472061654, "eval_loss": 20.857704162597656, "eval_runtime": 11.6942, "eval_samples_per_second": 1625.073, "eval_steps_per_second": 33.863, "step": 100 }, { "epoch": 0.057803468208092484, "grad_norm": 6.313286304473877, "learning_rate": 0.00019977032755298652, "loss": 20.316953125, "step": 150 }, { "epoch": 0.057803468208092484, "eval_loss": 19.354394912719727, "eval_runtime": 11.6943, "eval_samples_per_second": 1625.063, "eval_steps_per_second": 33.863, "step": 150 }, { "epoch": 0.07707129094412331, "grad_norm": 4.738299369812012, "learning_rate": 0.0001996932562620424, "loss": 19.059852294921875, "step": 200 }, { "epoch": 0.07707129094412331, "eval_loss": 18.21726417541504, "eval_runtime": 11.7041, "eval_samples_per_second": 1623.701, "eval_steps_per_second": 33.834, "step": 200 }, { "epoch": 0.09633911368015415, "grad_norm": 8.969942092895508, "learning_rate": 0.0001996161849710983, "loss": 18.012269287109376, "step": 250 }, { "epoch": 0.09633911368015415, "eval_loss": 16.938831329345703, "eval_runtime": 11.6965, "eval_samples_per_second": 1624.76, "eval_steps_per_second": 33.856, "step": 250 }, { "epoch": 0.11560693641618497, "grad_norm": 7.669299602508545, "learning_rate": 0.00019953911368015415, "loss": 16.699818115234375, "step": 300 }, { "epoch": 0.11560693641618497, "eval_loss": 15.053141593933105, "eval_runtime": 11.7059, "eval_samples_per_second": 1623.453, "eval_steps_per_second": 33.829, "step": 300 }, { "epoch": 0.1348747591522158, "grad_norm": 8.959158897399902, "learning_rate": 0.00019946204238921003, "loss": 13.77020263671875, "step": 350 }, { "epoch": 0.1348747591522158, "eval_loss": 9.377955436706543, "eval_runtime": 11.7151, "eval_samples_per_second": 1622.178, "eval_steps_per_second": 33.802, "step": 350 }, { "epoch": 0.15414258188824662, "grad_norm": 3.3629279136657715, "learning_rate": 0.00019938497109826592, "loss": 7.74793701171875, "step": 400 }, { "epoch": 0.15414258188824662, "eval_loss": 4.854114532470703, "eval_runtime": 11.7147, "eval_samples_per_second": 1622.231, "eval_steps_per_second": 33.804, "step": 400 }, { "epoch": 0.17341040462427745, "grad_norm": 3.55245041847229, "learning_rate": 0.00019930789980732177, "loss": 4.946226196289063, "step": 450 }, { "epoch": 0.17341040462427745, "eval_loss": 3.9195499420166016, "eval_runtime": 11.7159, "eval_samples_per_second": 1622.075, "eval_steps_per_second": 33.8, "step": 450 }, { "epoch": 0.1926782273603083, "grad_norm": 2.8697292804718018, "learning_rate": 0.00019923082851637766, "loss": 4.238619384765625, "step": 500 }, { "epoch": 0.1926782273603083, "eval_loss": 3.3998587131500244, "eval_runtime": 11.6977, "eval_samples_per_second": 1624.594, "eval_steps_per_second": 33.853, "step": 500 }, { "epoch": 0.2119460500963391, "grad_norm": 2.6605217456817627, "learning_rate": 0.00019915375722543354, "loss": 3.7408367919921877, "step": 550 }, { "epoch": 0.2119460500963391, "eval_loss": 2.9202401638031006, "eval_runtime": 11.7292, "eval_samples_per_second": 1620.224, "eval_steps_per_second": 33.762, "step": 550 }, { "epoch": 0.23121387283236994, "grad_norm": 3.1394155025482178, "learning_rate": 0.0001990766859344894, "loss": 3.1157400512695315, "step": 600 }, { "epoch": 0.23121387283236994, "eval_loss": 2.423171043395996, "eval_runtime": 11.7332, "eval_samples_per_second": 1619.676, "eval_steps_per_second": 33.75, "step": 600 }, { "epoch": 0.2504816955684008, "grad_norm": 2.3351471424102783, "learning_rate": 0.00019899961464354529, "loss": 2.5061297607421875, "step": 650 }, { "epoch": 0.2504816955684008, "eval_loss": 1.9836865663528442, "eval_runtime": 11.7416, "eval_samples_per_second": 1618.524, "eval_steps_per_second": 33.726, "step": 650 }, { "epoch": 0.2697495183044316, "grad_norm": 1.8004690408706665, "learning_rate": 0.00019892254335260117, "loss": 2.0416378784179687, "step": 700 }, { "epoch": 0.2697495183044316, "eval_loss": 1.7745883464813232, "eval_runtime": 11.7338, "eval_samples_per_second": 1619.599, "eval_steps_per_second": 33.749, "step": 700 }, { "epoch": 0.28901734104046245, "grad_norm": 1.9287327527999878, "learning_rate": 0.00019884547206165706, "loss": 1.8328019714355468, "step": 750 }, { "epoch": 0.28901734104046245, "eval_loss": 1.65010666847229, "eval_runtime": 11.7115, "eval_samples_per_second": 1622.678, "eval_steps_per_second": 33.813, "step": 750 }, { "epoch": 0.30828516377649323, "grad_norm": 1.6123042106628418, "learning_rate": 0.0001987684007707129, "loss": 1.7091549682617186, "step": 800 }, { "epoch": 0.30828516377649323, "eval_loss": 1.5188778638839722, "eval_runtime": 11.7434, "eval_samples_per_second": 1618.267, "eval_steps_per_second": 33.721, "step": 800 }, { "epoch": 0.32755298651252407, "grad_norm": 1.5286041498184204, "learning_rate": 0.0001986913294797688, "loss": 1.5871896362304687, "step": 850 }, { "epoch": 0.32755298651252407, "eval_loss": 1.4317147731781006, "eval_runtime": 11.7301, "eval_samples_per_second": 1620.112, "eval_steps_per_second": 33.759, "step": 850 }, { "epoch": 0.3468208092485549, "grad_norm": 1.6148699522018433, "learning_rate": 0.00019861425818882466, "loss": 1.4919316101074218, "step": 900 }, { "epoch": 0.3468208092485549, "eval_loss": 1.3607696294784546, "eval_runtime": 11.7272, "eval_samples_per_second": 1620.505, "eval_steps_per_second": 33.768, "step": 900 }, { "epoch": 0.36608863198458574, "grad_norm": 1.5548264980316162, "learning_rate": 0.00019853718689788054, "loss": 1.4115541076660156, "step": 950 }, { "epoch": 0.36608863198458574, "eval_loss": 1.285597324371338, "eval_runtime": 11.7227, "eval_samples_per_second": 1621.129, "eval_steps_per_second": 33.781, "step": 950 }, { "epoch": 0.3853564547206166, "grad_norm": 1.4407851696014404, "learning_rate": 0.00019846011560693643, "loss": 1.3329698181152343, "step": 1000 }, { "epoch": 0.3853564547206166, "eval_loss": 1.2415759563446045, "eval_runtime": 11.7347, "eval_samples_per_second": 1619.464, "eval_steps_per_second": 33.746, "step": 1000 }, { "epoch": 0.4046242774566474, "grad_norm": 1.551575779914856, "learning_rate": 0.0001983830443159923, "loss": 1.2870872497558594, "step": 1050 }, { "epoch": 0.4046242774566474, "eval_loss": 1.2125412225723267, "eval_runtime": 11.7383, "eval_samples_per_second": 1618.971, "eval_steps_per_second": 33.736, "step": 1050 }, { "epoch": 0.4238921001926782, "grad_norm": 1.3146734237670898, "learning_rate": 0.0001983059730250482, "loss": 1.2200655364990234, "step": 1100 }, { "epoch": 0.4238921001926782, "eval_loss": 1.1706342697143555, "eval_runtime": 11.7381, "eval_samples_per_second": 1619.002, "eval_steps_per_second": 33.736, "step": 1100 }, { "epoch": 0.44315992292870904, "grad_norm": 1.2496591806411743, "learning_rate": 0.00019822890173410408, "loss": 1.1880706787109374, "step": 1150 }, { "epoch": 0.44315992292870904, "eval_loss": 1.160879373550415, "eval_runtime": 11.7298, "eval_samples_per_second": 1620.149, "eval_steps_per_second": 33.76, "step": 1150 }, { "epoch": 0.4624277456647399, "grad_norm": 1.2143440246582031, "learning_rate": 0.00019815183044315994, "loss": 1.1632847595214844, "step": 1200 }, { "epoch": 0.4624277456647399, "eval_loss": 1.1183408498764038, "eval_runtime": 11.7254, "eval_samples_per_second": 1620.755, "eval_steps_per_second": 33.773, "step": 1200 }, { "epoch": 0.4816955684007707, "grad_norm": 1.2999197244644165, "learning_rate": 0.0001980747591522158, "loss": 1.111785888671875, "step": 1250 }, { "epoch": 0.4816955684007707, "eval_loss": 1.090254783630371, "eval_runtime": 11.7531, "eval_samples_per_second": 1616.939, "eval_steps_per_second": 33.693, "step": 1250 }, { "epoch": 0.5009633911368016, "grad_norm": 1.129799723625183, "learning_rate": 0.00019799768786127168, "loss": 1.0785940551757813, "step": 1300 }, { "epoch": 0.5009633911368016, "eval_loss": 1.0533366203308105, "eval_runtime": 11.7245, "eval_samples_per_second": 1620.875, "eval_steps_per_second": 33.775, "step": 1300 }, { "epoch": 0.5202312138728323, "grad_norm": 1.0714735984802246, "learning_rate": 0.00019792061657032757, "loss": 1.0660378265380859, "step": 1350 }, { "epoch": 0.5202312138728323, "eval_loss": 1.0415290594100952, "eval_runtime": 11.7336, "eval_samples_per_second": 1619.629, "eval_steps_per_second": 33.749, "step": 1350 }, { "epoch": 0.5394990366088632, "grad_norm": 1.03439462184906, "learning_rate": 0.00019784354527938345, "loss": 1.0371372985839844, "step": 1400 }, { "epoch": 0.5394990366088632, "eval_loss": 1.0224626064300537, "eval_runtime": 11.7425, "eval_samples_per_second": 1618.397, "eval_steps_per_second": 33.724, "step": 1400 }, { "epoch": 0.558766859344894, "grad_norm": 1.0442633628845215, "learning_rate": 0.0001977664739884393, "loss": 1.0124398040771485, "step": 1450 }, { "epoch": 0.558766859344894, "eval_loss": 1.008814811706543, "eval_runtime": 11.7341, "eval_samples_per_second": 1619.552, "eval_steps_per_second": 33.748, "step": 1450 }, { "epoch": 0.5780346820809249, "grad_norm": 1.0305899381637573, "learning_rate": 0.0001976894026974952, "loss": 1.0005599975585937, "step": 1500 }, { "epoch": 0.5780346820809249, "eval_loss": 0.9917020797729492, "eval_runtime": 11.7556, "eval_samples_per_second": 1616.587, "eval_steps_per_second": 33.686, "step": 1500 }, { "epoch": 0.5973025048169557, "grad_norm": 0.9734905362129211, "learning_rate": 0.00019761233140655108, "loss": 0.9688368225097657, "step": 1550 }, { "epoch": 0.5973025048169557, "eval_loss": 0.9741984009742737, "eval_runtime": 11.7336, "eval_samples_per_second": 1619.625, "eval_steps_per_second": 33.749, "step": 1550 }, { "epoch": 0.6165703275529865, "grad_norm": 0.9896020889282227, "learning_rate": 0.00019753526011560694, "loss": 0.955411376953125, "step": 1600 }, { "epoch": 0.6165703275529865, "eval_loss": 0.9494890570640564, "eval_runtime": 11.7555, "eval_samples_per_second": 1616.6, "eval_steps_per_second": 33.686, "step": 1600 }, { "epoch": 0.6358381502890174, "grad_norm": 1.0854922533035278, "learning_rate": 0.00019745818882466282, "loss": 0.9497003173828125, "step": 1650 }, { "epoch": 0.6358381502890174, "eval_loss": 0.9642688035964966, "eval_runtime": 11.7394, "eval_samples_per_second": 1618.826, "eval_steps_per_second": 33.733, "step": 1650 }, { "epoch": 0.6551059730250481, "grad_norm": 1.1026602983474731, "learning_rate": 0.0001973811175337187, "loss": 0.927451400756836, "step": 1700 }, { "epoch": 0.6551059730250481, "eval_loss": 0.9418200254440308, "eval_runtime": 11.7376, "eval_samples_per_second": 1619.069, "eval_steps_per_second": 33.738, "step": 1700 }, { "epoch": 0.674373795761079, "grad_norm": 1.1530070304870605, "learning_rate": 0.00019730404624277456, "loss": 0.910860595703125, "step": 1750 }, { "epoch": 0.674373795761079, "eval_loss": 0.9280962347984314, "eval_runtime": 11.7349, "eval_samples_per_second": 1619.439, "eval_steps_per_second": 33.745, "step": 1750 }, { "epoch": 0.6936416184971098, "grad_norm": 1.0063809156417847, "learning_rate": 0.00019722697495183045, "loss": 0.9104718017578125, "step": 1800 }, { "epoch": 0.6936416184971098, "eval_loss": 0.9196037650108337, "eval_runtime": 11.7957, "eval_samples_per_second": 1611.102, "eval_steps_per_second": 33.572, "step": 1800 }, { "epoch": 0.7129094412331407, "grad_norm": 0.9946666359901428, "learning_rate": 0.00019714990366088633, "loss": 0.8886672210693359, "step": 1850 }, { "epoch": 0.7129094412331407, "eval_loss": 0.9222117066383362, "eval_runtime": 11.7442, "eval_samples_per_second": 1618.163, "eval_steps_per_second": 33.719, "step": 1850 }, { "epoch": 0.7321772639691715, "grad_norm": 0.9966956377029419, "learning_rate": 0.00019707283236994222, "loss": 0.8832919311523437, "step": 1900 }, { "epoch": 0.7321772639691715, "eval_loss": 0.8887454867362976, "eval_runtime": 11.7259, "eval_samples_per_second": 1620.686, "eval_steps_per_second": 33.771, "step": 1900 }, { "epoch": 0.7514450867052023, "grad_norm": 0.9478052258491516, "learning_rate": 0.00019699576107899808, "loss": 0.8676170349121094, "step": 1950 }, { "epoch": 0.7514450867052023, "eval_loss": 0.8681301474571228, "eval_runtime": 11.7533, "eval_samples_per_second": 1616.911, "eval_steps_per_second": 33.693, "step": 1950 }, { "epoch": 0.7707129094412332, "grad_norm": 1.0266932249069214, "learning_rate": 0.00019691868978805396, "loss": 0.8615008544921875, "step": 2000 }, { "epoch": 0.7707129094412332, "eval_loss": 0.880142092704773, "eval_runtime": 11.736, "eval_samples_per_second": 1619.295, "eval_steps_per_second": 33.742, "step": 2000 }, { "epoch": 0.789980732177264, "grad_norm": 0.9472013711929321, "learning_rate": 0.00019684161849710982, "loss": 0.8559214782714843, "step": 2050 }, { "epoch": 0.789980732177264, "eval_loss": 0.8617808222770691, "eval_runtime": 11.7452, "eval_samples_per_second": 1618.019, "eval_steps_per_second": 33.716, "step": 2050 }, { "epoch": 0.8092485549132948, "grad_norm": 0.91195148229599, "learning_rate": 0.0001967645472061657, "loss": 0.85264892578125, "step": 2100 }, { "epoch": 0.8092485549132948, "eval_loss": 0.8737282752990723, "eval_runtime": 11.7339, "eval_samples_per_second": 1619.578, "eval_steps_per_second": 33.748, "step": 2100 }, { "epoch": 0.8285163776493256, "grad_norm": 0.8524146676063538, "learning_rate": 0.0001966874759152216, "loss": 0.8335631561279296, "step": 2150 }, { "epoch": 0.8285163776493256, "eval_loss": 0.8772912621498108, "eval_runtime": 11.7646, "eval_samples_per_second": 1615.348, "eval_steps_per_second": 33.66, "step": 2150 }, { "epoch": 0.8477842003853564, "grad_norm": 0.8374658226966858, "learning_rate": 0.00019661040462427747, "loss": 0.8254063415527344, "step": 2200 }, { "epoch": 0.8477842003853564, "eval_loss": 0.8591238260269165, "eval_runtime": 11.7397, "eval_samples_per_second": 1618.781, "eval_steps_per_second": 33.732, "step": 2200 }, { "epoch": 0.8670520231213873, "grad_norm": 0.9672785401344299, "learning_rate": 0.00019653333333333336, "loss": 0.8236775207519531, "step": 2250 }, { "epoch": 0.8670520231213873, "eval_loss": 0.8585294485092163, "eval_runtime": 11.749, "eval_samples_per_second": 1617.495, "eval_steps_per_second": 33.705, "step": 2250 }, { "epoch": 0.8863198458574181, "grad_norm": 0.9784515500068665, "learning_rate": 0.00019645626204238922, "loss": 0.8199764251708984, "step": 2300 }, { "epoch": 0.8863198458574181, "eval_loss": 0.8411864042282104, "eval_runtime": 11.7403, "eval_samples_per_second": 1618.697, "eval_steps_per_second": 33.73, "step": 2300 }, { "epoch": 0.905587668593449, "grad_norm": 0.7446373701095581, "learning_rate": 0.00019637919075144507, "loss": 0.8040913391113281, "step": 2350 }, { "epoch": 0.905587668593449, "eval_loss": 0.8372390270233154, "eval_runtime": 11.7434, "eval_samples_per_second": 1618.267, "eval_steps_per_second": 33.721, "step": 2350 }, { "epoch": 0.9248554913294798, "grad_norm": 0.8660910129547119, "learning_rate": 0.00019630211946050096, "loss": 0.7966217041015625, "step": 2400 }, { "epoch": 0.9248554913294798, "eval_loss": 0.841880738735199, "eval_runtime": 11.7309, "eval_samples_per_second": 1620.001, "eval_steps_per_second": 33.757, "step": 2400 }, { "epoch": 0.9441233140655106, "grad_norm": 0.8319749236106873, "learning_rate": 0.00019622504816955684, "loss": 0.7877145385742188, "step": 2450 }, { "epoch": 0.9441233140655106, "eval_loss": 0.8249449133872986, "eval_runtime": 11.7408, "eval_samples_per_second": 1618.631, "eval_steps_per_second": 33.729, "step": 2450 }, { "epoch": 0.9633911368015414, "grad_norm": 0.8070456981658936, "learning_rate": 0.00019614797687861273, "loss": 0.7957000732421875, "step": 2500 }, { "epoch": 0.9633911368015414, "eval_loss": 0.8248544335365295, "eval_runtime": 11.7674, "eval_samples_per_second": 1614.976, "eval_steps_per_second": 33.652, "step": 2500 }, { "epoch": 0.9826589595375722, "grad_norm": 0.8011983633041382, "learning_rate": 0.0001960709055876686, "loss": 0.777335433959961, "step": 2550 }, { "epoch": 0.9826589595375722, "eval_loss": 0.8185372352600098, "eval_runtime": 11.7383, "eval_samples_per_second": 1618.972, "eval_steps_per_second": 33.736, "step": 2550 }, { "epoch": 1.001926782273603, "grad_norm": 0.8539603352546692, "learning_rate": 0.0001959938342967245, "loss": 0.7638008117675781, "step": 2600 }, { "epoch": 1.001926782273603, "eval_loss": 0.8289339542388916, "eval_runtime": 11.7398, "eval_samples_per_second": 1618.766, "eval_steps_per_second": 33.731, "step": 2600 }, { "epoch": 1.0211946050096339, "grad_norm": 0.844338059425354, "learning_rate": 0.00019591676300578036, "loss": 0.760958251953125, "step": 2650 }, { "epoch": 1.0211946050096339, "eval_loss": 0.8120879530906677, "eval_runtime": 11.7489, "eval_samples_per_second": 1617.514, "eval_steps_per_second": 33.705, "step": 2650 }, { "epoch": 1.0404624277456647, "grad_norm": 0.7795179486274719, "learning_rate": 0.00019583969171483621, "loss": 0.7511671447753906, "step": 2700 }, { "epoch": 1.0404624277456647, "eval_loss": 0.8116646409034729, "eval_runtime": 11.7491, "eval_samples_per_second": 1617.479, "eval_steps_per_second": 33.705, "step": 2700 }, { "epoch": 1.0597302504816957, "grad_norm": 0.756145179271698, "learning_rate": 0.0001957626204238921, "loss": 0.7602538299560547, "step": 2750 }, { "epoch": 1.0597302504816957, "eval_loss": 0.8128464818000793, "eval_runtime": 11.7593, "eval_samples_per_second": 1616.086, "eval_steps_per_second": 33.676, "step": 2750 }, { "epoch": 1.0789980732177264, "grad_norm": 0.8052137494087219, "learning_rate": 0.00019568554913294798, "loss": 0.7421517944335938, "step": 2800 }, { "epoch": 1.0789980732177264, "eval_loss": 0.8002434968948364, "eval_runtime": 11.7568, "eval_samples_per_second": 1616.423, "eval_steps_per_second": 33.683, "step": 2800 }, { "epoch": 1.0982658959537572, "grad_norm": 0.7332647442817688, "learning_rate": 0.00019560847784200387, "loss": 0.7451560974121094, "step": 2850 }, { "epoch": 1.0982658959537572, "eval_loss": 0.8004708290100098, "eval_runtime": 11.7493, "eval_samples_per_second": 1617.453, "eval_steps_per_second": 33.704, "step": 2850 }, { "epoch": 1.117533718689788, "grad_norm": 0.7714581489562988, "learning_rate": 0.00019553140655105975, "loss": 0.7454108428955079, "step": 2900 }, { "epoch": 1.117533718689788, "eval_loss": 0.7875174880027771, "eval_runtime": 11.7368, "eval_samples_per_second": 1619.175, "eval_steps_per_second": 33.74, "step": 2900 }, { "epoch": 1.1368015414258188, "grad_norm": 0.7543232440948486, "learning_rate": 0.0001954543352601156, "loss": 0.7338851165771484, "step": 2950 }, { "epoch": 1.1368015414258188, "eval_loss": 0.79219651222229, "eval_runtime": 11.7441, "eval_samples_per_second": 1618.18, "eval_steps_per_second": 33.719, "step": 2950 }, { "epoch": 1.1560693641618498, "grad_norm": 0.8045482039451599, "learning_rate": 0.0001953772639691715, "loss": 0.7145806121826171, "step": 3000 }, { "epoch": 1.1560693641618498, "eval_loss": 0.7953436374664307, "eval_runtime": 11.7541, "eval_samples_per_second": 1616.793, "eval_steps_per_second": 33.69, "step": 3000 }, { "epoch": 1.1753371868978806, "grad_norm": 0.7869629263877869, "learning_rate": 0.00019530019267822735, "loss": 0.7264592742919922, "step": 3050 }, { "epoch": 1.1753371868978806, "eval_loss": 0.7890883088111877, "eval_runtime": 11.753, "eval_samples_per_second": 1616.946, "eval_steps_per_second": 33.693, "step": 3050 }, { "epoch": 1.1946050096339114, "grad_norm": 0.7804467678070068, "learning_rate": 0.00019522312138728324, "loss": 0.7176795196533203, "step": 3100 }, { "epoch": 1.1946050096339114, "eval_loss": 0.7677463889122009, "eval_runtime": 11.7592, "eval_samples_per_second": 1616.09, "eval_steps_per_second": 33.676, "step": 3100 }, { "epoch": 1.2138728323699421, "grad_norm": 0.8105738759040833, "learning_rate": 0.00019514605009633912, "loss": 0.7134496307373047, "step": 3150 }, { "epoch": 1.2138728323699421, "eval_loss": 0.7625473737716675, "eval_runtime": 11.7687, "eval_samples_per_second": 1614.798, "eval_steps_per_second": 33.649, "step": 3150 }, { "epoch": 1.2331406551059731, "grad_norm": 0.765195906162262, "learning_rate": 0.000195068978805395, "loss": 0.711170654296875, "step": 3200 }, { "epoch": 1.2331406551059731, "eval_loss": 0.762707531452179, "eval_runtime": 11.7549, "eval_samples_per_second": 1616.685, "eval_steps_per_second": 33.688, "step": 3200 }, { "epoch": 1.252408477842004, "grad_norm": 0.7225365042686462, "learning_rate": 0.00019499190751445087, "loss": 0.7032756042480469, "step": 3250 }, { "epoch": 1.252408477842004, "eval_loss": 0.7597822546958923, "eval_runtime": 11.7474, "eval_samples_per_second": 1617.726, "eval_steps_per_second": 33.71, "step": 3250 }, { "epoch": 1.2716763005780347, "grad_norm": 0.6805968284606934, "learning_rate": 0.00019491483622350675, "loss": 0.7095954132080078, "step": 3300 }, { "epoch": 1.2716763005780347, "eval_loss": 0.7700726389884949, "eval_runtime": 11.7657, "eval_samples_per_second": 1615.201, "eval_steps_per_second": 33.657, "step": 3300 }, { "epoch": 1.2909441233140655, "grad_norm": 0.7081642150878906, "learning_rate": 0.00019483776493256264, "loss": 0.6991606903076172, "step": 3350 }, { "epoch": 1.2909441233140655, "eval_loss": 0.7669265866279602, "eval_runtime": 11.7675, "eval_samples_per_second": 1614.961, "eval_steps_per_second": 33.652, "step": 3350 }, { "epoch": 1.3102119460500963, "grad_norm": 0.6798382997512817, "learning_rate": 0.00019476069364161852, "loss": 0.6957772827148437, "step": 3400 }, { "epoch": 1.3102119460500963, "eval_loss": 0.7487348914146423, "eval_runtime": 11.765, "eval_samples_per_second": 1615.304, "eval_steps_per_second": 33.659, "step": 3400 }, { "epoch": 1.3294797687861273, "grad_norm": 0.6709014773368835, "learning_rate": 0.00019468362235067438, "loss": 0.6927545166015625, "step": 3450 }, { "epoch": 1.3294797687861273, "eval_loss": 0.7583490014076233, "eval_runtime": 11.7653, "eval_samples_per_second": 1615.253, "eval_steps_per_second": 33.658, "step": 3450 }, { "epoch": 1.348747591522158, "grad_norm": 0.6835832595825195, "learning_rate": 0.00019460655105973026, "loss": 0.69254150390625, "step": 3500 }, { "epoch": 1.348747591522158, "eval_loss": 0.743072509765625, "eval_runtime": 11.753, "eval_samples_per_second": 1616.95, "eval_steps_per_second": 33.694, "step": 3500 }, { "epoch": 1.3680154142581888, "grad_norm": 0.6507683396339417, "learning_rate": 0.00019452947976878612, "loss": 0.6918922424316406, "step": 3550 }, { "epoch": 1.3680154142581888, "eval_loss": 0.7405670881271362, "eval_runtime": 11.7526, "eval_samples_per_second": 1617.01, "eval_steps_per_second": 33.695, "step": 3550 }, { "epoch": 1.3872832369942196, "grad_norm": 0.7638285756111145, "learning_rate": 0.000194452408477842, "loss": 0.6897285461425782, "step": 3600 }, { "epoch": 1.3872832369942196, "eval_loss": 0.7636016607284546, "eval_runtime": 11.7409, "eval_samples_per_second": 1618.617, "eval_steps_per_second": 33.728, "step": 3600 }, { "epoch": 1.4065510597302504, "grad_norm": 0.7876355051994324, "learning_rate": 0.0001943753371868979, "loss": 0.6803182983398437, "step": 3650 }, { "epoch": 1.4065510597302504, "eval_loss": 0.7356443405151367, "eval_runtime": 11.7388, "eval_samples_per_second": 1618.911, "eval_steps_per_second": 33.734, "step": 3650 }, { "epoch": 1.4258188824662814, "grad_norm": 0.6836740970611572, "learning_rate": 0.00019429826589595378, "loss": 0.6739904022216797, "step": 3700 }, { "epoch": 1.4258188824662814, "eval_loss": 0.7489825487136841, "eval_runtime": 11.7315, "eval_samples_per_second": 1619.909, "eval_steps_per_second": 33.755, "step": 3700 }, { "epoch": 1.4450867052023122, "grad_norm": 0.6540709137916565, "learning_rate": 0.00019422119460500966, "loss": 0.675911865234375, "step": 3750 }, { "epoch": 1.4450867052023122, "eval_loss": 0.7367815375328064, "eval_runtime": 11.7595, "eval_samples_per_second": 1616.062, "eval_steps_per_second": 33.675, "step": 3750 }, { "epoch": 1.464354527938343, "grad_norm": 0.6768763661384583, "learning_rate": 0.00019414412331406552, "loss": 0.6742576599121094, "step": 3800 }, { "epoch": 1.464354527938343, "eval_loss": 0.7461339235305786, "eval_runtime": 11.7455, "eval_samples_per_second": 1617.985, "eval_steps_per_second": 33.715, "step": 3800 }, { "epoch": 1.4836223506743738, "grad_norm": 0.740139365196228, "learning_rate": 0.00019406705202312138, "loss": 0.6692763519287109, "step": 3850 }, { "epoch": 1.4836223506743738, "eval_loss": 0.7398139238357544, "eval_runtime": 11.7488, "eval_samples_per_second": 1617.522, "eval_steps_per_second": 33.705, "step": 3850 }, { "epoch": 1.5028901734104045, "grad_norm": 0.7188435196876526, "learning_rate": 0.00019398998073217726, "loss": 0.6595063781738282, "step": 3900 }, { "epoch": 1.5028901734104045, "eval_loss": 0.7308260798454285, "eval_runtime": 11.7603, "eval_samples_per_second": 1615.949, "eval_steps_per_second": 33.673, "step": 3900 }, { "epoch": 1.5221579961464355, "grad_norm": 0.611328661441803, "learning_rate": 0.00019391290944123315, "loss": 0.660636215209961, "step": 3950 }, { "epoch": 1.5221579961464355, "eval_loss": 0.7358222603797913, "eval_runtime": 11.7664, "eval_samples_per_second": 1615.105, "eval_steps_per_second": 33.655, "step": 3950 }, { "epoch": 1.5414258188824663, "grad_norm": 0.6760669350624084, "learning_rate": 0.00019383583815028903, "loss": 0.6599201202392578, "step": 4000 }, { "epoch": 1.5414258188824663, "eval_loss": 0.7250499725341797, "eval_runtime": 11.7415, "eval_samples_per_second": 1618.53, "eval_steps_per_second": 33.726, "step": 4000 }, { "epoch": 1.560693641618497, "grad_norm": 0.7719911336898804, "learning_rate": 0.00019375876685934492, "loss": 0.6521245574951172, "step": 4050 }, { "epoch": 1.560693641618497, "eval_loss": 0.7325279116630554, "eval_runtime": 11.7498, "eval_samples_per_second": 1617.386, "eval_steps_per_second": 33.703, "step": 4050 }, { "epoch": 1.579961464354528, "grad_norm": 0.7071899175643921, "learning_rate": 0.0001936816955684008, "loss": 0.658658218383789, "step": 4100 }, { "epoch": 1.579961464354528, "eval_loss": 0.7083977460861206, "eval_runtime": 11.7651, "eval_samples_per_second": 1615.281, "eval_steps_per_second": 33.659, "step": 4100 }, { "epoch": 1.5992292870905587, "grad_norm": 0.6942433714866638, "learning_rate": 0.00019360462427745666, "loss": 0.6526612854003906, "step": 4150 }, { "epoch": 1.5992292870905587, "eval_loss": 0.707388699054718, "eval_runtime": 11.7453, "eval_samples_per_second": 1618.008, "eval_steps_per_second": 33.716, "step": 4150 }, { "epoch": 1.6184971098265897, "grad_norm": 0.6720093488693237, "learning_rate": 0.00019352755298651252, "loss": 0.6444657897949219, "step": 4200 }, { "epoch": 1.6184971098265897, "eval_loss": 0.708336353302002, "eval_runtime": 11.749, "eval_samples_per_second": 1617.496, "eval_steps_per_second": 33.705, "step": 4200 }, { "epoch": 1.6377649325626205, "grad_norm": 0.6178760528564453, "learning_rate": 0.0001934504816955684, "loss": 0.6516685485839844, "step": 4250 }, { "epoch": 1.6377649325626205, "eval_loss": 0.7173527479171753, "eval_runtime": 11.75, "eval_samples_per_second": 1617.365, "eval_steps_per_second": 33.702, "step": 4250 }, { "epoch": 1.6570327552986512, "grad_norm": 0.6757171154022217, "learning_rate": 0.00019337341040462429, "loss": 0.6455426025390625, "step": 4300 }, { "epoch": 1.6570327552986512, "eval_loss": 0.7112250328063965, "eval_runtime": 11.7495, "eval_samples_per_second": 1617.424, "eval_steps_per_second": 33.703, "step": 4300 }, { "epoch": 1.6763005780346822, "grad_norm": 0.6455113887786865, "learning_rate": 0.00019329633911368017, "loss": 0.6472171020507812, "step": 4350 }, { "epoch": 1.6763005780346822, "eval_loss": 0.7111583948135376, "eval_runtime": 11.7563, "eval_samples_per_second": 1616.493, "eval_steps_per_second": 33.684, "step": 4350 }, { "epoch": 1.6955684007707128, "grad_norm": 0.603722870349884, "learning_rate": 0.00019321926782273606, "loss": 0.6337673568725586, "step": 4400 }, { "epoch": 1.6955684007707128, "eval_loss": 0.7154005169868469, "eval_runtime": 11.7503, "eval_samples_per_second": 1617.32, "eval_steps_per_second": 33.701, "step": 4400 }, { "epoch": 1.7148362235067438, "grad_norm": 0.7349875569343567, "learning_rate": 0.00019314219653179191, "loss": 0.6415390014648438, "step": 4450 }, { "epoch": 1.7148362235067438, "eval_loss": 0.6985453963279724, "eval_runtime": 11.7835, "eval_samples_per_second": 1612.762, "eval_steps_per_second": 33.606, "step": 4450 }, { "epoch": 1.7341040462427746, "grad_norm": 0.622351348400116, "learning_rate": 0.0001930651252408478, "loss": 0.6380974578857422, "step": 4500 }, { "epoch": 1.7341040462427746, "eval_loss": 0.7044574022293091, "eval_runtime": 11.7337, "eval_samples_per_second": 1619.613, "eval_steps_per_second": 33.749, "step": 4500 }, { "epoch": 1.7533718689788054, "grad_norm": 0.6551128029823303, "learning_rate": 0.00019298805394990366, "loss": 0.6292387390136719, "step": 4550 }, { "epoch": 1.7533718689788054, "eval_loss": 0.7003881931304932, "eval_runtime": 11.7633, "eval_samples_per_second": 1615.537, "eval_steps_per_second": 33.664, "step": 4550 }, { "epoch": 1.7726396917148364, "grad_norm": 0.654995858669281, "learning_rate": 0.00019291098265895954, "loss": 0.6249727630615234, "step": 4600 }, { "epoch": 1.7726396917148364, "eval_loss": 0.6954051852226257, "eval_runtime": 11.7601, "eval_samples_per_second": 1615.977, "eval_steps_per_second": 33.673, "step": 4600 }, { "epoch": 1.791907514450867, "grad_norm": 0.6905754804611206, "learning_rate": 0.00019283391136801543, "loss": 0.6322254562377929, "step": 4650 }, { "epoch": 1.791907514450867, "eval_loss": 0.6908088326454163, "eval_runtime": 11.7829, "eval_samples_per_second": 1612.845, "eval_steps_per_second": 33.608, "step": 4650 }, { "epoch": 1.811175337186898, "grad_norm": 0.6779332756996155, "learning_rate": 0.0001927568400770713, "loss": 0.6285964202880859, "step": 4700 }, { "epoch": 1.811175337186898, "eval_loss": 0.6871103644371033, "eval_runtime": 11.7419, "eval_samples_per_second": 1618.474, "eval_steps_per_second": 33.725, "step": 4700 }, { "epoch": 1.8304431599229287, "grad_norm": 0.6830115914344788, "learning_rate": 0.00019267976878612717, "loss": 0.6160481262207032, "step": 4750 }, { "epoch": 1.8304431599229287, "eval_loss": 0.6783779263496399, "eval_runtime": 11.7636, "eval_samples_per_second": 1615.496, "eval_steps_per_second": 33.663, "step": 4750 }, { "epoch": 1.8497109826589595, "grad_norm": 0.6607063412666321, "learning_rate": 0.00019260269749518305, "loss": 0.6207050704956054, "step": 4800 }, { "epoch": 1.8497109826589595, "eval_loss": 0.6837959289550781, "eval_runtime": 11.8333, "eval_samples_per_second": 1605.976, "eval_steps_per_second": 33.465, "step": 4800 }, { "epoch": 1.8689788053949905, "grad_norm": 0.640997588634491, "learning_rate": 0.00019252562620423894, "loss": 0.6250300216674805, "step": 4850 }, { "epoch": 1.8689788053949905, "eval_loss": 0.6962457895278931, "eval_runtime": 11.7449, "eval_samples_per_second": 1618.07, "eval_steps_per_second": 33.717, "step": 4850 }, { "epoch": 1.888246628131021, "grad_norm": 0.5522756576538086, "learning_rate": 0.0001924485549132948, "loss": 0.6155550384521484, "step": 4900 }, { "epoch": 1.888246628131021, "eval_loss": 0.686696469783783, "eval_runtime": 11.7643, "eval_samples_per_second": 1615.395, "eval_steps_per_second": 33.661, "step": 4900 }, { "epoch": 1.907514450867052, "grad_norm": 0.6492411494255066, "learning_rate": 0.00019237148362235068, "loss": 0.6112504577636719, "step": 4950 }, { "epoch": 1.907514450867052, "eval_loss": 0.6840726137161255, "eval_runtime": 11.7483, "eval_samples_per_second": 1617.593, "eval_steps_per_second": 33.707, "step": 4950 }, { "epoch": 1.9267822736030829, "grad_norm": 0.6509422659873962, "learning_rate": 0.00019229441233140657, "loss": 0.6117168426513672, "step": 5000 }, { "epoch": 1.9267822736030829, "eval_loss": 0.68156898021698, "eval_runtime": 11.7643, "eval_samples_per_second": 1615.399, "eval_steps_per_second": 33.661, "step": 5000 }, { "epoch": 1.9460500963391136, "grad_norm": 0.6427932381629944, "learning_rate": 0.00019221734104046242, "loss": 0.6160677719116211, "step": 5050 }, { "epoch": 1.9460500963391136, "eval_loss": 0.6653590798377991, "eval_runtime": 11.7609, "eval_samples_per_second": 1615.861, "eval_steps_per_second": 33.671, "step": 5050 }, { "epoch": 1.9653179190751446, "grad_norm": 0.712970495223999, "learning_rate": 0.0001921402697495183, "loss": 0.6110684204101563, "step": 5100 }, { "epoch": 1.9653179190751446, "eval_loss": 0.6723642349243164, "eval_runtime": 11.764, "eval_samples_per_second": 1615.439, "eval_steps_per_second": 33.662, "step": 5100 }, { "epoch": 1.9845857418111752, "grad_norm": 0.6063272953033447, "learning_rate": 0.0001920631984585742, "loss": 0.6059880828857422, "step": 5150 }, { "epoch": 1.9845857418111752, "eval_loss": 0.6600672602653503, "eval_runtime": 11.7591, "eval_samples_per_second": 1616.104, "eval_steps_per_second": 33.676, "step": 5150 }, { "epoch": 2.003853564547206, "grad_norm": 0.6128818392753601, "learning_rate": 0.00019198612716763008, "loss": 0.6065807342529297, "step": 5200 }, { "epoch": 2.003853564547206, "eval_loss": 0.6653997898101807, "eval_runtime": 11.7428, "eval_samples_per_second": 1618.354, "eval_steps_per_second": 33.723, "step": 5200 }, { "epoch": 2.023121387283237, "grad_norm": 0.59360271692276, "learning_rate": 0.00019190905587668596, "loss": 0.6008117294311524, "step": 5250 }, { "epoch": 2.023121387283237, "eval_loss": 0.6623563170433044, "eval_runtime": 11.765, "eval_samples_per_second": 1615.298, "eval_steps_per_second": 33.659, "step": 5250 }, { "epoch": 2.0423892100192678, "grad_norm": 0.5774858593940735, "learning_rate": 0.00019183198458574182, "loss": 0.590107192993164, "step": 5300 }, { "epoch": 2.0423892100192678, "eval_loss": 0.6823951601982117, "eval_runtime": 11.7525, "eval_samples_per_second": 1617.012, "eval_steps_per_second": 33.695, "step": 5300 }, { "epoch": 2.0616570327552988, "grad_norm": 0.6577916741371155, "learning_rate": 0.00019175491329479768, "loss": 0.5975875854492188, "step": 5350 }, { "epoch": 2.0616570327552988, "eval_loss": 0.6666197180747986, "eval_runtime": 11.7779, "eval_samples_per_second": 1613.528, "eval_steps_per_second": 33.622, "step": 5350 }, { "epoch": 2.0809248554913293, "grad_norm": 0.6946585178375244, "learning_rate": 0.00019167784200385356, "loss": 0.6030185699462891, "step": 5400 }, { "epoch": 2.0809248554913293, "eval_loss": 0.6608912348747253, "eval_runtime": 11.7602, "eval_samples_per_second": 1615.96, "eval_steps_per_second": 33.673, "step": 5400 }, { "epoch": 2.1001926782273603, "grad_norm": 0.542421817779541, "learning_rate": 0.00019160077071290945, "loss": 0.5967891693115235, "step": 5450 }, { "epoch": 2.1001926782273603, "eval_loss": 0.6573489904403687, "eval_runtime": 11.7632, "eval_samples_per_second": 1615.543, "eval_steps_per_second": 33.664, "step": 5450 }, { "epoch": 2.1194605009633913, "grad_norm": 0.5841713547706604, "learning_rate": 0.00019152369942196533, "loss": 0.5885901641845703, "step": 5500 }, { "epoch": 2.1194605009633913, "eval_loss": 0.656871497631073, "eval_runtime": 11.7501, "eval_samples_per_second": 1617.351, "eval_steps_per_second": 33.702, "step": 5500 }, { "epoch": 2.138728323699422, "grad_norm": 0.5896437168121338, "learning_rate": 0.00019144662813102122, "loss": 0.589515380859375, "step": 5550 }, { "epoch": 2.138728323699422, "eval_loss": 0.6517202258110046, "eval_runtime": 11.7494, "eval_samples_per_second": 1617.443, "eval_steps_per_second": 33.704, "step": 5550 }, { "epoch": 2.157996146435453, "grad_norm": 0.6488344669342041, "learning_rate": 0.0001913695568400771, "loss": 0.5839699935913086, "step": 5600 }, { "epoch": 2.157996146435453, "eval_loss": 0.6614031195640564, "eval_runtime": 11.7652, "eval_samples_per_second": 1615.273, "eval_steps_per_second": 33.659, "step": 5600 }, { "epoch": 2.1772639691714835, "grad_norm": 0.6557515859603882, "learning_rate": 0.00019129248554913296, "loss": 0.5809717941284179, "step": 5650 }, { "epoch": 2.1772639691714835, "eval_loss": 0.640534520149231, "eval_runtime": 11.7496, "eval_samples_per_second": 1617.423, "eval_steps_per_second": 33.703, "step": 5650 }, { "epoch": 2.1965317919075145, "grad_norm": 0.6391697525978088, "learning_rate": 0.00019121541425818882, "loss": 0.5836068725585938, "step": 5700 }, { "epoch": 2.1965317919075145, "eval_loss": 0.654788613319397, "eval_runtime": 11.7567, "eval_samples_per_second": 1616.437, "eval_steps_per_second": 33.683, "step": 5700 }, { "epoch": 2.2157996146435455, "grad_norm": 0.5705830454826355, "learning_rate": 0.0001911383429672447, "loss": 0.5864692306518555, "step": 5750 }, { "epoch": 2.2157996146435455, "eval_loss": 0.6632187366485596, "eval_runtime": 11.7548, "eval_samples_per_second": 1616.706, "eval_steps_per_second": 33.688, "step": 5750 }, { "epoch": 2.235067437379576, "grad_norm": 0.5806713104248047, "learning_rate": 0.0001910612716763006, "loss": 0.5800092315673828, "step": 5800 }, { "epoch": 2.235067437379576, "eval_loss": 0.6513479948043823, "eval_runtime": 11.7791, "eval_samples_per_second": 1613.368, "eval_steps_per_second": 33.619, "step": 5800 }, { "epoch": 2.254335260115607, "grad_norm": 0.5774704813957214, "learning_rate": 0.00019098420038535647, "loss": 0.58150634765625, "step": 5850 }, { "epoch": 2.254335260115607, "eval_loss": 0.6450129151344299, "eval_runtime": 11.7686, "eval_samples_per_second": 1614.804, "eval_steps_per_second": 33.649, "step": 5850 }, { "epoch": 2.2736030828516376, "grad_norm": 0.6218037009239197, "learning_rate": 0.00019090712909441236, "loss": 0.5769814682006836, "step": 5900 }, { "epoch": 2.2736030828516376, "eval_loss": 0.6484137773513794, "eval_runtime": 11.789, "eval_samples_per_second": 1612.006, "eval_steps_per_second": 33.591, "step": 5900 }, { "epoch": 2.2928709055876686, "grad_norm": 0.615885317325592, "learning_rate": 0.00019083005780346822, "loss": 0.5799093627929688, "step": 5950 }, { "epoch": 2.2928709055876686, "eval_loss": 0.6442128419876099, "eval_runtime": 11.799, "eval_samples_per_second": 1610.647, "eval_steps_per_second": 33.562, "step": 5950 }, { "epoch": 2.3121387283236996, "grad_norm": 0.5743442177772522, "learning_rate": 0.0001907529865125241, "loss": 0.5728417205810546, "step": 6000 }, { "epoch": 2.3121387283236996, "eval_loss": 0.6562144160270691, "eval_runtime": 11.7531, "eval_samples_per_second": 1616.938, "eval_steps_per_second": 33.693, "step": 6000 }, { "epoch": 2.33140655105973, "grad_norm": 0.6700494289398193, "learning_rate": 0.00019067591522157996, "loss": 0.5660633468627929, "step": 6050 }, { "epoch": 2.33140655105973, "eval_loss": 0.6471076607704163, "eval_runtime": 11.7748, "eval_samples_per_second": 1613.952, "eval_steps_per_second": 33.631, "step": 6050 }, { "epoch": 2.350674373795761, "grad_norm": 0.6334522366523743, "learning_rate": 0.00019059884393063584, "loss": 0.5654730224609374, "step": 6100 }, { "epoch": 2.350674373795761, "eval_loss": 0.631842851638794, "eval_runtime": 11.7604, "eval_samples_per_second": 1615.928, "eval_steps_per_second": 33.672, "step": 6100 }, { "epoch": 2.3699421965317917, "grad_norm": 0.5829822421073914, "learning_rate": 0.00019052177263969173, "loss": 0.5722093963623047, "step": 6150 }, { "epoch": 2.3699421965317917, "eval_loss": 0.635600745677948, "eval_runtime": 11.7661, "eval_samples_per_second": 1615.143, "eval_steps_per_second": 33.656, "step": 6150 }, { "epoch": 2.3892100192678227, "grad_norm": 0.5980924963951111, "learning_rate": 0.00019044470134874761, "loss": 0.5730614471435547, "step": 6200 }, { "epoch": 2.3892100192678227, "eval_loss": 0.6208349466323853, "eval_runtime": 11.7599, "eval_samples_per_second": 1616.004, "eval_steps_per_second": 33.674, "step": 6200 }, { "epoch": 2.4084778420038537, "grad_norm": 0.5710204243659973, "learning_rate": 0.00019036763005780347, "loss": 0.563860092163086, "step": 6250 }, { "epoch": 2.4084778420038537, "eval_loss": 0.6325879693031311, "eval_runtime": 11.7545, "eval_samples_per_second": 1616.74, "eval_steps_per_second": 33.689, "step": 6250 }, { "epoch": 2.4277456647398843, "grad_norm": 0.6150076985359192, "learning_rate": 0.00019029055876685936, "loss": 0.5657758331298828, "step": 6300 }, { "epoch": 2.4277456647398843, "eval_loss": 0.6412080526351929, "eval_runtime": 11.7607, "eval_samples_per_second": 1615.884, "eval_steps_per_second": 33.671, "step": 6300 }, { "epoch": 2.4470134874759153, "grad_norm": 0.6363497376441956, "learning_rate": 0.00019021348747591524, "loss": 0.565108642578125, "step": 6350 }, { "epoch": 2.4470134874759153, "eval_loss": 0.6272048354148865, "eval_runtime": 11.7547, "eval_samples_per_second": 1616.715, "eval_steps_per_second": 33.689, "step": 6350 }, { "epoch": 2.4662813102119463, "grad_norm": 0.6153821349143982, "learning_rate": 0.0001901364161849711, "loss": 0.5622220611572266, "step": 6400 }, { "epoch": 2.4662813102119463, "eval_loss": 0.6397174000740051, "eval_runtime": 11.762, "eval_samples_per_second": 1615.707, "eval_steps_per_second": 33.668, "step": 6400 }, { "epoch": 2.485549132947977, "grad_norm": 0.6426052451133728, "learning_rate": 0.00019005934489402698, "loss": 0.5594404220581055, "step": 6450 }, { "epoch": 2.485549132947977, "eval_loss": 0.6287122368812561, "eval_runtime": 11.7479, "eval_samples_per_second": 1617.648, "eval_steps_per_second": 33.708, "step": 6450 }, { "epoch": 2.504816955684008, "grad_norm": 0.600919246673584, "learning_rate": 0.00018998227360308284, "loss": 0.5591336059570312, "step": 6500 }, { "epoch": 2.504816955684008, "eval_loss": 0.6275458931922913, "eval_runtime": 11.7495, "eval_samples_per_second": 1617.428, "eval_steps_per_second": 33.703, "step": 6500 }, { "epoch": 2.5240847784200384, "grad_norm": 0.6406385898590088, "learning_rate": 0.00018990520231213873, "loss": 0.5618526458740234, "step": 6550 }, { "epoch": 2.5240847784200384, "eval_loss": 0.6250490546226501, "eval_runtime": 11.7544, "eval_samples_per_second": 1616.754, "eval_steps_per_second": 33.689, "step": 6550 }, { "epoch": 2.5433526011560694, "grad_norm": 0.5960623621940613, "learning_rate": 0.0001898281310211946, "loss": 0.564210319519043, "step": 6600 }, { "epoch": 2.5433526011560694, "eval_loss": 0.6141932606697083, "eval_runtime": 11.7765, "eval_samples_per_second": 1613.72, "eval_steps_per_second": 33.626, "step": 6600 }, { "epoch": 2.5626204238921, "grad_norm": 0.6122861504554749, "learning_rate": 0.0001897510597302505, "loss": 0.5627736663818359, "step": 6650 }, { "epoch": 2.5626204238921, "eval_loss": 0.6229286789894104, "eval_runtime": 11.7609, "eval_samples_per_second": 1615.868, "eval_steps_per_second": 33.671, "step": 6650 }, { "epoch": 2.581888246628131, "grad_norm": 0.633362352848053, "learning_rate": 0.00018967398843930638, "loss": 0.5545833969116211, "step": 6700 }, { "epoch": 2.581888246628131, "eval_loss": 0.628903865814209, "eval_runtime": 11.7612, "eval_samples_per_second": 1615.82, "eval_steps_per_second": 33.67, "step": 6700 }, { "epoch": 2.601156069364162, "grad_norm": 0.6064343452453613, "learning_rate": 0.00018959691714836224, "loss": 0.5525814056396484, "step": 6750 }, { "epoch": 2.601156069364162, "eval_loss": 0.6245915293693542, "eval_runtime": 11.7647, "eval_samples_per_second": 1615.341, "eval_steps_per_second": 33.66, "step": 6750 }, { "epoch": 2.6204238921001926, "grad_norm": 0.642774760723114, "learning_rate": 0.0001895198458574181, "loss": 0.5488033676147461, "step": 6800 }, { "epoch": 2.6204238921001926, "eval_loss": 0.6279101371765137, "eval_runtime": 11.7706, "eval_samples_per_second": 1614.537, "eval_steps_per_second": 33.643, "step": 6800 }, { "epoch": 2.6396917148362236, "grad_norm": 0.5844165682792664, "learning_rate": 0.00018944277456647398, "loss": 0.559207763671875, "step": 6850 }, { "epoch": 2.6396917148362236, "eval_loss": 0.6072271466255188, "eval_runtime": 11.7825, "eval_samples_per_second": 1612.903, "eval_steps_per_second": 33.609, "step": 6850 }, { "epoch": 2.6589595375722546, "grad_norm": 0.5651281476020813, "learning_rate": 0.00018936570327552987, "loss": 0.5525361633300782, "step": 6900 }, { "epoch": 2.6589595375722546, "eval_loss": 0.6185606718063354, "eval_runtime": 11.7745, "eval_samples_per_second": 1613.99, "eval_steps_per_second": 33.632, "step": 6900 }, { "epoch": 2.678227360308285, "grad_norm": 0.6769030690193176, "learning_rate": 0.00018928863198458575, "loss": 0.5407891464233399, "step": 6950 }, { "epoch": 2.678227360308285, "eval_loss": 0.6081722974777222, "eval_runtime": 11.7919, "eval_samples_per_second": 1611.613, "eval_steps_per_second": 33.582, "step": 6950 }, { "epoch": 2.697495183044316, "grad_norm": 0.5626652240753174, "learning_rate": 0.00018921156069364164, "loss": 0.5540246200561524, "step": 7000 }, { "epoch": 2.697495183044316, "eval_loss": 0.6142093539237976, "eval_runtime": 11.7574, "eval_samples_per_second": 1616.35, "eval_steps_per_second": 33.681, "step": 7000 }, { "epoch": 2.7167630057803467, "grad_norm": 0.5486749410629272, "learning_rate": 0.00018913448940269752, "loss": 0.5495675659179687, "step": 7050 }, { "epoch": 2.7167630057803467, "eval_loss": 0.6160227656364441, "eval_runtime": 11.7687, "eval_samples_per_second": 1614.797, "eval_steps_per_second": 33.649, "step": 7050 }, { "epoch": 2.7360308285163777, "grad_norm": 0.5408969521522522, "learning_rate": 0.00018905741811175338, "loss": 0.5486634826660156, "step": 7100 }, { "epoch": 2.7360308285163777, "eval_loss": 0.6077710390090942, "eval_runtime": 11.7778, "eval_samples_per_second": 1613.543, "eval_steps_per_second": 33.623, "step": 7100 }, { "epoch": 2.7552986512524082, "grad_norm": 0.5238296985626221, "learning_rate": 0.00018898034682080926, "loss": 0.5523670578002929, "step": 7150 }, { "epoch": 2.7552986512524082, "eval_loss": 0.6196697354316711, "eval_runtime": 11.7863, "eval_samples_per_second": 1612.384, "eval_steps_per_second": 33.598, "step": 7150 }, { "epoch": 2.7745664739884393, "grad_norm": 0.5837554931640625, "learning_rate": 0.00018890327552986512, "loss": 0.5453910446166992, "step": 7200 }, { "epoch": 2.7745664739884393, "eval_loss": 0.6161477565765381, "eval_runtime": 11.76, "eval_samples_per_second": 1615.984, "eval_steps_per_second": 33.673, "step": 7200 }, { "epoch": 2.7938342967244703, "grad_norm": 0.5618086457252502, "learning_rate": 0.000188826204238921, "loss": 0.5410779571533203, "step": 7250 }, { "epoch": 2.7938342967244703, "eval_loss": 0.6196335554122925, "eval_runtime": 11.7649, "eval_samples_per_second": 1615.313, "eval_steps_per_second": 33.659, "step": 7250 }, { "epoch": 2.813102119460501, "grad_norm": 0.5747096538543701, "learning_rate": 0.0001887491329479769, "loss": 0.53860595703125, "step": 7300 }, { "epoch": 2.813102119460501, "eval_loss": 0.6052906513214111, "eval_runtime": 11.7653, "eval_samples_per_second": 1615.257, "eval_steps_per_second": 33.658, "step": 7300 }, { "epoch": 2.832369942196532, "grad_norm": 0.5254901051521301, "learning_rate": 0.00018867206165703278, "loss": 0.5395934295654297, "step": 7350 }, { "epoch": 2.832369942196532, "eval_loss": 0.6092073917388916, "eval_runtime": 11.7728, "eval_samples_per_second": 1614.229, "eval_steps_per_second": 33.637, "step": 7350 }, { "epoch": 2.851637764932563, "grad_norm": 0.6120988130569458, "learning_rate": 0.00018859499036608863, "loss": 0.537706298828125, "step": 7400 }, { "epoch": 2.851637764932563, "eval_loss": 0.6123009920120239, "eval_runtime": 11.7554, "eval_samples_per_second": 1616.619, "eval_steps_per_second": 33.687, "step": 7400 }, { "epoch": 2.8709055876685934, "grad_norm": 0.5506011843681335, "learning_rate": 0.00018851791907514452, "loss": 0.5340191650390625, "step": 7450 }, { "epoch": 2.8709055876685934, "eval_loss": 0.6140795350074768, "eval_runtime": 11.764, "eval_samples_per_second": 1615.431, "eval_steps_per_second": 33.662, "step": 7450 }, { "epoch": 2.8901734104046244, "grad_norm": 0.5737815499305725, "learning_rate": 0.0001884408477842004, "loss": 0.5365435028076172, "step": 7500 }, { "epoch": 2.8901734104046244, "eval_loss": 0.6090665459632874, "eval_runtime": 11.7794, "eval_samples_per_second": 1613.322, "eval_steps_per_second": 33.618, "step": 7500 }, { "epoch": 2.909441233140655, "grad_norm": 0.5369409918785095, "learning_rate": 0.00018836377649325626, "loss": 0.5379184722900391, "step": 7550 }, { "epoch": 2.909441233140655, "eval_loss": 0.6087771058082581, "eval_runtime": 11.7635, "eval_samples_per_second": 1615.502, "eval_steps_per_second": 33.663, "step": 7550 }, { "epoch": 2.928709055876686, "grad_norm": 0.5845835208892822, "learning_rate": 0.00018828670520231215, "loss": 0.534803466796875, "step": 7600 }, { "epoch": 2.928709055876686, "eval_loss": 0.6000416278839111, "eval_runtime": 11.7603, "eval_samples_per_second": 1615.949, "eval_steps_per_second": 33.673, "step": 7600 }, { "epoch": 2.9479768786127165, "grad_norm": 0.6331714987754822, "learning_rate": 0.00018820963391136803, "loss": 0.5358788681030273, "step": 7650 }, { "epoch": 2.9479768786127165, "eval_loss": 0.6086013913154602, "eval_runtime": 11.7825, "eval_samples_per_second": 1612.902, "eval_steps_per_second": 33.609, "step": 7650 }, { "epoch": 2.9672447013487475, "grad_norm": 0.5339002013206482, "learning_rate": 0.0001881325626204239, "loss": 0.5269888305664062, "step": 7700 }, { "epoch": 2.9672447013487475, "eval_loss": 0.6162144541740417, "eval_runtime": 11.7733, "eval_samples_per_second": 1614.167, "eval_steps_per_second": 33.636, "step": 7700 }, { "epoch": 2.9865125240847785, "grad_norm": 0.5314202308654785, "learning_rate": 0.00018805549132947977, "loss": 0.5301641082763672, "step": 7750 }, { "epoch": 2.9865125240847785, "eval_loss": 0.6147223711013794, "eval_runtime": 11.7661, "eval_samples_per_second": 1615.151, "eval_steps_per_second": 33.656, "step": 7750 }, { "epoch": 3.005780346820809, "grad_norm": 0.522209107875824, "learning_rate": 0.00018797842003853566, "loss": 0.528267822265625, "step": 7800 }, { "epoch": 3.005780346820809, "eval_loss": 0.6137077808380127, "eval_runtime": 11.763, "eval_samples_per_second": 1615.576, "eval_steps_per_second": 33.665, "step": 7800 }, { "epoch": 3.02504816955684, "grad_norm": 0.532255232334137, "learning_rate": 0.00018790134874759154, "loss": 0.53183837890625, "step": 7850 }, { "epoch": 3.02504816955684, "eval_loss": 0.5966995358467102, "eval_runtime": 11.7702, "eval_samples_per_second": 1614.589, "eval_steps_per_second": 33.644, "step": 7850 }, { "epoch": 3.044315992292871, "grad_norm": 0.5224747061729431, "learning_rate": 0.0001878242774566474, "loss": 0.5219392013549805, "step": 7900 }, { "epoch": 3.044315992292871, "eval_loss": 0.5935191512107849, "eval_runtime": 11.7613, "eval_samples_per_second": 1615.811, "eval_steps_per_second": 33.67, "step": 7900 }, { "epoch": 3.0635838150289016, "grad_norm": 0.5991969704627991, "learning_rate": 0.0001877472061657033, "loss": 0.5260236358642578, "step": 7950 }, { "epoch": 3.0635838150289016, "eval_loss": 0.5973201990127563, "eval_runtime": 11.7723, "eval_samples_per_second": 1614.297, "eval_steps_per_second": 33.638, "step": 7950 }, { "epoch": 3.0828516377649327, "grad_norm": 0.583088219165802, "learning_rate": 0.00018767013487475914, "loss": 0.5255516815185547, "step": 8000 }, { "epoch": 3.0828516377649327, "eval_loss": 0.5987491607666016, "eval_runtime": 11.7714, "eval_samples_per_second": 1614.415, "eval_steps_per_second": 33.641, "step": 8000 }, { "epoch": 3.102119460500963, "grad_norm": 0.5187159180641174, "learning_rate": 0.00018759306358381503, "loss": 0.5250172042846679, "step": 8050 }, { "epoch": 3.102119460500963, "eval_loss": 0.596394419670105, "eval_runtime": 11.7594, "eval_samples_per_second": 1616.063, "eval_steps_per_second": 33.675, "step": 8050 }, { "epoch": 3.121387283236994, "grad_norm": 0.5229313373565674, "learning_rate": 0.00018751599229287091, "loss": 0.5242057418823243, "step": 8100 }, { "epoch": 3.121387283236994, "eval_loss": 0.5977076888084412, "eval_runtime": 11.7666, "eval_samples_per_second": 1615.074, "eval_steps_per_second": 33.654, "step": 8100 }, { "epoch": 3.140655105973025, "grad_norm": 0.565708577632904, "learning_rate": 0.0001874389210019268, "loss": 0.5285503768920898, "step": 8150 }, { "epoch": 3.140655105973025, "eval_loss": 0.5815967917442322, "eval_runtime": 11.7695, "eval_samples_per_second": 1614.68, "eval_steps_per_second": 33.646, "step": 8150 }, { "epoch": 3.159922928709056, "grad_norm": 0.5821714997291565, "learning_rate": 0.00018736184971098268, "loss": 0.518197021484375, "step": 8200 }, { "epoch": 3.159922928709056, "eval_loss": 0.5904550552368164, "eval_runtime": 11.7715, "eval_samples_per_second": 1614.401, "eval_steps_per_second": 33.64, "step": 8200 }, { "epoch": 3.179190751445087, "grad_norm": 0.5481347441673279, "learning_rate": 0.00018728477842003854, "loss": 0.5211473083496094, "step": 8250 }, { "epoch": 3.179190751445087, "eval_loss": 0.592438280582428, "eval_runtime": 11.7718, "eval_samples_per_second": 1614.366, "eval_steps_per_second": 33.64, "step": 8250 }, { "epoch": 3.1984585741811173, "grad_norm": 0.5324996709823608, "learning_rate": 0.0001872077071290944, "loss": 0.5252375411987305, "step": 8300 }, { "epoch": 3.1984585741811173, "eval_loss": 0.5905559062957764, "eval_runtime": 11.766, "eval_samples_per_second": 1615.161, "eval_steps_per_second": 33.656, "step": 8300 }, { "epoch": 3.2177263969171483, "grad_norm": 0.5032946467399597, "learning_rate": 0.00018713063583815028, "loss": 0.5133762359619141, "step": 8350 }, { "epoch": 3.2177263969171483, "eval_loss": 0.5890920758247375, "eval_runtime": 11.7606, "eval_samples_per_second": 1615.905, "eval_steps_per_second": 33.672, "step": 8350 }, { "epoch": 3.2369942196531793, "grad_norm": 0.5387537479400635, "learning_rate": 0.00018705356454720617, "loss": 0.515871696472168, "step": 8400 }, { "epoch": 3.2369942196531793, "eval_loss": 0.5889708995819092, "eval_runtime": 11.764, "eval_samples_per_second": 1615.444, "eval_steps_per_second": 33.662, "step": 8400 }, { "epoch": 3.25626204238921, "grad_norm": 0.5985877513885498, "learning_rate": 0.00018697649325626205, "loss": 0.5177163696289062, "step": 8450 }, { "epoch": 3.25626204238921, "eval_loss": 0.5866624116897583, "eval_runtime": 11.7474, "eval_samples_per_second": 1617.725, "eval_steps_per_second": 33.71, "step": 8450 }, { "epoch": 3.275529865125241, "grad_norm": 0.5255807638168335, "learning_rate": 0.00018689942196531794, "loss": 0.5219387817382812, "step": 8500 }, { "epoch": 3.275529865125241, "eval_loss": 0.58152836561203, "eval_runtime": 11.7812, "eval_samples_per_second": 1613.084, "eval_steps_per_second": 33.613, "step": 8500 }, { "epoch": 3.294797687861272, "grad_norm": 0.5529633164405823, "learning_rate": 0.00018682235067437382, "loss": 0.5175283432006836, "step": 8550 }, { "epoch": 3.294797687861272, "eval_loss": 0.5826069712638855, "eval_runtime": 11.781, "eval_samples_per_second": 1613.102, "eval_steps_per_second": 33.613, "step": 8550 }, { "epoch": 3.3140655105973025, "grad_norm": 0.5599907040596008, "learning_rate": 0.00018674527938342968, "loss": 0.5167975997924805, "step": 8600 }, { "epoch": 3.3140655105973025, "eval_loss": 0.5896000862121582, "eval_runtime": 11.7584, "eval_samples_per_second": 1616.209, "eval_steps_per_second": 33.678, "step": 8600 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5085728764533997, "learning_rate": 0.00018666820809248554, "loss": 0.5114879608154297, "step": 8650 }, { "epoch": 3.3333333333333335, "eval_loss": 0.5988126397132874, "eval_runtime": 11.7576, "eval_samples_per_second": 1616.313, "eval_steps_per_second": 33.68, "step": 8650 }, { "epoch": 3.352601156069364, "grad_norm": 0.604789137840271, "learning_rate": 0.00018659113680154142, "loss": 0.5019680786132813, "step": 8700 }, { "epoch": 3.352601156069364, "eval_loss": 0.5737500190734863, "eval_runtime": 11.7542, "eval_samples_per_second": 1616.783, "eval_steps_per_second": 33.69, "step": 8700 }, { "epoch": 3.371868978805395, "grad_norm": 0.5371202826499939, "learning_rate": 0.0001865140655105973, "loss": 0.5171105194091797, "step": 8750 }, { "epoch": 3.371868978805395, "eval_loss": 0.5800595879554749, "eval_runtime": 11.7692, "eval_samples_per_second": 1614.725, "eval_steps_per_second": 33.647, "step": 8750 }, { "epoch": 3.3911368015414256, "grad_norm": 0.6188752055168152, "learning_rate": 0.0001864369942196532, "loss": 0.5068273544311523, "step": 8800 }, { "epoch": 3.3911368015414256, "eval_loss": 0.59019535779953, "eval_runtime": 11.7609, "eval_samples_per_second": 1615.86, "eval_steps_per_second": 33.671, "step": 8800 }, { "epoch": 3.4104046242774566, "grad_norm": 0.5636101961135864, "learning_rate": 0.00018635992292870908, "loss": 0.5099409103393555, "step": 8850 }, { "epoch": 3.4104046242774566, "eval_loss": 0.5903280973434448, "eval_runtime": 11.7683, "eval_samples_per_second": 1614.845, "eval_steps_per_second": 33.65, "step": 8850 }, { "epoch": 3.4296724470134876, "grad_norm": 0.5697704553604126, "learning_rate": 0.00018628285163776494, "loss": 0.5104102706909179, "step": 8900 }, { "epoch": 3.4296724470134876, "eval_loss": 0.5798438787460327, "eval_runtime": 11.7968, "eval_samples_per_second": 1610.952, "eval_steps_per_second": 33.569, "step": 8900 }, { "epoch": 3.448940269749518, "grad_norm": 0.6542669534683228, "learning_rate": 0.00018620578034682082, "loss": 0.5063149642944336, "step": 8950 }, { "epoch": 3.448940269749518, "eval_loss": 0.5749019384384155, "eval_runtime": 11.7684, "eval_samples_per_second": 1614.827, "eval_steps_per_second": 33.649, "step": 8950 }, { "epoch": 3.468208092485549, "grad_norm": 0.5875283479690552, "learning_rate": 0.00018612870905587668, "loss": 0.5149761962890625, "step": 9000 }, { "epoch": 3.468208092485549, "eval_loss": 0.5722861289978027, "eval_runtime": 11.7774, "eval_samples_per_second": 1613.603, "eval_steps_per_second": 33.624, "step": 9000 }, { "epoch": 3.48747591522158, "grad_norm": 0.5560988783836365, "learning_rate": 0.00018605163776493256, "loss": 0.5096327209472656, "step": 9050 }, { "epoch": 3.48747591522158, "eval_loss": 0.5822066068649292, "eval_runtime": 11.7667, "eval_samples_per_second": 1615.067, "eval_steps_per_second": 33.654, "step": 9050 }, { "epoch": 3.5067437379576107, "grad_norm": 0.604392945766449, "learning_rate": 0.00018597456647398845, "loss": 0.5118858337402343, "step": 9100 }, { "epoch": 3.5067437379576107, "eval_loss": 0.5773624181747437, "eval_runtime": 11.7867, "eval_samples_per_second": 1612.325, "eval_steps_per_second": 33.597, "step": 9100 }, { "epoch": 3.5260115606936417, "grad_norm": 0.4979294538497925, "learning_rate": 0.00018589749518304433, "loss": 0.5043356323242187, "step": 9150 }, { "epoch": 3.5260115606936417, "eval_loss": 0.5731167793273926, "eval_runtime": 11.7679, "eval_samples_per_second": 1614.9, "eval_steps_per_second": 33.651, "step": 9150 }, { "epoch": 3.5452793834296723, "grad_norm": 0.49121415615081787, "learning_rate": 0.0001858204238921002, "loss": 0.5067110443115235, "step": 9200 }, { "epoch": 3.5452793834296723, "eval_loss": 0.5797601938247681, "eval_runtime": 11.7674, "eval_samples_per_second": 1614.974, "eval_steps_per_second": 33.652, "step": 9200 }, { "epoch": 3.5645472061657033, "grad_norm": 0.550812304019928, "learning_rate": 0.00018574335260115608, "loss": 0.5102814865112305, "step": 9250 }, { "epoch": 3.5645472061657033, "eval_loss": 0.5666872262954712, "eval_runtime": 11.7716, "eval_samples_per_second": 1614.394, "eval_steps_per_second": 33.64, "step": 9250 }, { "epoch": 3.583815028901734, "grad_norm": 0.49830424785614014, "learning_rate": 0.00018566628131021196, "loss": 0.508917350769043, "step": 9300 }, { "epoch": 3.583815028901734, "eval_loss": 0.5714557766914368, "eval_runtime": 11.7749, "eval_samples_per_second": 1613.938, "eval_steps_per_second": 33.631, "step": 9300 }, { "epoch": 3.603082851637765, "grad_norm": 0.5482215285301208, "learning_rate": 0.00018558921001926785, "loss": 0.5125805282592774, "step": 9350 }, { "epoch": 3.603082851637765, "eval_loss": 0.5661311149597168, "eval_runtime": 11.7711, "eval_samples_per_second": 1614.463, "eval_steps_per_second": 33.642, "step": 9350 }, { "epoch": 3.622350674373796, "grad_norm": 0.49189719557762146, "learning_rate": 0.0001855121387283237, "loss": 0.4995438766479492, "step": 9400 }, { "epoch": 3.622350674373796, "eval_loss": 0.5750783681869507, "eval_runtime": 11.7576, "eval_samples_per_second": 1616.322, "eval_steps_per_second": 33.68, "step": 9400 }, { "epoch": 3.6416184971098264, "grad_norm": 0.5633854269981384, "learning_rate": 0.0001854350674373796, "loss": 0.505713119506836, "step": 9450 }, { "epoch": 3.6416184971098264, "eval_loss": 0.5851807594299316, "eval_runtime": 11.8051, "eval_samples_per_second": 1609.815, "eval_steps_per_second": 33.545, "step": 9450 }, { "epoch": 3.6608863198458574, "grad_norm": 0.5046204328536987, "learning_rate": 0.00018535799614643545, "loss": 0.49628219604492185, "step": 9500 }, { "epoch": 3.6608863198458574, "eval_loss": 0.5881949067115784, "eval_runtime": 11.7563, "eval_samples_per_second": 1616.488, "eval_steps_per_second": 33.684, "step": 9500 }, { "epoch": 3.6801541425818884, "grad_norm": 0.48170000314712524, "learning_rate": 0.00018528092485549133, "loss": 0.5063363647460938, "step": 9550 }, { "epoch": 3.6801541425818884, "eval_loss": 0.5902872085571289, "eval_runtime": 11.7772, "eval_samples_per_second": 1613.629, "eval_steps_per_second": 33.624, "step": 9550 }, { "epoch": 3.699421965317919, "grad_norm": 0.4667477011680603, "learning_rate": 0.00018520385356454722, "loss": 0.502325553894043, "step": 9600 }, { "epoch": 3.699421965317919, "eval_loss": 0.5809522271156311, "eval_runtime": 11.7766, "eval_samples_per_second": 1613.709, "eval_steps_per_second": 33.626, "step": 9600 }, { "epoch": 3.71868978805395, "grad_norm": 0.5347242951393127, "learning_rate": 0.0001851267822736031, "loss": 0.49383689880371096, "step": 9650 }, { "epoch": 3.71868978805395, "eval_loss": 0.5626729726791382, "eval_runtime": 11.7677, "eval_samples_per_second": 1614.927, "eval_steps_per_second": 33.651, "step": 9650 }, { "epoch": 3.7379576107899806, "grad_norm": 0.4509577453136444, "learning_rate": 0.000185049710982659, "loss": 0.5000727462768555, "step": 9700 }, { "epoch": 3.7379576107899806, "eval_loss": 0.5731959342956543, "eval_runtime": 11.7851, "eval_samples_per_second": 1612.54, "eval_steps_per_second": 33.602, "step": 9700 }, { "epoch": 3.7572254335260116, "grad_norm": 0.5169110894203186, "learning_rate": 0.00018497263969171485, "loss": 0.4996648406982422, "step": 9750 }, { "epoch": 3.7572254335260116, "eval_loss": 0.5586010217666626, "eval_runtime": 11.7715, "eval_samples_per_second": 1614.411, "eval_steps_per_second": 33.641, "step": 9750 }, { "epoch": 3.776493256262042, "grad_norm": 0.4938749670982361, "learning_rate": 0.0001848955684007707, "loss": 0.5040319061279297, "step": 9800 }, { "epoch": 3.776493256262042, "eval_loss": 0.5517870783805847, "eval_runtime": 11.8456, "eval_samples_per_second": 1604.31, "eval_steps_per_second": 33.43, "step": 9800 }, { "epoch": 3.795761078998073, "grad_norm": 0.5559639930725098, "learning_rate": 0.0001848184971098266, "loss": 0.4950738525390625, "step": 9850 }, { "epoch": 3.795761078998073, "eval_loss": 0.5769611597061157, "eval_runtime": 11.7747, "eval_samples_per_second": 1613.968, "eval_steps_per_second": 33.631, "step": 9850 }, { "epoch": 3.815028901734104, "grad_norm": 0.5335586667060852, "learning_rate": 0.00018474142581888247, "loss": 0.49566734313964844, "step": 9900 }, { "epoch": 3.815028901734104, "eval_loss": 0.5707488059997559, "eval_runtime": 11.7782, "eval_samples_per_second": 1613.49, "eval_steps_per_second": 33.621, "step": 9900 }, { "epoch": 3.8342967244701347, "grad_norm": 0.5322848558425903, "learning_rate": 0.00018466435452793836, "loss": 0.4938945770263672, "step": 9950 }, { "epoch": 3.8342967244701347, "eval_loss": 0.5689759254455566, "eval_runtime": 11.7627, "eval_samples_per_second": 1615.618, "eval_steps_per_second": 33.666, "step": 9950 }, { "epoch": 3.8535645472061657, "grad_norm": 0.5416219234466553, "learning_rate": 0.00018458728323699424, "loss": 0.4981888961791992, "step": 10000 }, { "epoch": 3.8535645472061657, "eval_loss": 0.5690964460372925, "eval_runtime": 11.7825, "eval_samples_per_second": 1612.901, "eval_steps_per_second": 33.609, "step": 10000 }, { "epoch": 3.8728323699421967, "grad_norm": 0.5266232490539551, "learning_rate": 0.00018451021194605013, "loss": 0.4997486877441406, "step": 10050 }, { "epoch": 3.8728323699421967, "eval_loss": 0.5566293001174927, "eval_runtime": 11.7677, "eval_samples_per_second": 1614.928, "eval_steps_per_second": 33.651, "step": 10050 }, { "epoch": 3.8921001926782273, "grad_norm": 0.5355671048164368, "learning_rate": 0.00018443314065510599, "loss": 0.49376720428466797, "step": 10100 }, { "epoch": 3.8921001926782273, "eval_loss": 0.5607211589813232, "eval_runtime": 11.7729, "eval_samples_per_second": 1614.213, "eval_steps_per_second": 33.637, "step": 10100 }, { "epoch": 3.9113680154142583, "grad_norm": 0.5219500660896301, "learning_rate": 0.00018435606936416184, "loss": 0.48877037048339844, "step": 10150 }, { "epoch": 3.9113680154142583, "eval_loss": 0.5635939240455627, "eval_runtime": 11.7594, "eval_samples_per_second": 1616.073, "eval_steps_per_second": 33.675, "step": 10150 }, { "epoch": 3.9306358381502893, "grad_norm": 0.5074095726013184, "learning_rate": 0.00018427899807321773, "loss": 0.490816650390625, "step": 10200 }, { "epoch": 3.9306358381502893, "eval_loss": 0.5602849125862122, "eval_runtime": 11.7612, "eval_samples_per_second": 1615.817, "eval_steps_per_second": 33.67, "step": 10200 }, { "epoch": 3.94990366088632, "grad_norm": 0.4775502681732178, "learning_rate": 0.0001842019267822736, "loss": 0.4908727264404297, "step": 10250 }, { "epoch": 3.94990366088632, "eval_loss": 0.5666294097900391, "eval_runtime": 11.7725, "eval_samples_per_second": 1614.264, "eval_steps_per_second": 33.638, "step": 10250 }, { "epoch": 3.969171483622351, "grad_norm": 0.49250328540802, "learning_rate": 0.0001841248554913295, "loss": 0.49150981903076174, "step": 10300 }, { "epoch": 3.969171483622351, "eval_loss": 0.5518072247505188, "eval_runtime": 11.7679, "eval_samples_per_second": 1614.899, "eval_steps_per_second": 33.651, "step": 10300 }, { "epoch": 3.9884393063583814, "grad_norm": 0.4961948096752167, "learning_rate": 0.00018404778420038538, "loss": 0.49219341278076173, "step": 10350 }, { "epoch": 3.9884393063583814, "eval_loss": 0.5544256567955017, "eval_runtime": 11.7656, "eval_samples_per_second": 1615.223, "eval_steps_per_second": 33.658, "step": 10350 }, { "epoch": 4.007707129094412, "grad_norm": 0.49724331498146057, "learning_rate": 0.00018397071290944124, "loss": 0.49793853759765627, "step": 10400 }, { "epoch": 4.007707129094412, "eval_loss": 0.5688167810440063, "eval_runtime": 11.7702, "eval_samples_per_second": 1614.583, "eval_steps_per_second": 33.644, "step": 10400 }, { "epoch": 4.026974951830443, "grad_norm": 0.48397186398506165, "learning_rate": 0.00018389364161849713, "loss": 0.4914028549194336, "step": 10450 }, { "epoch": 4.026974951830443, "eval_loss": 0.5645285248756409, "eval_runtime": 11.8222, "eval_samples_per_second": 1607.48, "eval_steps_per_second": 33.496, "step": 10450 }, { "epoch": 4.046242774566474, "grad_norm": 0.5236652493476868, "learning_rate": 0.00018381657032755298, "loss": 0.48504051208496096, "step": 10500 }, { "epoch": 4.046242774566474, "eval_loss": 0.56333327293396, "eval_runtime": 11.7933, "eval_samples_per_second": 1611.429, "eval_steps_per_second": 33.579, "step": 10500 }, { "epoch": 4.065510597302505, "grad_norm": 0.5099443197250366, "learning_rate": 0.00018373949903660887, "loss": 0.48282081604003907, "step": 10550 }, { "epoch": 4.065510597302505, "eval_loss": 0.5540857911109924, "eval_runtime": 11.7764, "eval_samples_per_second": 1613.733, "eval_steps_per_second": 33.627, "step": 10550 }, { "epoch": 4.0847784200385355, "grad_norm": 0.518556535243988, "learning_rate": 0.00018366242774566475, "loss": 0.4893923187255859, "step": 10600 }, { "epoch": 4.0847784200385355, "eval_loss": 0.5649839639663696, "eval_runtime": 11.7708, "eval_samples_per_second": 1614.508, "eval_steps_per_second": 33.643, "step": 10600 }, { "epoch": 4.104046242774566, "grad_norm": 0.5623594522476196, "learning_rate": 0.00018358535645472064, "loss": 0.48425621032714844, "step": 10650 }, { "epoch": 4.104046242774566, "eval_loss": 0.5522623062133789, "eval_runtime": 11.7753, "eval_samples_per_second": 1613.887, "eval_steps_per_second": 33.63, "step": 10650 }, { "epoch": 4.1233140655105975, "grad_norm": 0.5469601154327393, "learning_rate": 0.0001835082851637765, "loss": 0.4846769332885742, "step": 10700 }, { "epoch": 4.1233140655105975, "eval_loss": 0.5505762100219727, "eval_runtime": 11.7847, "eval_samples_per_second": 1612.603, "eval_steps_per_second": 33.603, "step": 10700 }, { "epoch": 4.142581888246628, "grad_norm": 0.525133490562439, "learning_rate": 0.00018343121387283238, "loss": 0.486254997253418, "step": 10750 }, { "epoch": 4.142581888246628, "eval_loss": 0.5474607944488525, "eval_runtime": 11.7767, "eval_samples_per_second": 1613.69, "eval_steps_per_second": 33.626, "step": 10750 }, { "epoch": 4.161849710982659, "grad_norm": 0.4922912120819092, "learning_rate": 0.00018335414258188827, "loss": 0.4855150604248047, "step": 10800 }, { "epoch": 4.161849710982659, "eval_loss": 0.5565043687820435, "eval_runtime": 11.7742, "eval_samples_per_second": 1614.034, "eval_steps_per_second": 33.633, "step": 10800 }, { "epoch": 4.18111753371869, "grad_norm": 0.5698932409286499, "learning_rate": 0.00018327707129094412, "loss": 0.48812236785888674, "step": 10850 }, { "epoch": 4.18111753371869, "eval_loss": 0.549169659614563, "eval_runtime": 11.7848, "eval_samples_per_second": 1612.587, "eval_steps_per_second": 33.603, "step": 10850 }, { "epoch": 4.200385356454721, "grad_norm": 0.4862995147705078, "learning_rate": 0.0001832, "loss": 0.4878178024291992, "step": 10900 }, { "epoch": 4.200385356454721, "eval_loss": 0.5611778497695923, "eval_runtime": 11.781, "eval_samples_per_second": 1613.11, "eval_steps_per_second": 33.614, "step": 10900 }, { "epoch": 4.219653179190751, "grad_norm": 0.49477845430374146, "learning_rate": 0.0001831229287090559, "loss": 0.4816843414306641, "step": 10950 }, { "epoch": 4.219653179190751, "eval_loss": 0.5555013418197632, "eval_runtime": 11.7793, "eval_samples_per_second": 1613.342, "eval_steps_per_second": 33.618, "step": 10950 }, { "epoch": 4.238921001926783, "grad_norm": 0.4496716856956482, "learning_rate": 0.00018304585741811175, "loss": 0.48199901580810545, "step": 11000 }, { "epoch": 4.238921001926783, "eval_loss": 0.5633974075317383, "eval_runtime": 11.7778, "eval_samples_per_second": 1613.541, "eval_steps_per_second": 33.623, "step": 11000 }, { "epoch": 4.258188824662813, "grad_norm": 0.5063816905021667, "learning_rate": 0.00018296878612716764, "loss": 0.4759461212158203, "step": 11050 }, { "epoch": 4.258188824662813, "eval_loss": 0.5390656590461731, "eval_runtime": 11.7789, "eval_samples_per_second": 1613.39, "eval_steps_per_second": 33.619, "step": 11050 }, { "epoch": 4.277456647398844, "grad_norm": 0.47395262122154236, "learning_rate": 0.00018289171483622352, "loss": 0.4842206954956055, "step": 11100 }, { "epoch": 4.277456647398844, "eval_loss": 0.550040066242218, "eval_runtime": 11.7741, "eval_samples_per_second": 1614.057, "eval_steps_per_second": 33.633, "step": 11100 }, { "epoch": 4.296724470134874, "grad_norm": 0.454486608505249, "learning_rate": 0.0001828146435452794, "loss": 0.4751776123046875, "step": 11150 }, { "epoch": 4.296724470134874, "eval_loss": 0.5576136112213135, "eval_runtime": 11.7733, "eval_samples_per_second": 1614.162, "eval_steps_per_second": 33.635, "step": 11150 }, { "epoch": 4.315992292870906, "grad_norm": 0.5085299611091614, "learning_rate": 0.00018273757225433526, "loss": 0.47597221374511717, "step": 11200 }, { "epoch": 4.315992292870906, "eval_loss": 0.5474160313606262, "eval_runtime": 11.7804, "eval_samples_per_second": 1613.182, "eval_steps_per_second": 33.615, "step": 11200 }, { "epoch": 4.335260115606936, "grad_norm": 0.4349985420703888, "learning_rate": 0.00018266050096339115, "loss": 0.4813078308105469, "step": 11250 }, { "epoch": 4.335260115606936, "eval_loss": 0.5608411431312561, "eval_runtime": 11.7832, "eval_samples_per_second": 1612.81, "eval_steps_per_second": 33.607, "step": 11250 }, { "epoch": 4.354527938342967, "grad_norm": 0.5031872987747192, "learning_rate": 0.000182583429672447, "loss": 0.47858657836914065, "step": 11300 }, { "epoch": 4.354527938342967, "eval_loss": 0.5572826862335205, "eval_runtime": 11.7927, "eval_samples_per_second": 1611.505, "eval_steps_per_second": 33.58, "step": 11300 }, { "epoch": 4.373795761078998, "grad_norm": 0.5024815201759338, "learning_rate": 0.0001825063583815029, "loss": 0.48066383361816406, "step": 11350 }, { "epoch": 4.373795761078998, "eval_loss": 0.5512340664863586, "eval_runtime": 11.7796, "eval_samples_per_second": 1613.292, "eval_steps_per_second": 33.617, "step": 11350 }, { "epoch": 4.393063583815029, "grad_norm": 0.5470072031021118, "learning_rate": 0.00018242928709055878, "loss": 0.47411796569824216, "step": 11400 }, { "epoch": 4.393063583815029, "eval_loss": 0.5593649744987488, "eval_runtime": 11.7845, "eval_samples_per_second": 1612.629, "eval_steps_per_second": 33.604, "step": 11400 }, { "epoch": 4.4123314065510595, "grad_norm": 0.4830038845539093, "learning_rate": 0.00018235221579961466, "loss": 0.48128753662109375, "step": 11450 }, { "epoch": 4.4123314065510595, "eval_loss": 0.5370304584503174, "eval_runtime": 11.7886, "eval_samples_per_second": 1612.06, "eval_steps_per_second": 33.592, "step": 11450 }, { "epoch": 4.431599229287091, "grad_norm": 0.4701225757598877, "learning_rate": 0.00018227514450867055, "loss": 0.47607994079589844, "step": 11500 }, { "epoch": 4.431599229287091, "eval_loss": 0.5497246384620667, "eval_runtime": 11.7785, "eval_samples_per_second": 1613.445, "eval_steps_per_second": 33.621, "step": 11500 }, { "epoch": 4.4508670520231215, "grad_norm": 0.5773839354515076, "learning_rate": 0.0001821980732177264, "loss": 0.47957725524902345, "step": 11550 }, { "epoch": 4.4508670520231215, "eval_loss": 0.5329610705375671, "eval_runtime": 11.785, "eval_samples_per_second": 1612.554, "eval_steps_per_second": 33.602, "step": 11550 }, { "epoch": 4.470134874759152, "grad_norm": 0.46770432591438293, "learning_rate": 0.0001821210019267823, "loss": 0.47700042724609376, "step": 11600 }, { "epoch": 4.470134874759152, "eval_loss": 0.5446674823760986, "eval_runtime": 11.8488, "eval_samples_per_second": 1603.877, "eval_steps_per_second": 33.421, "step": 11600 }, { "epoch": 4.4894026974951835, "grad_norm": 0.47263628244400024, "learning_rate": 0.00018204393063583815, "loss": 0.4798468780517578, "step": 11650 }, { "epoch": 4.4894026974951835, "eval_loss": 0.5371646285057068, "eval_runtime": 11.7989, "eval_samples_per_second": 1610.66, "eval_steps_per_second": 33.562, "step": 11650 }, { "epoch": 4.508670520231214, "grad_norm": 0.5018479228019714, "learning_rate": 0.00018196685934489403, "loss": 0.47104705810546876, "step": 11700 }, { "epoch": 4.508670520231214, "eval_loss": 0.5411204695701599, "eval_runtime": 11.7826, "eval_samples_per_second": 1612.885, "eval_steps_per_second": 33.609, "step": 11700 }, { "epoch": 4.527938342967245, "grad_norm": 0.5016299486160278, "learning_rate": 0.00018188978805394992, "loss": 0.4786482238769531, "step": 11750 }, { "epoch": 4.527938342967245, "eval_loss": 0.5467473268508911, "eval_runtime": 11.7872, "eval_samples_per_second": 1612.259, "eval_steps_per_second": 33.596, "step": 11750 }, { "epoch": 4.547206165703275, "grad_norm": 0.522402286529541, "learning_rate": 0.0001818127167630058, "loss": 0.4739419174194336, "step": 11800 }, { "epoch": 4.547206165703275, "eval_loss": 0.5275012850761414, "eval_runtime": 11.8018, "eval_samples_per_second": 1610.257, "eval_steps_per_second": 33.554, "step": 11800 }, { "epoch": 4.566473988439307, "grad_norm": 0.6756328344345093, "learning_rate": 0.00018173564547206166, "loss": 0.47961132049560545, "step": 11850 }, { "epoch": 4.566473988439307, "eval_loss": 0.5474854111671448, "eval_runtime": 11.7873, "eval_samples_per_second": 1612.241, "eval_steps_per_second": 33.595, "step": 11850 }, { "epoch": 4.585741811175337, "grad_norm": 0.4391210079193115, "learning_rate": 0.00018165857418111754, "loss": 0.4737638854980469, "step": 11900 }, { "epoch": 4.585741811175337, "eval_loss": 0.5443593859672546, "eval_runtime": 11.7792, "eval_samples_per_second": 1613.354, "eval_steps_per_second": 33.619, "step": 11900 }, { "epoch": 4.605009633911368, "grad_norm": 0.45925694704055786, "learning_rate": 0.00018158150289017343, "loss": 0.4764963531494141, "step": 11950 }, { "epoch": 4.605009633911368, "eval_loss": 0.5400537848472595, "eval_runtime": 11.779, "eval_samples_per_second": 1613.377, "eval_steps_per_second": 33.619, "step": 11950 }, { "epoch": 4.624277456647399, "grad_norm": 0.4929813742637634, "learning_rate": 0.00018150443159922929, "loss": 0.46989219665527343, "step": 12000 }, { "epoch": 4.624277456647399, "eval_loss": 0.5521953105926514, "eval_runtime": 11.7714, "eval_samples_per_second": 1614.416, "eval_steps_per_second": 33.641, "step": 12000 }, { "epoch": 4.64354527938343, "grad_norm": 0.5123869180679321, "learning_rate": 0.00018142736030828517, "loss": 0.4736211395263672, "step": 12050 }, { "epoch": 4.64354527938343, "eval_loss": 0.5476213693618774, "eval_runtime": 11.7934, "eval_samples_per_second": 1611.406, "eval_steps_per_second": 33.578, "step": 12050 }, { "epoch": 4.66281310211946, "grad_norm": 0.5603312849998474, "learning_rate": 0.00018135028901734106, "loss": 0.47359893798828123, "step": 12100 }, { "epoch": 4.66281310211946, "eval_loss": 0.541972815990448, "eval_runtime": 11.7751, "eval_samples_per_second": 1613.914, "eval_steps_per_second": 33.63, "step": 12100 }, { "epoch": 4.682080924855491, "grad_norm": 0.5383356213569641, "learning_rate": 0.0001812732177263969, "loss": 0.46968029022216795, "step": 12150 }, { "epoch": 4.682080924855491, "eval_loss": 0.531944990158081, "eval_runtime": 11.7758, "eval_samples_per_second": 1613.825, "eval_steps_per_second": 33.628, "step": 12150 }, { "epoch": 4.701348747591522, "grad_norm": 0.4655359387397766, "learning_rate": 0.0001811961464354528, "loss": 0.47204586029052736, "step": 12200 }, { "epoch": 4.701348747591522, "eval_loss": 0.5414470434188843, "eval_runtime": 11.7745, "eval_samples_per_second": 1613.996, "eval_steps_per_second": 33.632, "step": 12200 }, { "epoch": 4.720616570327553, "grad_norm": 0.43928924202919006, "learning_rate": 0.00018111907514450868, "loss": 0.470347900390625, "step": 12250 }, { "epoch": 4.720616570327553, "eval_loss": 0.5526337027549744, "eval_runtime": 11.8444, "eval_samples_per_second": 1604.468, "eval_steps_per_second": 33.433, "step": 12250 }, { "epoch": 4.7398843930635834, "grad_norm": 0.5074217319488525, "learning_rate": 0.00018104200385356457, "loss": 0.4694701766967773, "step": 12300 }, { "epoch": 4.7398843930635834, "eval_loss": 0.5342041850090027, "eval_runtime": 11.7957, "eval_samples_per_second": 1611.099, "eval_steps_per_second": 33.572, "step": 12300 }, { "epoch": 4.759152215799615, "grad_norm": 0.4563540518283844, "learning_rate": 0.00018096493256262043, "loss": 0.47026519775390624, "step": 12350 }, { "epoch": 4.759152215799615, "eval_loss": 0.5437130331993103, "eval_runtime": 11.7833, "eval_samples_per_second": 1612.789, "eval_steps_per_second": 33.607, "step": 12350 }, { "epoch": 4.7784200385356455, "grad_norm": 0.44174790382385254, "learning_rate": 0.0001808878612716763, "loss": 0.4680271530151367, "step": 12400 }, { "epoch": 4.7784200385356455, "eval_loss": 0.537128746509552, "eval_runtime": 11.7783, "eval_samples_per_second": 1613.479, "eval_steps_per_second": 33.621, "step": 12400 }, { "epoch": 4.797687861271676, "grad_norm": 0.47368425130844116, "learning_rate": 0.00018081078998073217, "loss": 0.46962554931640627, "step": 12450 }, { "epoch": 4.797687861271676, "eval_loss": 0.536321759223938, "eval_runtime": 11.788, "eval_samples_per_second": 1612.152, "eval_steps_per_second": 33.594, "step": 12450 }, { "epoch": 4.8169556840077075, "grad_norm": 0.5200686454772949, "learning_rate": 0.00018073371868978805, "loss": 0.47060340881347656, "step": 12500 }, { "epoch": 4.8169556840077075, "eval_loss": 0.5413572192192078, "eval_runtime": 11.7816, "eval_samples_per_second": 1613.029, "eval_steps_per_second": 33.612, "step": 12500 }, { "epoch": 4.836223506743738, "grad_norm": 0.5366744995117188, "learning_rate": 0.00018065664739884394, "loss": 0.4683336639404297, "step": 12550 }, { "epoch": 4.836223506743738, "eval_loss": 0.5394500494003296, "eval_runtime": 11.7846, "eval_samples_per_second": 1612.619, "eval_steps_per_second": 33.603, "step": 12550 }, { "epoch": 4.855491329479769, "grad_norm": 0.48591041564941406, "learning_rate": 0.00018057957610789982, "loss": 0.46747753143310544, "step": 12600 }, { "epoch": 4.855491329479769, "eval_loss": 0.5285096168518066, "eval_runtime": 11.789, "eval_samples_per_second": 1612.015, "eval_steps_per_second": 33.591, "step": 12600 }, { "epoch": 4.8747591522158, "grad_norm": 0.48317044973373413, "learning_rate": 0.0001805025048169557, "loss": 0.4632664108276367, "step": 12650 }, { "epoch": 4.8747591522158, "eval_loss": 0.5367640256881714, "eval_runtime": 11.7848, "eval_samples_per_second": 1612.59, "eval_steps_per_second": 33.603, "step": 12650 }, { "epoch": 4.894026974951831, "grad_norm": 0.4904310703277588, "learning_rate": 0.00018042543352601157, "loss": 0.46919902801513674, "step": 12700 }, { "epoch": 4.894026974951831, "eval_loss": 0.541573703289032, "eval_runtime": 11.7748, "eval_samples_per_second": 1613.961, "eval_steps_per_second": 33.631, "step": 12700 }, { "epoch": 4.913294797687861, "grad_norm": 0.4833162724971771, "learning_rate": 0.00018034836223506742, "loss": 0.4701871871948242, "step": 12750 }, { "epoch": 4.913294797687861, "eval_loss": 0.54112708568573, "eval_runtime": 11.7829, "eval_samples_per_second": 1612.841, "eval_steps_per_second": 33.608, "step": 12750 }, { "epoch": 4.932562620423893, "grad_norm": 0.6152871251106262, "learning_rate": 0.0001802712909441233, "loss": 0.4714957809448242, "step": 12800 }, { "epoch": 4.932562620423893, "eval_loss": 0.5322556495666504, "eval_runtime": 11.7798, "eval_samples_per_second": 1613.276, "eval_steps_per_second": 33.617, "step": 12800 }, { "epoch": 4.951830443159923, "grad_norm": 0.4843738377094269, "learning_rate": 0.0001801942196531792, "loss": 0.46891883850097654, "step": 12850 }, { "epoch": 4.951830443159923, "eval_loss": 0.5379421710968018, "eval_runtime": 11.7897, "eval_samples_per_second": 1611.912, "eval_steps_per_second": 33.589, "step": 12850 }, { "epoch": 4.971098265895954, "grad_norm": 0.4622536301612854, "learning_rate": 0.00018011714836223508, "loss": 0.4611796188354492, "step": 12900 }, { "epoch": 4.971098265895954, "eval_loss": 0.5362759232521057, "eval_runtime": 11.7786, "eval_samples_per_second": 1613.432, "eval_steps_per_second": 33.62, "step": 12900 }, { "epoch": 4.990366088631984, "grad_norm": 0.461800754070282, "learning_rate": 0.00018004007707129096, "loss": 0.46757354736328127, "step": 12950 }, { "epoch": 4.990366088631984, "eval_loss": 0.5320266485214233, "eval_runtime": 11.7814, "eval_samples_per_second": 1613.051, "eval_steps_per_second": 33.612, "step": 12950 }, { "epoch": 5.009633911368016, "grad_norm": 0.5559517741203308, "learning_rate": 0.00017996300578034685, "loss": 0.4632146453857422, "step": 13000 }, { "epoch": 5.009633911368016, "eval_loss": 0.5373769998550415, "eval_runtime": 11.7827, "eval_samples_per_second": 1612.867, "eval_steps_per_second": 33.608, "step": 13000 }, { "epoch": 5.028901734104046, "grad_norm": 0.4611318111419678, "learning_rate": 0.0001798859344894027, "loss": 0.4622968673706055, "step": 13050 }, { "epoch": 5.028901734104046, "eval_loss": 0.5423731803894043, "eval_runtime": 11.7969, "eval_samples_per_second": 1610.929, "eval_steps_per_second": 33.568, "step": 13050 }, { "epoch": 5.048169556840077, "grad_norm": 0.48244401812553406, "learning_rate": 0.00017980886319845856, "loss": 0.4571623992919922, "step": 13100 }, { "epoch": 5.048169556840077, "eval_loss": 0.5320508480072021, "eval_runtime": 11.7787, "eval_samples_per_second": 1613.427, "eval_steps_per_second": 33.62, "step": 13100 }, { "epoch": 5.067437379576108, "grad_norm": 0.5404294729232788, "learning_rate": 0.00017973179190751445, "loss": 0.46228607177734377, "step": 13150 }, { "epoch": 5.067437379576108, "eval_loss": 0.5289859771728516, "eval_runtime": 11.7758, "eval_samples_per_second": 1613.825, "eval_steps_per_second": 33.628, "step": 13150 }, { "epoch": 5.086705202312139, "grad_norm": 0.4911145269870758, "learning_rate": 0.00017965472061657033, "loss": 0.46021984100341795, "step": 13200 }, { "epoch": 5.086705202312139, "eval_loss": 0.5308319926261902, "eval_runtime": 11.7758, "eval_samples_per_second": 1613.817, "eval_steps_per_second": 33.628, "step": 13200 }, { "epoch": 5.105973025048169, "grad_norm": 0.47378796339035034, "learning_rate": 0.00017957764932562622, "loss": 0.4652442169189453, "step": 13250 }, { "epoch": 5.105973025048169, "eval_loss": 0.5334806442260742, "eval_runtime": 11.7872, "eval_samples_per_second": 1612.252, "eval_steps_per_second": 33.596, "step": 13250 }, { "epoch": 5.1252408477842, "grad_norm": 0.44428208470344543, "learning_rate": 0.0001795005780346821, "loss": 0.4665904235839844, "step": 13300 }, { "epoch": 5.1252408477842, "eval_loss": 0.5256557464599609, "eval_runtime": 11.8383, "eval_samples_per_second": 1605.297, "eval_steps_per_second": 33.451, "step": 13300 }, { "epoch": 5.144508670520231, "grad_norm": 0.4868975281715393, "learning_rate": 0.00017942350674373796, "loss": 0.46156261444091795, "step": 13350 }, { "epoch": 5.144508670520231, "eval_loss": 0.5276922583580017, "eval_runtime": 11.7794, "eval_samples_per_second": 1613.332, "eval_steps_per_second": 33.618, "step": 13350 }, { "epoch": 5.163776493256262, "grad_norm": 0.4647890031337738, "learning_rate": 0.00017934643545279385, "loss": 0.45836334228515624, "step": 13400 }, { "epoch": 5.163776493256262, "eval_loss": 0.5345872640609741, "eval_runtime": 11.7973, "eval_samples_per_second": 1610.883, "eval_steps_per_second": 33.567, "step": 13400 }, { "epoch": 5.1830443159922925, "grad_norm": 0.4934742748737335, "learning_rate": 0.00017926936416184973, "loss": 0.4629267501831055, "step": 13450 }, { "epoch": 5.1830443159922925, "eval_loss": 0.5172365307807922, "eval_runtime": 11.7879, "eval_samples_per_second": 1612.158, "eval_steps_per_second": 33.594, "step": 13450 }, { "epoch": 5.202312138728324, "grad_norm": 0.5718507170677185, "learning_rate": 0.0001791922928709056, "loss": 0.4567113876342773, "step": 13500 }, { "epoch": 5.202312138728324, "eval_loss": 0.5191019773483276, "eval_runtime": 11.792, "eval_samples_per_second": 1611.597, "eval_steps_per_second": 33.582, "step": 13500 }, { "epoch": 5.2215799614643545, "grad_norm": 0.5254505276679993, "learning_rate": 0.00017911522157996147, "loss": 0.45942543029785154, "step": 13550 }, { "epoch": 5.2215799614643545, "eval_loss": 0.5296939611434937, "eval_runtime": 11.8009, "eval_samples_per_second": 1610.386, "eval_steps_per_second": 33.557, "step": 13550 }, { "epoch": 5.240847784200385, "grad_norm": 0.4767918884754181, "learning_rate": 0.00017903815028901736, "loss": 0.46776790618896485, "step": 13600 }, { "epoch": 5.240847784200385, "eval_loss": 0.524659276008606, "eval_runtime": 11.7792, "eval_samples_per_second": 1613.353, "eval_steps_per_second": 33.619, "step": 13600 }, { "epoch": 5.2601156069364166, "grad_norm": 0.43644294142723083, "learning_rate": 0.00017896107899807322, "loss": 0.45473384857177734, "step": 13650 }, { "epoch": 5.2601156069364166, "eval_loss": 0.5195856094360352, "eval_runtime": 11.7738, "eval_samples_per_second": 1614.094, "eval_steps_per_second": 33.634, "step": 13650 }, { "epoch": 5.279383429672447, "grad_norm": 0.46350541710853577, "learning_rate": 0.0001788840077071291, "loss": 0.4517910385131836, "step": 13700 }, { "epoch": 5.279383429672447, "eval_loss": 0.5220227837562561, "eval_runtime": 11.7922, "eval_samples_per_second": 1611.579, "eval_steps_per_second": 33.582, "step": 13700 }, { "epoch": 5.298651252408478, "grad_norm": 0.5262802839279175, "learning_rate": 0.00017880693641618499, "loss": 0.4557621383666992, "step": 13750 }, { "epoch": 5.298651252408478, "eval_loss": 0.5168203711509705, "eval_runtime": 11.7903, "eval_samples_per_second": 1611.835, "eval_steps_per_second": 33.587, "step": 13750 }, { "epoch": 5.317919075144509, "grad_norm": 0.4817775785923004, "learning_rate": 0.00017872986512524087, "loss": 0.46035064697265626, "step": 13800 }, { "epoch": 5.317919075144509, "eval_loss": 0.5325997471809387, "eval_runtime": 11.7721, "eval_samples_per_second": 1614.32, "eval_steps_per_second": 33.639, "step": 13800 }, { "epoch": 5.33718689788054, "grad_norm": 0.5007907748222351, "learning_rate": 0.00017865279383429673, "loss": 0.4616971969604492, "step": 13850 }, { "epoch": 5.33718689788054, "eval_loss": 0.5227739214897156, "eval_runtime": 11.7876, "eval_samples_per_second": 1612.206, "eval_steps_per_second": 33.595, "step": 13850 }, { "epoch": 5.35645472061657, "grad_norm": 0.5254023671150208, "learning_rate": 0.0001785757225433526, "loss": 0.4569298934936523, "step": 13900 }, { "epoch": 5.35645472061657, "eval_loss": 0.5219462513923645, "eval_runtime": 11.7925, "eval_samples_per_second": 1611.538, "eval_steps_per_second": 33.581, "step": 13900 }, { "epoch": 5.375722543352601, "grad_norm": 0.499540776014328, "learning_rate": 0.00017849865125240847, "loss": 0.45092124938964845, "step": 13950 }, { "epoch": 5.375722543352601, "eval_loss": 0.5230153203010559, "eval_runtime": 11.7818, "eval_samples_per_second": 1612.998, "eval_steps_per_second": 33.611, "step": 13950 }, { "epoch": 5.394990366088632, "grad_norm": 0.5098601579666138, "learning_rate": 0.00017842157996146436, "loss": 0.4566230773925781, "step": 14000 }, { "epoch": 5.394990366088632, "eval_loss": 0.520475447177887, "eval_runtime": 11.7919, "eval_samples_per_second": 1611.621, "eval_steps_per_second": 33.583, "step": 14000 }, { "epoch": 5.414258188824663, "grad_norm": 0.44043639302253723, "learning_rate": 0.00017834450867052024, "loss": 0.4553034591674805, "step": 14050 }, { "epoch": 5.414258188824663, "eval_loss": 0.5326269865036011, "eval_runtime": 11.793, "eval_samples_per_second": 1611.47, "eval_steps_per_second": 33.579, "step": 14050 }, { "epoch": 5.433526011560693, "grad_norm": 0.5455739498138428, "learning_rate": 0.00017826743737957613, "loss": 0.44902801513671875, "step": 14100 }, { "epoch": 5.433526011560693, "eval_loss": 0.5234522819519043, "eval_runtime": 11.787, "eval_samples_per_second": 1612.287, "eval_steps_per_second": 33.596, "step": 14100 }, { "epoch": 5.452793834296725, "grad_norm": 0.4797365665435791, "learning_rate": 0.000178190366088632, "loss": 0.45086681365966796, "step": 14150 }, { "epoch": 5.452793834296725, "eval_loss": 0.5213855504989624, "eval_runtime": 11.7897, "eval_samples_per_second": 1611.917, "eval_steps_per_second": 33.589, "step": 14150 }, { "epoch": 5.472061657032755, "grad_norm": 0.5314825773239136, "learning_rate": 0.00017811329479768787, "loss": 0.45931285858154297, "step": 14200 }, { "epoch": 5.472061657032755, "eval_loss": 0.5295806527137756, "eval_runtime": 11.789, "eval_samples_per_second": 1612.009, "eval_steps_per_second": 33.591, "step": 14200 }, { "epoch": 5.491329479768786, "grad_norm": 0.5515539646148682, "learning_rate": 0.00017803622350674373, "loss": 0.4547503662109375, "step": 14250 }, { "epoch": 5.491329479768786, "eval_loss": 0.5280287861824036, "eval_runtime": 11.7845, "eval_samples_per_second": 1612.624, "eval_steps_per_second": 33.603, "step": 14250 }, { "epoch": 5.5105973025048165, "grad_norm": 0.4367469847202301, "learning_rate": 0.0001779591522157996, "loss": 0.4542605209350586, "step": 14300 }, { "epoch": 5.5105973025048165, "eval_loss": 0.5214731693267822, "eval_runtime": 11.7832, "eval_samples_per_second": 1612.811, "eval_steps_per_second": 33.607, "step": 14300 }, { "epoch": 5.529865125240848, "grad_norm": 0.47857388854026794, "learning_rate": 0.0001778820809248555, "loss": 0.4491832733154297, "step": 14350 }, { "epoch": 5.529865125240848, "eval_loss": 0.5128691792488098, "eval_runtime": 11.8, "eval_samples_per_second": 1610.508, "eval_steps_per_second": 33.559, "step": 14350 }, { "epoch": 5.5491329479768785, "grad_norm": 0.5030723810195923, "learning_rate": 0.00017780500963391138, "loss": 0.4574004364013672, "step": 14400 }, { "epoch": 5.5491329479768785, "eval_loss": 0.531676709651947, "eval_runtime": 11.8261, "eval_samples_per_second": 1606.959, "eval_steps_per_second": 33.485, "step": 14400 }, { "epoch": 5.568400770712909, "grad_norm": 0.43092888593673706, "learning_rate": 0.00017772793834296727, "loss": 0.45830497741699217, "step": 14450 }, { "epoch": 5.568400770712909, "eval_loss": 0.5114962458610535, "eval_runtime": 11.7954, "eval_samples_per_second": 1611.132, "eval_steps_per_second": 33.572, "step": 14450 }, { "epoch": 5.5876685934489405, "grad_norm": 0.5198416113853455, "learning_rate": 0.00017765086705202315, "loss": 0.4549860382080078, "step": 14500 }, { "epoch": 5.5876685934489405, "eval_loss": 0.5147572755813599, "eval_runtime": 11.7789, "eval_samples_per_second": 1613.398, "eval_steps_per_second": 33.62, "step": 14500 }, { "epoch": 5.606936416184971, "grad_norm": 0.568012535572052, "learning_rate": 0.000177573795761079, "loss": 0.4500347900390625, "step": 14550 }, { "epoch": 5.606936416184971, "eval_loss": 0.5260612964630127, "eval_runtime": 11.8502, "eval_samples_per_second": 1603.682, "eval_steps_per_second": 33.417, "step": 14550 }, { "epoch": 5.626204238921002, "grad_norm": 0.461184561252594, "learning_rate": 0.00017749672447013487, "loss": 0.45430999755859375, "step": 14600 }, { "epoch": 5.626204238921002, "eval_loss": 0.51976478099823, "eval_runtime": 11.8102, "eval_samples_per_second": 1609.121, "eval_steps_per_second": 33.53, "step": 14600 }, { "epoch": 5.645472061657033, "grad_norm": 0.49724453687667847, "learning_rate": 0.00017741965317919075, "loss": 0.45937633514404297, "step": 14650 }, { "epoch": 5.645472061657033, "eval_loss": 0.5161289572715759, "eval_runtime": 11.7798, "eval_samples_per_second": 1613.265, "eval_steps_per_second": 33.617, "step": 14650 }, { "epoch": 5.664739884393064, "grad_norm": 0.5009393095970154, "learning_rate": 0.00017734258188824664, "loss": 0.448577880859375, "step": 14700 }, { "epoch": 5.664739884393064, "eval_loss": 0.5138633847236633, "eval_runtime": 11.7866, "eval_samples_per_second": 1612.334, "eval_steps_per_second": 33.597, "step": 14700 }, { "epoch": 5.684007707129094, "grad_norm": 0.5108370780944824, "learning_rate": 0.00017726551059730252, "loss": 0.45879791259765623, "step": 14750 }, { "epoch": 5.684007707129094, "eval_loss": 0.5129594206809998, "eval_runtime": 11.7913, "eval_samples_per_second": 1611.691, "eval_steps_per_second": 33.584, "step": 14750 }, { "epoch": 5.703275529865126, "grad_norm": 0.4837997853755951, "learning_rate": 0.0001771884393063584, "loss": 0.4535396957397461, "step": 14800 }, { "epoch": 5.703275529865126, "eval_loss": 0.52413409948349, "eval_runtime": 11.7986, "eval_samples_per_second": 1610.705, "eval_steps_per_second": 33.563, "step": 14800 }, { "epoch": 5.722543352601156, "grad_norm": 0.44351962208747864, "learning_rate": 0.00017711136801541426, "loss": 0.446834716796875, "step": 14850 }, { "epoch": 5.722543352601156, "eval_loss": 0.5157108306884766, "eval_runtime": 11.7919, "eval_samples_per_second": 1611.609, "eval_steps_per_second": 33.582, "step": 14850 }, { "epoch": 5.741811175337187, "grad_norm": 0.47284823656082153, "learning_rate": 0.00017703429672447015, "loss": 0.4558345031738281, "step": 14900 }, { "epoch": 5.741811175337187, "eval_loss": 0.5183041095733643, "eval_runtime": 11.7926, "eval_samples_per_second": 1611.513, "eval_steps_per_second": 33.58, "step": 14900 }, { "epoch": 5.761078998073218, "grad_norm": 0.4867861866950989, "learning_rate": 0.000176957225433526, "loss": 0.45276153564453125, "step": 14950 }, { "epoch": 5.761078998073218, "eval_loss": 0.5191811323165894, "eval_runtime": 11.7879, "eval_samples_per_second": 1612.158, "eval_steps_per_second": 33.594, "step": 14950 }, { "epoch": 5.780346820809249, "grad_norm": 0.4188956618309021, "learning_rate": 0.0001768801541425819, "loss": 0.44755260467529295, "step": 15000 }, { "epoch": 5.780346820809249, "eval_loss": 0.512732744216919, "eval_runtime": 11.7886, "eval_samples_per_second": 1612.062, "eval_steps_per_second": 33.592, "step": 15000 }, { "epoch": 5.799614643545279, "grad_norm": 0.4467390775680542, "learning_rate": 0.00017680308285163778, "loss": 0.45075206756591796, "step": 15050 }, { "epoch": 5.799614643545279, "eval_loss": 0.5155383348464966, "eval_runtime": 11.781, "eval_samples_per_second": 1613.112, "eval_steps_per_second": 33.614, "step": 15050 }, { "epoch": 5.81888246628131, "grad_norm": 0.443208783864975, "learning_rate": 0.00017672601156069366, "loss": 0.45153816223144533, "step": 15100 }, { "epoch": 5.81888246628131, "eval_loss": 0.5170373916625977, "eval_runtime": 11.7984, "eval_samples_per_second": 1610.73, "eval_steps_per_second": 33.564, "step": 15100 }, { "epoch": 5.838150289017341, "grad_norm": 0.5011909604072571, "learning_rate": 0.00017664894026974952, "loss": 0.4535177612304688, "step": 15150 }, { "epoch": 5.838150289017341, "eval_loss": 0.5271491408348083, "eval_runtime": 11.8846, "eval_samples_per_second": 1599.044, "eval_steps_per_second": 33.32, "step": 15150 }, { "epoch": 5.857418111753372, "grad_norm": 0.4987393319606781, "learning_rate": 0.0001765718689788054, "loss": 0.440333251953125, "step": 15200 }, { "epoch": 5.857418111753372, "eval_loss": 0.5251661539077759, "eval_runtime": 11.8035, "eval_samples_per_second": 1610.035, "eval_steps_per_second": 33.549, "step": 15200 }, { "epoch": 5.8766859344894025, "grad_norm": 0.5387109518051147, "learning_rate": 0.0001764947976878613, "loss": 0.4489044189453125, "step": 15250 }, { "epoch": 5.8766859344894025, "eval_loss": 0.521905779838562, "eval_runtime": 11.8134, "eval_samples_per_second": 1608.683, "eval_steps_per_second": 33.521, "step": 15250 }, { "epoch": 5.895953757225434, "grad_norm": 0.4548056721687317, "learning_rate": 0.00017641772639691717, "loss": 0.4410498046875, "step": 15300 }, { "epoch": 5.895953757225434, "eval_loss": 0.5245518684387207, "eval_runtime": 11.7988, "eval_samples_per_second": 1610.677, "eval_steps_per_second": 33.563, "step": 15300 }, { "epoch": 5.9152215799614645, "grad_norm": 0.4605467915534973, "learning_rate": 0.00017634065510597303, "loss": 0.4502186584472656, "step": 15350 }, { "epoch": 5.9152215799614645, "eval_loss": 0.521344780921936, "eval_runtime": 11.7962, "eval_samples_per_second": 1611.027, "eval_steps_per_second": 33.57, "step": 15350 }, { "epoch": 5.934489402697495, "grad_norm": 0.48502975702285767, "learning_rate": 0.00017626358381502892, "loss": 0.45240081787109376, "step": 15400 }, { "epoch": 5.934489402697495, "eval_loss": 0.5151711702346802, "eval_runtime": 11.782, "eval_samples_per_second": 1612.971, "eval_steps_per_second": 33.611, "step": 15400 }, { "epoch": 5.953757225433526, "grad_norm": 0.42830032110214233, "learning_rate": 0.00017618651252408477, "loss": 0.4440047454833984, "step": 15450 }, { "epoch": 5.953757225433526, "eval_loss": 0.5096163153648376, "eval_runtime": 11.7892, "eval_samples_per_second": 1611.985, "eval_steps_per_second": 33.59, "step": 15450 }, { "epoch": 5.973025048169557, "grad_norm": 0.44719401001930237, "learning_rate": 0.00017610944123314066, "loss": 0.44931999206542966, "step": 15500 }, { "epoch": 5.973025048169557, "eval_loss": 0.5109618902206421, "eval_runtime": 11.8547, "eval_samples_per_second": 1603.071, "eval_steps_per_second": 33.404, "step": 15500 }, { "epoch": 5.992292870905588, "grad_norm": 0.45736679434776306, "learning_rate": 0.00017603236994219654, "loss": 0.448182487487793, "step": 15550 }, { "epoch": 5.992292870905588, "eval_loss": 0.5136154294013977, "eval_runtime": 11.7892, "eval_samples_per_second": 1611.979, "eval_steps_per_second": 33.59, "step": 15550 }, { "epoch": 6.011560693641618, "grad_norm": 0.4975254237651825, "learning_rate": 0.00017595529865125243, "loss": 0.44667705535888674, "step": 15600 }, { "epoch": 6.011560693641618, "eval_loss": 0.5186009407043457, "eval_runtime": 11.8008, "eval_samples_per_second": 1610.399, "eval_steps_per_second": 33.557, "step": 15600 }, { "epoch": 6.03082851637765, "grad_norm": 0.46463543176651, "learning_rate": 0.00017587822736030831, "loss": 0.44347923278808593, "step": 15650 }, { "epoch": 6.03082851637765, "eval_loss": 0.50421541929245, "eval_runtime": 11.7914, "eval_samples_per_second": 1611.678, "eval_steps_per_second": 33.584, "step": 15650 }, { "epoch": 6.05009633911368, "grad_norm": 0.4851040542125702, "learning_rate": 0.00017580115606936417, "loss": 0.4479880905151367, "step": 15700 }, { "epoch": 6.05009633911368, "eval_loss": 0.530933141708374, "eval_runtime": 11.7955, "eval_samples_per_second": 1611.117, "eval_steps_per_second": 33.572, "step": 15700 }, { "epoch": 6.069364161849711, "grad_norm": 0.5149499177932739, "learning_rate": 0.00017572408477842003, "loss": 0.44879051208496096, "step": 15750 }, { "epoch": 6.069364161849711, "eval_loss": 0.5208334922790527, "eval_runtime": 11.7863, "eval_samples_per_second": 1612.377, "eval_steps_per_second": 33.598, "step": 15750 }, { "epoch": 6.088631984585742, "grad_norm": 0.46366313099861145, "learning_rate": 0.00017564701348747591, "loss": 0.441964225769043, "step": 15800 }, { "epoch": 6.088631984585742, "eval_loss": 0.5171211957931519, "eval_runtime": 11.7864, "eval_samples_per_second": 1612.373, "eval_steps_per_second": 33.598, "step": 15800 }, { "epoch": 6.107899807321773, "grad_norm": 0.47385674715042114, "learning_rate": 0.0001755699421965318, "loss": 0.4446437835693359, "step": 15850 }, { "epoch": 6.107899807321773, "eval_loss": 0.5199719667434692, "eval_runtime": 11.8288, "eval_samples_per_second": 1606.591, "eval_steps_per_second": 33.478, "step": 15850 }, { "epoch": 6.127167630057803, "grad_norm": 0.47526901960372925, "learning_rate": 0.00017549287090558768, "loss": 0.4495206832885742, "step": 15900 }, { "epoch": 6.127167630057803, "eval_loss": 0.525589644908905, "eval_runtime": 11.8002, "eval_samples_per_second": 1610.486, "eval_steps_per_second": 33.559, "step": 15900 }, { "epoch": 6.146435452793835, "grad_norm": 0.4920772314071655, "learning_rate": 0.00017541579961464357, "loss": 0.4440087890625, "step": 15950 }, { "epoch": 6.146435452793835, "eval_loss": 0.5237160325050354, "eval_runtime": 11.7827, "eval_samples_per_second": 1612.87, "eval_steps_per_second": 33.609, "step": 15950 }, { "epoch": 6.165703275529865, "grad_norm": 0.5098642110824585, "learning_rate": 0.00017533872832369943, "loss": 0.44675533294677733, "step": 16000 }, { "epoch": 6.165703275529865, "eval_loss": 0.5146178007125854, "eval_runtime": 11.8248, "eval_samples_per_second": 1607.128, "eval_steps_per_second": 33.489, "step": 16000 }, { "epoch": 6.184971098265896, "grad_norm": 0.4989316761493683, "learning_rate": 0.0001752616570327553, "loss": 0.4389476776123047, "step": 16050 }, { "epoch": 6.184971098265896, "eval_loss": 0.5142693519592285, "eval_runtime": 11.8093, "eval_samples_per_second": 1609.239, "eval_steps_per_second": 33.533, "step": 16050 }, { "epoch": 6.204238921001926, "grad_norm": 0.46497642993927, "learning_rate": 0.00017518458574181117, "loss": 0.4501412582397461, "step": 16100 }, { "epoch": 6.204238921001926, "eval_loss": 0.5020025372505188, "eval_runtime": 11.7916, "eval_samples_per_second": 1611.649, "eval_steps_per_second": 33.583, "step": 16100 }, { "epoch": 6.223506743737958, "grad_norm": 0.442963182926178, "learning_rate": 0.00017510751445086705, "loss": 0.44518890380859377, "step": 16150 }, { "epoch": 6.223506743737958, "eval_loss": 0.5138471126556396, "eval_runtime": 11.791, "eval_samples_per_second": 1611.744, "eval_steps_per_second": 33.585, "step": 16150 }, { "epoch": 6.242774566473988, "grad_norm": 0.47822749614715576, "learning_rate": 0.00017503044315992294, "loss": 0.43821014404296876, "step": 16200 }, { "epoch": 6.242774566473988, "eval_loss": 0.5112191438674927, "eval_runtime": 11.8005, "eval_samples_per_second": 1610.44, "eval_steps_per_second": 33.558, "step": 16200 }, { "epoch": 6.262042389210019, "grad_norm": 0.5071032047271729, "learning_rate": 0.00017495337186897882, "loss": 0.447220573425293, "step": 16250 }, { "epoch": 6.262042389210019, "eval_loss": 0.5022717118263245, "eval_runtime": 11.8015, "eval_samples_per_second": 1610.299, "eval_steps_per_second": 33.555, "step": 16250 }, { "epoch": 6.28131021194605, "grad_norm": 0.46061861515045166, "learning_rate": 0.00017487630057803468, "loss": 0.4489071655273438, "step": 16300 }, { "epoch": 6.28131021194605, "eval_loss": 0.5153299570083618, "eval_runtime": 11.7971, "eval_samples_per_second": 1610.899, "eval_steps_per_second": 33.567, "step": 16300 }, { "epoch": 6.300578034682081, "grad_norm": 0.46234917640686035, "learning_rate": 0.00017479922928709057, "loss": 0.438372917175293, "step": 16350 }, { "epoch": 6.300578034682081, "eval_loss": 0.5115172266960144, "eval_runtime": 11.7998, "eval_samples_per_second": 1610.537, "eval_steps_per_second": 33.56, "step": 16350 }, { "epoch": 6.319845857418112, "grad_norm": 0.4384799301624298, "learning_rate": 0.00017472215799614645, "loss": 0.4400736618041992, "step": 16400 }, { "epoch": 6.319845857418112, "eval_loss": 0.49961650371551514, "eval_runtime": 11.799, "eval_samples_per_second": 1610.644, "eval_steps_per_second": 33.562, "step": 16400 }, { "epoch": 6.339113680154142, "grad_norm": 0.49438998103141785, "learning_rate": 0.0001746450867052023, "loss": 0.4402273559570313, "step": 16450 }, { "epoch": 6.339113680154142, "eval_loss": 0.5002704858779907, "eval_runtime": 11.7931, "eval_samples_per_second": 1611.456, "eval_steps_per_second": 33.579, "step": 16450 }, { "epoch": 6.358381502890174, "grad_norm": 0.42053595185279846, "learning_rate": 0.0001745680154142582, "loss": 0.4375503158569336, "step": 16500 }, { "epoch": 6.358381502890174, "eval_loss": 0.5110408067703247, "eval_runtime": 11.8239, "eval_samples_per_second": 1607.251, "eval_steps_per_second": 33.491, "step": 16500 }, { "epoch": 6.377649325626204, "grad_norm": 0.440516859292984, "learning_rate": 0.00017449094412331408, "loss": 0.4425928497314453, "step": 16550 }, { "epoch": 6.377649325626204, "eval_loss": 0.49532556533813477, "eval_runtime": 11.7864, "eval_samples_per_second": 1612.361, "eval_steps_per_second": 33.598, "step": 16550 }, { "epoch": 6.396917148362235, "grad_norm": 0.4952506422996521, "learning_rate": 0.00017441387283236994, "loss": 0.4349885177612305, "step": 16600 }, { "epoch": 6.396917148362235, "eval_loss": 0.4996231198310852, "eval_runtime": 11.8649, "eval_samples_per_second": 1601.699, "eval_steps_per_second": 33.376, "step": 16600 }, { "epoch": 6.416184971098266, "grad_norm": 0.423630952835083, "learning_rate": 0.00017433680154142582, "loss": 0.4410920333862305, "step": 16650 }, { "epoch": 6.416184971098266, "eval_loss": 0.4955505430698395, "eval_runtime": 11.7964, "eval_samples_per_second": 1610.999, "eval_steps_per_second": 33.57, "step": 16650 }, { "epoch": 6.435452793834297, "grad_norm": 0.46397677063941956, "learning_rate": 0.0001742597302504817, "loss": 0.43901882171630857, "step": 16700 }, { "epoch": 6.435452793834297, "eval_loss": 0.5004918575286865, "eval_runtime": 11.7896, "eval_samples_per_second": 1611.933, "eval_steps_per_second": 33.589, "step": 16700 }, { "epoch": 6.454720616570327, "grad_norm": 0.4575091600418091, "learning_rate": 0.0001741826589595376, "loss": 0.4391038513183594, "step": 16750 }, { "epoch": 6.454720616570327, "eval_loss": 0.5030953288078308, "eval_runtime": 11.7914, "eval_samples_per_second": 1611.677, "eval_steps_per_second": 33.584, "step": 16750 }, { "epoch": 6.473988439306359, "grad_norm": 0.4218173623085022, "learning_rate": 0.00017410558766859345, "loss": 0.43956527709960935, "step": 16800 }, { "epoch": 6.473988439306359, "eval_loss": 0.5013048052787781, "eval_runtime": 11.7926, "eval_samples_per_second": 1611.518, "eval_steps_per_second": 33.58, "step": 16800 }, { "epoch": 6.493256262042389, "grad_norm": 0.4322430193424225, "learning_rate": 0.00017402851637764933, "loss": 0.43196697235107423, "step": 16850 }, { "epoch": 6.493256262042389, "eval_loss": 0.5013118386268616, "eval_runtime": 11.7912, "eval_samples_per_second": 1611.717, "eval_steps_per_second": 33.584, "step": 16850 }, { "epoch": 6.51252408477842, "grad_norm": 0.43160316348075867, "learning_rate": 0.0001739514450867052, "loss": 0.4431794357299805, "step": 16900 }, { "epoch": 6.51252408477842, "eval_loss": 0.5110673904418945, "eval_runtime": 11.7962, "eval_samples_per_second": 1611.026, "eval_steps_per_second": 33.57, "step": 16900 }, { "epoch": 6.531791907514451, "grad_norm": 0.49111542105674744, "learning_rate": 0.00017387437379576108, "loss": 0.44232017517089844, "step": 16950 }, { "epoch": 6.531791907514451, "eval_loss": 0.5047245621681213, "eval_runtime": 11.8052, "eval_samples_per_second": 1609.804, "eval_steps_per_second": 33.545, "step": 16950 }, { "epoch": 6.551059730250482, "grad_norm": 0.4832664430141449, "learning_rate": 0.00017379730250481696, "loss": 0.44115184783935546, "step": 17000 }, { "epoch": 6.551059730250482, "eval_loss": 0.503083348274231, "eval_runtime": 11.8068, "eval_samples_per_second": 1609.577, "eval_steps_per_second": 33.54, "step": 17000 }, { "epoch": 6.570327552986512, "grad_norm": 0.5922213792800903, "learning_rate": 0.00017372023121387285, "loss": 0.43453838348388674, "step": 17050 }, { "epoch": 6.570327552986512, "eval_loss": 0.5021129250526428, "eval_runtime": 11.7997, "eval_samples_per_second": 1610.551, "eval_steps_per_second": 33.56, "step": 17050 }, { "epoch": 6.589595375722544, "grad_norm": 0.4921945333480835, "learning_rate": 0.00017364315992292873, "loss": 0.4365489959716797, "step": 17100 }, { "epoch": 6.589595375722544, "eval_loss": 0.5093340277671814, "eval_runtime": 11.8154, "eval_samples_per_second": 1608.409, "eval_steps_per_second": 33.516, "step": 17100 }, { "epoch": 6.608863198458574, "grad_norm": 0.4384665787220001, "learning_rate": 0.0001735660886319846, "loss": 0.4366027450561523, "step": 17150 }, { "epoch": 6.608863198458574, "eval_loss": 0.5064074993133545, "eval_runtime": 11.7926, "eval_samples_per_second": 1611.519, "eval_steps_per_second": 33.58, "step": 17150 }, { "epoch": 6.628131021194605, "grad_norm": 0.5026290416717529, "learning_rate": 0.00017348901734104047, "loss": 0.437703971862793, "step": 17200 }, { "epoch": 6.628131021194605, "eval_loss": 0.49922600388526917, "eval_runtime": 11.8032, "eval_samples_per_second": 1610.074, "eval_steps_per_second": 33.55, "step": 17200 }, { "epoch": 6.6473988439306355, "grad_norm": 0.5124346017837524, "learning_rate": 0.00017341194605009633, "loss": 0.4357683563232422, "step": 17250 }, { "epoch": 6.6473988439306355, "eval_loss": 0.5048104524612427, "eval_runtime": 11.7947, "eval_samples_per_second": 1611.231, "eval_steps_per_second": 33.574, "step": 17250 }, { "epoch": 6.666666666666667, "grad_norm": 0.45675018429756165, "learning_rate": 0.00017333487475915222, "loss": 0.4339780807495117, "step": 17300 }, { "epoch": 6.666666666666667, "eval_loss": 0.4992403984069824, "eval_runtime": 11.8533, "eval_samples_per_second": 1603.267, "eval_steps_per_second": 33.408, "step": 17300 }, { "epoch": 6.6859344894026975, "grad_norm": 0.4528164267539978, "learning_rate": 0.0001732578034682081, "loss": 0.4414427947998047, "step": 17350 }, { "epoch": 6.6859344894026975, "eval_loss": 0.5065803527832031, "eval_runtime": 11.7915, "eval_samples_per_second": 1611.675, "eval_steps_per_second": 33.584, "step": 17350 }, { "epoch": 6.705202312138728, "grad_norm": 0.49560031294822693, "learning_rate": 0.000173180732177264, "loss": 0.43104766845703124, "step": 17400 }, { "epoch": 6.705202312138728, "eval_loss": 0.5004961490631104, "eval_runtime": 11.7969, "eval_samples_per_second": 1610.936, "eval_steps_per_second": 33.568, "step": 17400 }, { "epoch": 6.7244701348747595, "grad_norm": 0.3947650194168091, "learning_rate": 0.00017310366088631987, "loss": 0.432781982421875, "step": 17450 }, { "epoch": 6.7244701348747595, "eval_loss": 0.5007652640342712, "eval_runtime": 11.8002, "eval_samples_per_second": 1610.486, "eval_steps_per_second": 33.559, "step": 17450 }, { "epoch": 6.74373795761079, "grad_norm": 0.4669573903083801, "learning_rate": 0.00017302658959537573, "loss": 0.43634613037109377, "step": 17500 }, { "epoch": 6.74373795761079, "eval_loss": 0.5016438961029053, "eval_runtime": 11.8148, "eval_samples_per_second": 1608.492, "eval_steps_per_second": 33.517, "step": 17500 }, { "epoch": 6.763005780346821, "grad_norm": 0.47617945075035095, "learning_rate": 0.00017294951830443161, "loss": 0.42889091491699216, "step": 17550 }, { "epoch": 6.763005780346821, "eval_loss": 0.520977795124054, "eval_runtime": 11.8004, "eval_samples_per_second": 1610.451, "eval_steps_per_second": 33.558, "step": 17550 }, { "epoch": 6.782273603082851, "grad_norm": 0.4752465784549713, "learning_rate": 0.00017287244701348747, "loss": 0.4416308212280273, "step": 17600 }, { "epoch": 6.782273603082851, "eval_loss": 0.5011850595474243, "eval_runtime": 11.8082, "eval_samples_per_second": 1609.392, "eval_steps_per_second": 33.536, "step": 17600 }, { "epoch": 6.801541425818883, "grad_norm": 0.47106125950813293, "learning_rate": 0.00017279537572254336, "loss": 0.4374371337890625, "step": 17650 }, { "epoch": 6.801541425818883, "eval_loss": 0.4961530566215515, "eval_runtime": 11.7965, "eval_samples_per_second": 1610.984, "eval_steps_per_second": 33.569, "step": 17650 }, { "epoch": 6.820809248554913, "grad_norm": 0.4505392014980316, "learning_rate": 0.00017271830443159924, "loss": 0.4371138000488281, "step": 17700 }, { "epoch": 6.820809248554913, "eval_loss": 0.4944923222064972, "eval_runtime": 13.2037, "eval_samples_per_second": 1439.289, "eval_steps_per_second": 29.992, "step": 17700 }, { "epoch": 6.840077071290944, "grad_norm": 0.4777015149593353, "learning_rate": 0.00017264123314065513, "loss": 0.43514705657958985, "step": 17750 }, { "epoch": 6.840077071290944, "eval_loss": 0.49974122643470764, "eval_runtime": 13.218, "eval_samples_per_second": 1437.735, "eval_steps_per_second": 29.959, "step": 17750 }, { "epoch": 6.859344894026975, "grad_norm": 0.4539125859737396, "learning_rate": 0.00017256416184971098, "loss": 0.4337408447265625, "step": 17800 }, { "epoch": 6.859344894026975, "eval_loss": 0.4974925220012665, "eval_runtime": 13.3795, "eval_samples_per_second": 1420.378, "eval_steps_per_second": 29.597, "step": 17800 }, { "epoch": 6.878612716763006, "grad_norm": 0.41309189796447754, "learning_rate": 0.00017248709055876687, "loss": 0.4338951873779297, "step": 17850 }, { "epoch": 6.878612716763006, "eval_loss": 0.4931277334690094, "eval_runtime": 12.1115, "eval_samples_per_second": 1569.087, "eval_steps_per_second": 32.696, "step": 17850 }, { "epoch": 6.897880539499036, "grad_norm": 0.48061224818229675, "learning_rate": 0.00017241001926782275, "loss": 0.4352729415893555, "step": 17900 }, { "epoch": 6.897880539499036, "eval_loss": 0.5024764537811279, "eval_runtime": 11.8259, "eval_samples_per_second": 1606.984, "eval_steps_per_second": 33.486, "step": 17900 }, { "epoch": 6.917148362235068, "grad_norm": 0.4590555727481842, "learning_rate": 0.0001723329479768786, "loss": 0.43797447204589846, "step": 17950 }, { "epoch": 6.917148362235068, "eval_loss": 0.4912962019443512, "eval_runtime": 11.8238, "eval_samples_per_second": 1607.273, "eval_steps_per_second": 33.492, "step": 17950 }, { "epoch": 6.936416184971098, "grad_norm": 0.4490097463130951, "learning_rate": 0.0001722558766859345, "loss": 0.42794281005859375, "step": 18000 }, { "epoch": 6.936416184971098, "eval_loss": 0.48559698462486267, "eval_runtime": 13.2882, "eval_samples_per_second": 1430.139, "eval_steps_per_second": 29.801, "step": 18000 }, { "epoch": 6.955684007707129, "grad_norm": 0.4649190306663513, "learning_rate": 0.00017217880539499038, "loss": 0.43179595947265625, "step": 18050 }, { "epoch": 6.955684007707129, "eval_loss": 0.49804040789604187, "eval_runtime": 11.792, "eval_samples_per_second": 1611.595, "eval_steps_per_second": 33.582, "step": 18050 }, { "epoch": 6.97495183044316, "grad_norm": 0.43740737438201904, "learning_rate": 0.00017210173410404624, "loss": 0.4345121383666992, "step": 18100 }, { "epoch": 6.97495183044316, "eval_loss": 0.5065304636955261, "eval_runtime": 12.3665, "eval_samples_per_second": 1536.73, "eval_steps_per_second": 32.022, "step": 18100 }, { "epoch": 6.994219653179191, "grad_norm": 0.4777332544326782, "learning_rate": 0.00017202466281310212, "loss": 0.42909679412841795, "step": 18150 }, { "epoch": 6.994219653179191, "eval_loss": 0.4926902651786804, "eval_runtime": 11.8465, "eval_samples_per_second": 1604.18, "eval_steps_per_second": 33.427, "step": 18150 }, { "epoch": 7.0134874759152215, "grad_norm": 0.4738602638244629, "learning_rate": 0.000171947591522158, "loss": 0.4332198715209961, "step": 18200 }, { "epoch": 7.0134874759152215, "eval_loss": 0.49622878432273865, "eval_runtime": 11.8094, "eval_samples_per_second": 1609.227, "eval_steps_per_second": 33.533, "step": 18200 }, { "epoch": 7.032755298651252, "grad_norm": 0.527941107749939, "learning_rate": 0.0001718705202312139, "loss": 0.43561668395996095, "step": 18250 }, { "epoch": 7.032755298651252, "eval_loss": 0.4889552593231201, "eval_runtime": 11.8189, "eval_samples_per_second": 1607.936, "eval_steps_per_second": 33.506, "step": 18250 }, { "epoch": 7.0520231213872835, "grad_norm": 0.514523446559906, "learning_rate": 0.00017179344894026975, "loss": 0.4271894073486328, "step": 18300 }, { "epoch": 7.0520231213872835, "eval_loss": 0.4923912584781647, "eval_runtime": 11.821, "eval_samples_per_second": 1607.644, "eval_steps_per_second": 33.5, "step": 18300 }, { "epoch": 7.071290944123314, "grad_norm": 0.44800734519958496, "learning_rate": 0.00017171637764932564, "loss": 0.42959102630615237, "step": 18350 }, { "epoch": 7.071290944123314, "eval_loss": 0.4852442741394043, "eval_runtime": 11.8521, "eval_samples_per_second": 1603.429, "eval_steps_per_second": 33.412, "step": 18350 }, { "epoch": 7.090558766859345, "grad_norm": 0.4187127947807312, "learning_rate": 0.0001716393063583815, "loss": 0.4327618408203125, "step": 18400 }, { "epoch": 7.090558766859345, "eval_loss": 0.4900670647621155, "eval_runtime": 11.8228, "eval_samples_per_second": 1607.397, "eval_steps_per_second": 33.494, "step": 18400 }, { "epoch": 7.109826589595376, "grad_norm": 0.6178864240646362, "learning_rate": 0.00017156223506743738, "loss": 0.4312919616699219, "step": 18450 }, { "epoch": 7.109826589595376, "eval_loss": 0.49219638109207153, "eval_runtime": 11.8175, "eval_samples_per_second": 1608.125, "eval_steps_per_second": 33.51, "step": 18450 }, { "epoch": 7.129094412331407, "grad_norm": 0.46727705001831055, "learning_rate": 0.00017148516377649326, "loss": 0.43129562377929687, "step": 18500 }, { "epoch": 7.129094412331407, "eval_loss": 0.4905467629432678, "eval_runtime": 11.8232, "eval_samples_per_second": 1607.345, "eval_steps_per_second": 33.493, "step": 18500 }, { "epoch": 7.148362235067437, "grad_norm": 0.44432124495506287, "learning_rate": 0.00017140809248554915, "loss": 0.4354438018798828, "step": 18550 }, { "epoch": 7.148362235067437, "eval_loss": 0.49216827750205994, "eval_runtime": 11.8149, "eval_samples_per_second": 1608.479, "eval_steps_per_second": 33.517, "step": 18550 }, { "epoch": 7.167630057803469, "grad_norm": 0.48657649755477905, "learning_rate": 0.00017133102119460503, "loss": 0.4246221923828125, "step": 18600 }, { "epoch": 7.167630057803469, "eval_loss": 0.48143184185028076, "eval_runtime": 11.8083, "eval_samples_per_second": 1609.381, "eval_steps_per_second": 33.536, "step": 18600 }, { "epoch": 7.186897880539499, "grad_norm": 0.5179467797279358, "learning_rate": 0.0001712539499036609, "loss": 0.4301288986206055, "step": 18650 }, { "epoch": 7.186897880539499, "eval_loss": 0.4878024160861969, "eval_runtime": 11.8165, "eval_samples_per_second": 1608.256, "eval_steps_per_second": 33.512, "step": 18650 }, { "epoch": 7.20616570327553, "grad_norm": 0.4748821258544922, "learning_rate": 0.00017117687861271675, "loss": 0.42851287841796876, "step": 18700 }, { "epoch": 7.20616570327553, "eval_loss": 0.49397167563438416, "eval_runtime": 11.8192, "eval_samples_per_second": 1607.891, "eval_steps_per_second": 33.505, "step": 18700 }, { "epoch": 7.22543352601156, "grad_norm": 0.44963929057121277, "learning_rate": 0.00017109980732177263, "loss": 0.43318996429443357, "step": 18750 }, { "epoch": 7.22543352601156, "eval_loss": 0.49174365401268005, "eval_runtime": 11.8127, "eval_samples_per_second": 1608.774, "eval_steps_per_second": 33.523, "step": 18750 }, { "epoch": 7.244701348747592, "grad_norm": 0.46700164675712585, "learning_rate": 0.00017102273603082852, "loss": 0.4332516479492188, "step": 18800 }, { "epoch": 7.244701348747592, "eval_loss": 0.4938528537750244, "eval_runtime": 11.8219, "eval_samples_per_second": 1607.519, "eval_steps_per_second": 33.497, "step": 18800 }, { "epoch": 7.263969171483622, "grad_norm": 0.4882650375366211, "learning_rate": 0.0001709456647398844, "loss": 0.4273331832885742, "step": 18850 }, { "epoch": 7.263969171483622, "eval_loss": 0.4957718551158905, "eval_runtime": 11.8081, "eval_samples_per_second": 1609.409, "eval_steps_per_second": 33.536, "step": 18850 }, { "epoch": 7.283236994219653, "grad_norm": 0.4159916043281555, "learning_rate": 0.0001708685934489403, "loss": 0.4290004730224609, "step": 18900 }, { "epoch": 7.283236994219653, "eval_loss": 0.5029377937316895, "eval_runtime": 11.8359, "eval_samples_per_second": 1605.618, "eval_steps_per_second": 33.457, "step": 18900 }, { "epoch": 7.302504816955684, "grad_norm": 0.4774915874004364, "learning_rate": 0.00017079152215799617, "loss": 0.4293732833862305, "step": 18950 }, { "epoch": 7.302504816955684, "eval_loss": 0.5016842484474182, "eval_runtime": 11.807, "eval_samples_per_second": 1609.557, "eval_steps_per_second": 33.539, "step": 18950 }, { "epoch": 7.321772639691715, "grad_norm": 0.4764392673969269, "learning_rate": 0.00017071445086705203, "loss": 0.4276705169677734, "step": 19000 }, { "epoch": 7.321772639691715, "eval_loss": 0.49698731303215027, "eval_runtime": 11.8576, "eval_samples_per_second": 1602.679, "eval_steps_per_second": 33.396, "step": 19000 }, { "epoch": 7.341040462427745, "grad_norm": 0.4899766743183136, "learning_rate": 0.0001706373795761079, "loss": 0.4325958251953125, "step": 19050 }, { "epoch": 7.341040462427745, "eval_loss": 0.502516508102417, "eval_runtime": 11.8188, "eval_samples_per_second": 1607.948, "eval_steps_per_second": 33.506, "step": 19050 }, { "epoch": 7.360308285163777, "grad_norm": 0.48762696981430054, "learning_rate": 0.00017056030828516377, "loss": 0.42429801940917966, "step": 19100 }, { "epoch": 7.360308285163777, "eval_loss": 0.5113667249679565, "eval_runtime": 11.8149, "eval_samples_per_second": 1608.471, "eval_steps_per_second": 33.517, "step": 19100 }, { "epoch": 7.3795761078998074, "grad_norm": 0.4457147419452667, "learning_rate": 0.00017048323699421966, "loss": 0.42884456634521484, "step": 19150 }, { "epoch": 7.3795761078998074, "eval_loss": 0.5030876398086548, "eval_runtime": 11.8126, "eval_samples_per_second": 1608.789, "eval_steps_per_second": 33.523, "step": 19150 }, { "epoch": 7.398843930635838, "grad_norm": 0.49163588881492615, "learning_rate": 0.00017040616570327554, "loss": 0.42685791015625, "step": 19200 }, { "epoch": 7.398843930635838, "eval_loss": 0.49079224467277527, "eval_runtime": 11.8144, "eval_samples_per_second": 1608.548, "eval_steps_per_second": 33.518, "step": 19200 }, { "epoch": 7.418111753371869, "grad_norm": 0.5455588698387146, "learning_rate": 0.00017032909441233143, "loss": 0.42848594665527345, "step": 19250 }, { "epoch": 7.418111753371869, "eval_loss": 0.4933931529521942, "eval_runtime": 11.8748, "eval_samples_per_second": 1600.363, "eval_steps_per_second": 33.348, "step": 19250 }, { "epoch": 7.4373795761079, "grad_norm": 0.43163007497787476, "learning_rate": 0.0001702520231213873, "loss": 0.4202920532226562, "step": 19300 }, { "epoch": 7.4373795761079, "eval_loss": 0.49864184856414795, "eval_runtime": 11.8221, "eval_samples_per_second": 1607.496, "eval_steps_per_second": 33.497, "step": 19300 }, { "epoch": 7.456647398843931, "grad_norm": 0.4442368447780609, "learning_rate": 0.00017017495183044317, "loss": 0.42987884521484376, "step": 19350 }, { "epoch": 7.456647398843931, "eval_loss": 0.5030323266983032, "eval_runtime": 11.8257, "eval_samples_per_second": 1607.011, "eval_steps_per_second": 33.486, "step": 19350 }, { "epoch": 7.475915221579961, "grad_norm": 0.4184732735157013, "learning_rate": 0.00017009788053949906, "loss": 0.43229248046875, "step": 19400 }, { "epoch": 7.475915221579961, "eval_loss": 0.49624088406562805, "eval_runtime": 11.8229, "eval_samples_per_second": 1607.389, "eval_steps_per_second": 33.494, "step": 19400 }, { "epoch": 7.495183044315993, "grad_norm": 0.41610148549079895, "learning_rate": 0.00017002080924855491, "loss": 0.4322586822509766, "step": 19450 }, { "epoch": 7.495183044315993, "eval_loss": 0.4914975166320801, "eval_runtime": 11.8217, "eval_samples_per_second": 1607.551, "eval_steps_per_second": 33.498, "step": 19450 }, { "epoch": 7.514450867052023, "grad_norm": 0.48153382539749146, "learning_rate": 0.0001699437379576108, "loss": 0.42691741943359374, "step": 19500 }, { "epoch": 7.514450867052023, "eval_loss": 0.48589327931404114, "eval_runtime": 11.806, "eval_samples_per_second": 1609.693, "eval_steps_per_second": 33.542, "step": 19500 }, { "epoch": 7.533718689788054, "grad_norm": 0.42066672444343567, "learning_rate": 0.00016986666666666668, "loss": 0.42665016174316406, "step": 19550 }, { "epoch": 7.533718689788054, "eval_loss": 0.48576316237449646, "eval_runtime": 11.8082, "eval_samples_per_second": 1609.392, "eval_steps_per_second": 33.536, "step": 19550 }, { "epoch": 7.552986512524085, "grad_norm": 0.4868031144142151, "learning_rate": 0.00016978959537572254, "loss": 0.42619712829589845, "step": 19600 }, { "epoch": 7.552986512524085, "eval_loss": 0.48951202630996704, "eval_runtime": 11.8104, "eval_samples_per_second": 1609.095, "eval_steps_per_second": 33.53, "step": 19600 }, { "epoch": 7.572254335260116, "grad_norm": 0.4862057566642761, "learning_rate": 0.00016971252408477843, "loss": 0.4224903106689453, "step": 19650 }, { "epoch": 7.572254335260116, "eval_loss": 0.4773549437522888, "eval_runtime": 11.8187, "eval_samples_per_second": 1607.958, "eval_steps_per_second": 33.506, "step": 19650 }, { "epoch": 7.591522157996146, "grad_norm": 0.4975226819515228, "learning_rate": 0.0001696354527938343, "loss": 0.42778453826904295, "step": 19700 }, { "epoch": 7.591522157996146, "eval_loss": 0.502377986907959, "eval_runtime": 11.8111, "eval_samples_per_second": 1608.994, "eval_steps_per_second": 33.528, "step": 19700 }, { "epoch": 7.610789980732177, "grad_norm": 0.45241421461105347, "learning_rate": 0.0001695583815028902, "loss": 0.4231147766113281, "step": 19750 }, { "epoch": 7.610789980732177, "eval_loss": 0.48937806487083435, "eval_runtime": 11.8071, "eval_samples_per_second": 1609.543, "eval_steps_per_second": 33.539, "step": 19750 }, { "epoch": 7.630057803468208, "grad_norm": 0.465789794921875, "learning_rate": 0.00016948131021194605, "loss": 0.4268674468994141, "step": 19800 }, { "epoch": 7.630057803468208, "eval_loss": 0.48858407139778137, "eval_runtime": 11.8133, "eval_samples_per_second": 1608.695, "eval_steps_per_second": 33.522, "step": 19800 }, { "epoch": 7.649325626204239, "grad_norm": 0.47830289602279663, "learning_rate": 0.00016940423892100194, "loss": 0.42394199371337893, "step": 19850 }, { "epoch": 7.649325626204239, "eval_loss": 0.4822678565979004, "eval_runtime": 11.8046, "eval_samples_per_second": 1609.877, "eval_steps_per_second": 33.546, "step": 19850 }, { "epoch": 7.668593448940269, "grad_norm": 0.42264917492866516, "learning_rate": 0.0001693271676300578, "loss": 0.42383056640625, "step": 19900 }, { "epoch": 7.668593448940269, "eval_loss": 0.4837288558483124, "eval_runtime": 11.8198, "eval_samples_per_second": 1607.807, "eval_steps_per_second": 33.503, "step": 19900 }, { "epoch": 7.687861271676301, "grad_norm": 0.39491164684295654, "learning_rate": 0.00016925009633911368, "loss": 0.42544754028320314, "step": 19950 }, { "epoch": 7.687861271676301, "eval_loss": 0.48593902587890625, "eval_runtime": 11.8224, "eval_samples_per_second": 1607.453, "eval_steps_per_second": 33.496, "step": 19950 }, { "epoch": 7.707129094412331, "grad_norm": 0.4179113805294037, "learning_rate": 0.00016917302504816957, "loss": 0.42940460205078124, "step": 20000 }, { "epoch": 7.707129094412331, "eval_loss": 0.49314484000205994, "eval_runtime": 11.822, "eval_samples_per_second": 1607.517, "eval_steps_per_second": 33.497, "step": 20000 }, { "epoch": 7.726396917148362, "grad_norm": 0.4329525828361511, "learning_rate": 0.00016909595375722545, "loss": 0.4260286331176758, "step": 20050 }, { "epoch": 7.726396917148362, "eval_loss": 0.4993878901004791, "eval_runtime": 11.8343, "eval_samples_per_second": 1605.847, "eval_steps_per_second": 33.462, "step": 20050 }, { "epoch": 7.745664739884393, "grad_norm": 0.43154412508010864, "learning_rate": 0.00016901888246628134, "loss": 0.42371501922607424, "step": 20100 }, { "epoch": 7.745664739884393, "eval_loss": 0.4884226620197296, "eval_runtime": 11.8279, "eval_samples_per_second": 1606.705, "eval_steps_per_second": 33.48, "step": 20100 }, { "epoch": 7.764932562620424, "grad_norm": 0.6652418375015259, "learning_rate": 0.0001689418111753372, "loss": 0.42386119842529296, "step": 20150 }, { "epoch": 7.764932562620424, "eval_loss": 0.4937761723995209, "eval_runtime": 11.8206, "eval_samples_per_second": 1607.698, "eval_steps_per_second": 33.501, "step": 20150 }, { "epoch": 7.7842003853564545, "grad_norm": 0.4203815758228302, "learning_rate": 0.00016886473988439305, "loss": 0.4225673294067383, "step": 20200 }, { "epoch": 7.7842003853564545, "eval_loss": 0.484182208776474, "eval_runtime": 11.8265, "eval_samples_per_second": 1606.896, "eval_steps_per_second": 33.484, "step": 20200 }, { "epoch": 7.803468208092486, "grad_norm": 0.44223129749298096, "learning_rate": 0.00016878766859344894, "loss": 0.42796104431152343, "step": 20250 }, { "epoch": 7.803468208092486, "eval_loss": 0.4804375469684601, "eval_runtime": 11.8206, "eval_samples_per_second": 1607.703, "eval_steps_per_second": 33.501, "step": 20250 }, { "epoch": 7.8227360308285165, "grad_norm": 0.5606984496116638, "learning_rate": 0.00016871059730250482, "loss": 0.4236497497558594, "step": 20300 }, { "epoch": 7.8227360308285165, "eval_loss": 0.47552910447120667, "eval_runtime": 11.8145, "eval_samples_per_second": 1608.529, "eval_steps_per_second": 33.518, "step": 20300 }, { "epoch": 7.842003853564547, "grad_norm": 0.4757080376148224, "learning_rate": 0.0001686335260115607, "loss": 0.424999885559082, "step": 20350 }, { "epoch": 7.842003853564547, "eval_loss": 0.471565306186676, "eval_runtime": 11.8157, "eval_samples_per_second": 1608.363, "eval_steps_per_second": 33.515, "step": 20350 }, { "epoch": 7.861271676300578, "grad_norm": 0.46330690383911133, "learning_rate": 0.0001685564547206166, "loss": 0.42600570678710936, "step": 20400 }, { "epoch": 7.861271676300578, "eval_loss": 0.47631657123565674, "eval_runtime": 11.8142, "eval_samples_per_second": 1608.57, "eval_steps_per_second": 33.519, "step": 20400 }, { "epoch": 7.880539499036609, "grad_norm": 0.44312265515327454, "learning_rate": 0.00016847938342967248, "loss": 0.423262939453125, "step": 20450 }, { "epoch": 7.880539499036609, "eval_loss": 0.48198261857032776, "eval_runtime": 11.8093, "eval_samples_per_second": 1609.24, "eval_steps_per_second": 33.533, "step": 20450 }, { "epoch": 7.89980732177264, "grad_norm": 0.47821977734565735, "learning_rate": 0.00016840231213872833, "loss": 0.4249131774902344, "step": 20500 }, { "epoch": 7.89980732177264, "eval_loss": 0.4922085702419281, "eval_runtime": 11.8175, "eval_samples_per_second": 1608.121, "eval_steps_per_second": 33.51, "step": 20500 }, { "epoch": 7.91907514450867, "grad_norm": 0.44139882922172546, "learning_rate": 0.0001683252408477842, "loss": 0.42473735809326174, "step": 20550 }, { "epoch": 7.91907514450867, "eval_loss": 0.490535706281662, "eval_runtime": 11.8221, "eval_samples_per_second": 1607.495, "eval_steps_per_second": 33.497, "step": 20550 }, { "epoch": 7.938342967244702, "grad_norm": 0.4765031635761261, "learning_rate": 0.00016824816955684008, "loss": 0.42616119384765627, "step": 20600 }, { "epoch": 7.938342967244702, "eval_loss": 0.4834766089916229, "eval_runtime": 11.8025, "eval_samples_per_second": 1610.165, "eval_steps_per_second": 33.552, "step": 20600 }, { "epoch": 7.957610789980732, "grad_norm": 0.44993463158607483, "learning_rate": 0.00016817109826589596, "loss": 0.42288238525390626, "step": 20650 }, { "epoch": 7.957610789980732, "eval_loss": 0.4965783953666687, "eval_runtime": 11.8194, "eval_samples_per_second": 1607.861, "eval_steps_per_second": 33.504, "step": 20650 }, { "epoch": 7.976878612716763, "grad_norm": 0.4247177541255951, "learning_rate": 0.00016809402697495185, "loss": 0.4166572189331055, "step": 20700 }, { "epoch": 7.976878612716763, "eval_loss": 0.47500768303871155, "eval_runtime": 11.8246, "eval_samples_per_second": 1607.16, "eval_steps_per_second": 33.49, "step": 20700 }, { "epoch": 7.996146435452793, "grad_norm": 0.4889267683029175, "learning_rate": 0.0001680169556840077, "loss": 0.4234194564819336, "step": 20750 }, { "epoch": 7.996146435452793, "eval_loss": 0.4803604185581207, "eval_runtime": 11.812, "eval_samples_per_second": 1608.878, "eval_steps_per_second": 33.525, "step": 20750 }, { "epoch": 8.015414258188825, "grad_norm": 0.4533916711807251, "learning_rate": 0.0001679398843930636, "loss": 0.41738582611083985, "step": 20800 }, { "epoch": 8.015414258188825, "eval_loss": 0.4788426160812378, "eval_runtime": 11.8256, "eval_samples_per_second": 1607.016, "eval_steps_per_second": 33.487, "step": 20800 }, { "epoch": 8.034682080924856, "grad_norm": 0.49089741706848145, "learning_rate": 0.00016786281310211948, "loss": 0.42442855834960935, "step": 20850 }, { "epoch": 8.034682080924856, "eval_loss": 0.48839110136032104, "eval_runtime": 11.837, "eval_samples_per_second": 1605.48, "eval_steps_per_second": 33.455, "step": 20850 }, { "epoch": 8.053949903660886, "grad_norm": 0.4140833616256714, "learning_rate": 0.00016778574181117533, "loss": 0.418427734375, "step": 20900 }, { "epoch": 8.053949903660886, "eval_loss": 0.4959338903427124, "eval_runtime": 11.8114, "eval_samples_per_second": 1608.96, "eval_steps_per_second": 33.527, "step": 20900 }, { "epoch": 8.073217726396917, "grad_norm": 0.4451848566532135, "learning_rate": 0.00016770867052023122, "loss": 0.4229635238647461, "step": 20950 }, { "epoch": 8.073217726396917, "eval_loss": 0.48218944668769836, "eval_runtime": 11.8192, "eval_samples_per_second": 1607.897, "eval_steps_per_second": 33.505, "step": 20950 }, { "epoch": 8.092485549132949, "grad_norm": 0.44255509972572327, "learning_rate": 0.0001676315992292871, "loss": 0.42198997497558594, "step": 21000 }, { "epoch": 8.092485549132949, "eval_loss": 0.47897204756736755, "eval_runtime": 11.8354, "eval_samples_per_second": 1605.692, "eval_steps_per_second": 33.459, "step": 21000 }, { "epoch": 8.111753371868978, "grad_norm": 0.4628770053386688, "learning_rate": 0.00016755452793834296, "loss": 0.41917106628417966, "step": 21050 }, { "epoch": 8.111753371868978, "eval_loss": 0.4909592866897583, "eval_runtime": 11.8201, "eval_samples_per_second": 1607.77, "eval_steps_per_second": 33.502, "step": 21050 }, { "epoch": 8.13102119460501, "grad_norm": 0.4589299261569977, "learning_rate": 0.00016747745664739885, "loss": 0.42341377258300783, "step": 21100 }, { "epoch": 8.13102119460501, "eval_loss": 0.4949495494365692, "eval_runtime": 11.8148, "eval_samples_per_second": 1608.498, "eval_steps_per_second": 33.517, "step": 21100 }, { "epoch": 8.15028901734104, "grad_norm": 0.38081473112106323, "learning_rate": 0.00016740038535645473, "loss": 0.4113267135620117, "step": 21150 }, { "epoch": 8.15028901734104, "eval_loss": 0.4870956540107727, "eval_runtime": 11.816, "eval_samples_per_second": 1608.324, "eval_steps_per_second": 33.514, "step": 21150 }, { "epoch": 8.169556840077071, "grad_norm": 0.37682875990867615, "learning_rate": 0.00016732331406551062, "loss": 0.4219830322265625, "step": 21200 }, { "epoch": 8.169556840077071, "eval_loss": 0.4932413399219513, "eval_runtime": 11.8372, "eval_samples_per_second": 1605.454, "eval_steps_per_second": 33.454, "step": 21200 }, { "epoch": 8.188824662813103, "grad_norm": 0.4702693819999695, "learning_rate": 0.00016724624277456647, "loss": 0.4151516342163086, "step": 21250 }, { "epoch": 8.188824662813103, "eval_loss": 0.4832304120063782, "eval_runtime": 11.8275, "eval_samples_per_second": 1606.765, "eval_steps_per_second": 33.481, "step": 21250 }, { "epoch": 8.208092485549132, "grad_norm": 0.46806827187538147, "learning_rate": 0.00016716917148362236, "loss": 0.41576744079589845, "step": 21300 }, { "epoch": 8.208092485549132, "eval_loss": 0.48683032393455505, "eval_runtime": 11.8127, "eval_samples_per_second": 1608.775, "eval_steps_per_second": 33.523, "step": 21300 }, { "epoch": 8.227360308285164, "grad_norm": 0.4199375510215759, "learning_rate": 0.00016709210019267822, "loss": 0.41887649536132815, "step": 21350 }, { "epoch": 8.227360308285164, "eval_loss": 0.48174870014190674, "eval_runtime": 11.8512, "eval_samples_per_second": 1603.544, "eval_steps_per_second": 33.414, "step": 21350 }, { "epoch": 8.246628131021195, "grad_norm": 0.46221721172332764, "learning_rate": 0.0001670150289017341, "loss": 0.418994255065918, "step": 21400 }, { "epoch": 8.246628131021195, "eval_loss": 0.48378169536590576, "eval_runtime": 11.8235, "eval_samples_per_second": 1607.312, "eval_steps_per_second": 33.493, "step": 21400 }, { "epoch": 8.265895953757225, "grad_norm": 0.42628708481788635, "learning_rate": 0.00016693795761078999, "loss": 0.4183528900146484, "step": 21450 }, { "epoch": 8.265895953757225, "eval_loss": 0.48402947187423706, "eval_runtime": 11.8272, "eval_samples_per_second": 1606.802, "eval_steps_per_second": 33.482, "step": 21450 }, { "epoch": 8.285163776493256, "grad_norm": 0.47649237513542175, "learning_rate": 0.00016686088631984587, "loss": 0.4255474853515625, "step": 21500 }, { "epoch": 8.285163776493256, "eval_loss": 0.48483502864837646, "eval_runtime": 11.8231, "eval_samples_per_second": 1607.359, "eval_steps_per_second": 33.494, "step": 21500 }, { "epoch": 8.304431599229288, "grad_norm": 0.44782474637031555, "learning_rate": 0.00016678381502890176, "loss": 0.4240903472900391, "step": 21550 }, { "epoch": 8.304431599229288, "eval_loss": 0.4841375946998596, "eval_runtime": 11.8905, "eval_samples_per_second": 1598.256, "eval_steps_per_second": 33.304, "step": 21550 }, { "epoch": 8.323699421965317, "grad_norm": 0.4298747479915619, "learning_rate": 0.00016670674373795764, "loss": 0.4145013427734375, "step": 21600 }, { "epoch": 8.323699421965317, "eval_loss": 0.49052029848098755, "eval_runtime": 11.8132, "eval_samples_per_second": 1608.707, "eval_steps_per_second": 33.522, "step": 21600 }, { "epoch": 8.342967244701349, "grad_norm": 0.4794468879699707, "learning_rate": 0.0001666296724470135, "loss": 0.4223299026489258, "step": 21650 }, { "epoch": 8.342967244701349, "eval_loss": 0.47821173071861267, "eval_runtime": 11.8211, "eval_samples_per_second": 1607.632, "eval_steps_per_second": 33.499, "step": 21650 }, { "epoch": 8.36223506743738, "grad_norm": 0.42012080550193787, "learning_rate": 0.00016655260115606936, "loss": 0.4214884185791016, "step": 21700 }, { "epoch": 8.36223506743738, "eval_loss": 0.4747600853443146, "eval_runtime": 11.8147, "eval_samples_per_second": 1608.502, "eval_steps_per_second": 33.518, "step": 21700 }, { "epoch": 8.38150289017341, "grad_norm": 0.440970242023468, "learning_rate": 0.00016647552986512524, "loss": 0.4084894561767578, "step": 21750 }, { "epoch": 8.38150289017341, "eval_loss": 0.4743127226829529, "eval_runtime": 11.8102, "eval_samples_per_second": 1609.114, "eval_steps_per_second": 33.53, "step": 21750 }, { "epoch": 8.400770712909441, "grad_norm": 0.5102022886276245, "learning_rate": 0.00016639845857418113, "loss": 0.41423477172851564, "step": 21800 }, { "epoch": 8.400770712909441, "eval_loss": 0.47620320320129395, "eval_runtime": 11.8142, "eval_samples_per_second": 1608.567, "eval_steps_per_second": 33.519, "step": 21800 }, { "epoch": 8.420038535645473, "grad_norm": 0.4602818191051483, "learning_rate": 0.000166321387283237, "loss": 0.413984375, "step": 21850 }, { "epoch": 8.420038535645473, "eval_loss": 0.4792347252368927, "eval_runtime": 11.811, "eval_samples_per_second": 1609.008, "eval_steps_per_second": 33.528, "step": 21850 }, { "epoch": 8.439306358381502, "grad_norm": 0.4190075695514679, "learning_rate": 0.0001662443159922929, "loss": 0.4156875228881836, "step": 21900 }, { "epoch": 8.439306358381502, "eval_loss": 0.4766746759414673, "eval_runtime": 11.8167, "eval_samples_per_second": 1608.238, "eval_steps_per_second": 33.512, "step": 21900 }, { "epoch": 8.458574181117534, "grad_norm": 0.42470672726631165, "learning_rate": 0.00016616724470134875, "loss": 0.4173742294311523, "step": 21950 }, { "epoch": 8.458574181117534, "eval_loss": 0.4770633280277252, "eval_runtime": 11.8203, "eval_samples_per_second": 1607.738, "eval_steps_per_second": 33.502, "step": 21950 }, { "epoch": 8.477842003853565, "grad_norm": 0.46187299489974976, "learning_rate": 0.00016609017341040464, "loss": 0.42138824462890623, "step": 22000 }, { "epoch": 8.477842003853565, "eval_loss": 0.47176647186279297, "eval_runtime": 11.8244, "eval_samples_per_second": 1607.192, "eval_steps_per_second": 33.49, "step": 22000 }, { "epoch": 8.497109826589595, "grad_norm": 0.4254043698310852, "learning_rate": 0.0001660131021194605, "loss": 0.41503673553466797, "step": 22050 }, { "epoch": 8.497109826589595, "eval_loss": 0.47966092824935913, "eval_runtime": 11.8187, "eval_samples_per_second": 1607.959, "eval_steps_per_second": 33.506, "step": 22050 }, { "epoch": 8.516377649325626, "grad_norm": 0.4699583649635315, "learning_rate": 0.00016593603082851638, "loss": 0.4146232604980469, "step": 22100 }, { "epoch": 8.516377649325626, "eval_loss": 0.4778515100479126, "eval_runtime": 11.8134, "eval_samples_per_second": 1608.677, "eval_steps_per_second": 33.521, "step": 22100 }, { "epoch": 8.535645472061656, "grad_norm": 0.4389056861400604, "learning_rate": 0.00016585895953757227, "loss": 0.41419044494628904, "step": 22150 }, { "epoch": 8.535645472061656, "eval_loss": 0.4897356629371643, "eval_runtime": 11.811, "eval_samples_per_second": 1609.008, "eval_steps_per_second": 33.528, "step": 22150 }, { "epoch": 8.554913294797688, "grad_norm": 0.4304552376270294, "learning_rate": 0.00016578188824662815, "loss": 0.41781471252441404, "step": 22200 }, { "epoch": 8.554913294797688, "eval_loss": 0.4921175539493561, "eval_runtime": 11.813, "eval_samples_per_second": 1608.737, "eval_steps_per_second": 33.522, "step": 22200 }, { "epoch": 8.574181117533719, "grad_norm": 0.44396817684173584, "learning_rate": 0.000165704816955684, "loss": 0.41830795288085937, "step": 22250 }, { "epoch": 8.574181117533719, "eval_loss": 0.4801716208457947, "eval_runtime": 11.8421, "eval_samples_per_second": 1604.788, "eval_steps_per_second": 33.44, "step": 22250 }, { "epoch": 8.593448940269749, "grad_norm": 0.4537654221057892, "learning_rate": 0.0001656277456647399, "loss": 0.421578369140625, "step": 22300 }, { "epoch": 8.593448940269749, "eval_loss": 0.495498389005661, "eval_runtime": 11.8155, "eval_samples_per_second": 1608.393, "eval_steps_per_second": 33.515, "step": 22300 }, { "epoch": 8.61271676300578, "grad_norm": 0.447837769985199, "learning_rate": 0.00016555067437379578, "loss": 0.41189361572265626, "step": 22350 }, { "epoch": 8.61271676300578, "eval_loss": 0.49026983976364136, "eval_runtime": 11.8112, "eval_samples_per_second": 1608.986, "eval_steps_per_second": 33.528, "step": 22350 }, { "epoch": 8.631984585741812, "grad_norm": 0.4252611994743347, "learning_rate": 0.00016547360308285164, "loss": 0.4131362152099609, "step": 22400 }, { "epoch": 8.631984585741812, "eval_loss": 0.48400214314460754, "eval_runtime": 11.818, "eval_samples_per_second": 1608.052, "eval_steps_per_second": 33.508, "step": 22400 }, { "epoch": 8.651252408477841, "grad_norm": 0.461011677980423, "learning_rate": 0.00016539653179190752, "loss": 0.4118386459350586, "step": 22450 }, { "epoch": 8.651252408477841, "eval_loss": 0.47748470306396484, "eval_runtime": 11.8165, "eval_samples_per_second": 1608.256, "eval_steps_per_second": 33.512, "step": 22450 }, { "epoch": 8.670520231213873, "grad_norm": 0.40479350090026855, "learning_rate": 0.0001653194605009634, "loss": 0.42069393157958984, "step": 22500 }, { "epoch": 8.670520231213873, "eval_loss": 0.4771746098995209, "eval_runtime": 11.8366, "eval_samples_per_second": 1605.531, "eval_steps_per_second": 33.456, "step": 22500 }, { "epoch": 8.689788053949904, "grad_norm": 0.38814017176628113, "learning_rate": 0.00016524238921001926, "loss": 0.41523681640625, "step": 22550 }, { "epoch": 8.689788053949904, "eval_loss": 0.4679921269416809, "eval_runtime": 11.8399, "eval_samples_per_second": 1605.079, "eval_steps_per_second": 33.446, "step": 22550 }, { "epoch": 8.709055876685934, "grad_norm": 0.4841057360172272, "learning_rate": 0.00016516531791907515, "loss": 0.41702751159667967, "step": 22600 }, { "epoch": 8.709055876685934, "eval_loss": 0.478724867105484, "eval_runtime": 11.8427, "eval_samples_per_second": 1604.708, "eval_steps_per_second": 33.438, "step": 22600 }, { "epoch": 8.728323699421965, "grad_norm": 0.4495275020599365, "learning_rate": 0.00016508824662813103, "loss": 0.40450927734375, "step": 22650 }, { "epoch": 8.728323699421965, "eval_loss": 0.4694836139678955, "eval_runtime": 11.8252, "eval_samples_per_second": 1607.07, "eval_steps_per_second": 33.488, "step": 22650 }, { "epoch": 8.747591522157997, "grad_norm": 0.3836930990219116, "learning_rate": 0.00016501117533718692, "loss": 0.4162457275390625, "step": 22700 }, { "epoch": 8.747591522157997, "eval_loss": 0.4857344925403595, "eval_runtime": 11.8198, "eval_samples_per_second": 1607.807, "eval_steps_per_second": 33.503, "step": 22700 }, { "epoch": 8.766859344894026, "grad_norm": 0.3992312550544739, "learning_rate": 0.00016493410404624278, "loss": 0.4076694488525391, "step": 22750 }, { "epoch": 8.766859344894026, "eval_loss": 0.4768076539039612, "eval_runtime": 11.8545, "eval_samples_per_second": 1603.104, "eval_steps_per_second": 33.405, "step": 22750 }, { "epoch": 8.786127167630058, "grad_norm": 0.4430375099182129, "learning_rate": 0.00016485703275529866, "loss": 0.41414031982421873, "step": 22800 }, { "epoch": 8.786127167630058, "eval_loss": 0.48254674673080444, "eval_runtime": 11.8065, "eval_samples_per_second": 1609.618, "eval_steps_per_second": 33.541, "step": 22800 }, { "epoch": 8.80539499036609, "grad_norm": 0.44590723514556885, "learning_rate": 0.00016477996146435452, "loss": 0.41648590087890625, "step": 22850 }, { "epoch": 8.80539499036609, "eval_loss": 0.4813753366470337, "eval_runtime": 11.8118, "eval_samples_per_second": 1608.895, "eval_steps_per_second": 33.526, "step": 22850 }, { "epoch": 8.824662813102119, "grad_norm": 0.4111785590648651, "learning_rate": 0.0001647028901734104, "loss": 0.41525142669677734, "step": 22900 }, { "epoch": 8.824662813102119, "eval_loss": 0.481361448764801, "eval_runtime": 11.8187, "eval_samples_per_second": 1607.955, "eval_steps_per_second": 33.506, "step": 22900 }, { "epoch": 8.84393063583815, "grad_norm": 0.4120384454727173, "learning_rate": 0.0001646258188824663, "loss": 0.414000244140625, "step": 22950 }, { "epoch": 8.84393063583815, "eval_loss": 0.4847261905670166, "eval_runtime": 11.8173, "eval_samples_per_second": 1608.149, "eval_steps_per_second": 33.51, "step": 22950 }, { "epoch": 8.863198458574182, "grad_norm": 0.4167059361934662, "learning_rate": 0.00016454874759152217, "loss": 0.40938026428222657, "step": 23000 }, { "epoch": 8.863198458574182, "eval_loss": 0.4884645938873291, "eval_runtime": 11.8279, "eval_samples_per_second": 1606.71, "eval_steps_per_second": 33.48, "step": 23000 }, { "epoch": 8.882466281310212, "grad_norm": 0.42467638850212097, "learning_rate": 0.00016447167630057806, "loss": 0.41419937133789064, "step": 23050 }, { "epoch": 8.882466281310212, "eval_loss": 0.4819784164428711, "eval_runtime": 11.8324, "eval_samples_per_second": 1606.095, "eval_steps_per_second": 33.467, "step": 23050 }, { "epoch": 8.901734104046243, "grad_norm": 0.47403082251548767, "learning_rate": 0.00016439460500963392, "loss": 0.4155962371826172, "step": 23100 }, { "epoch": 8.901734104046243, "eval_loss": 0.47870004177093506, "eval_runtime": 11.8234, "eval_samples_per_second": 1607.319, "eval_steps_per_second": 33.493, "step": 23100 }, { "epoch": 8.921001926782274, "grad_norm": 0.4137413203716278, "learning_rate": 0.00016431753371868977, "loss": 0.4148798370361328, "step": 23150 }, { "epoch": 8.921001926782274, "eval_loss": 0.4811168313026428, "eval_runtime": 11.8238, "eval_samples_per_second": 1607.261, "eval_steps_per_second": 33.492, "step": 23150 }, { "epoch": 8.940269749518304, "grad_norm": 0.45216459035873413, "learning_rate": 0.00016424046242774566, "loss": 0.41280403137207033, "step": 23200 }, { "epoch": 8.940269749518304, "eval_loss": 0.480740487575531, "eval_runtime": 11.8477, "eval_samples_per_second": 1604.023, "eval_steps_per_second": 33.424, "step": 23200 }, { "epoch": 8.959537572254336, "grad_norm": 0.44058048725128174, "learning_rate": 0.00016416339113680154, "loss": 0.4123952865600586, "step": 23250 }, { "epoch": 8.959537572254336, "eval_loss": 0.4804123342037201, "eval_runtime": 11.8317, "eval_samples_per_second": 1606.198, "eval_steps_per_second": 33.469, "step": 23250 }, { "epoch": 8.978805394990367, "grad_norm": 0.5586961507797241, "learning_rate": 0.00016408631984585743, "loss": 0.41831024169921877, "step": 23300 }, { "epoch": 8.978805394990367, "eval_loss": 0.4817918539047241, "eval_runtime": 11.8552, "eval_samples_per_second": 1603.014, "eval_steps_per_second": 33.403, "step": 23300 }, { "epoch": 8.998073217726397, "grad_norm": 0.41329625248908997, "learning_rate": 0.0001640092485549133, "loss": 0.4114185333251953, "step": 23350 }, { "epoch": 8.998073217726397, "eval_loss": 0.4688608944416046, "eval_runtime": 11.836, "eval_samples_per_second": 1605.605, "eval_steps_per_second": 33.457, "step": 23350 }, { "epoch": 9.017341040462428, "grad_norm": 0.6255316138267517, "learning_rate": 0.0001639321772639692, "loss": 0.4092318725585937, "step": 23400 }, { "epoch": 9.017341040462428, "eval_loss": 0.4782598316669464, "eval_runtime": 11.8338, "eval_samples_per_second": 1605.909, "eval_steps_per_second": 33.463, "step": 23400 }, { "epoch": 9.036608863198458, "grad_norm": 0.43412476778030396, "learning_rate": 0.00016385510597302506, "loss": 0.4149618911743164, "step": 23450 }, { "epoch": 9.036608863198458, "eval_loss": 0.4690821170806885, "eval_runtime": 11.849, "eval_samples_per_second": 1603.849, "eval_steps_per_second": 33.421, "step": 23450 }, { "epoch": 9.05587668593449, "grad_norm": 0.454772412776947, "learning_rate": 0.00016377803468208094, "loss": 0.413372802734375, "step": 23500 }, { "epoch": 9.05587668593449, "eval_loss": 0.4748166799545288, "eval_runtime": 11.8273, "eval_samples_per_second": 1606.786, "eval_steps_per_second": 33.482, "step": 23500 }, { "epoch": 9.07514450867052, "grad_norm": 0.4365652799606323, "learning_rate": 0.0001637009633911368, "loss": 0.4145172882080078, "step": 23550 }, { "epoch": 9.07514450867052, "eval_loss": 0.4695042073726654, "eval_runtime": 11.8296, "eval_samples_per_second": 1606.479, "eval_steps_per_second": 33.475, "step": 23550 }, { "epoch": 9.09441233140655, "grad_norm": 0.44942450523376465, "learning_rate": 0.00016362389210019268, "loss": 0.412008056640625, "step": 23600 }, { "epoch": 9.09441233140655, "eval_loss": 0.4732353985309601, "eval_runtime": 11.8292, "eval_samples_per_second": 1606.529, "eval_steps_per_second": 33.476, "step": 23600 }, { "epoch": 9.113680154142582, "grad_norm": 0.5026931762695312, "learning_rate": 0.00016354682080924857, "loss": 0.40738094329833985, "step": 23650 }, { "epoch": 9.113680154142582, "eval_loss": 0.4798746109008789, "eval_runtime": 11.8511, "eval_samples_per_second": 1603.558, "eval_steps_per_second": 33.414, "step": 23650 }, { "epoch": 9.132947976878613, "grad_norm": 0.3945106267929077, "learning_rate": 0.00016346974951830445, "loss": 0.40823898315429685, "step": 23700 }, { "epoch": 9.132947976878613, "eval_loss": 0.47859516739845276, "eval_runtime": 11.8387, "eval_samples_per_second": 1605.25, "eval_steps_per_second": 33.45, "step": 23700 }, { "epoch": 9.152215799614643, "grad_norm": 0.40928465127944946, "learning_rate": 0.0001633926782273603, "loss": 0.4113623046875, "step": 23750 }, { "epoch": 9.152215799614643, "eval_loss": 0.4718487560749054, "eval_runtime": 11.8294, "eval_samples_per_second": 1606.507, "eval_steps_per_second": 33.476, "step": 23750 }, { "epoch": 9.171483622350674, "grad_norm": 0.3834016025066376, "learning_rate": 0.0001633156069364162, "loss": 0.4107219696044922, "step": 23800 }, { "epoch": 9.171483622350674, "eval_loss": 0.46343082189559937, "eval_runtime": 11.8564, "eval_samples_per_second": 1602.847, "eval_steps_per_second": 33.4, "step": 23800 }, { "epoch": 9.190751445086706, "grad_norm": 0.45830419659614563, "learning_rate": 0.00016323853564547208, "loss": 0.40842178344726565, "step": 23850 }, { "epoch": 9.190751445086706, "eval_loss": 0.4758710265159607, "eval_runtime": 11.8322, "eval_samples_per_second": 1606.124, "eval_steps_per_second": 33.468, "step": 23850 }, { "epoch": 9.210019267822736, "grad_norm": 0.48041674494743347, "learning_rate": 0.00016316146435452794, "loss": 0.412760009765625, "step": 23900 }, { "epoch": 9.210019267822736, "eval_loss": 0.4697442352771759, "eval_runtime": 11.8353, "eval_samples_per_second": 1605.701, "eval_steps_per_second": 33.459, "step": 23900 }, { "epoch": 9.229287090558767, "grad_norm": 0.37571513652801514, "learning_rate": 0.00016308439306358382, "loss": 0.40597091674804686, "step": 23950 }, { "epoch": 9.229287090558767, "eval_loss": 0.4725053906440735, "eval_runtime": 11.8248, "eval_samples_per_second": 1607.131, "eval_steps_per_second": 33.489, "step": 23950 }, { "epoch": 9.248554913294798, "grad_norm": 0.45188605785369873, "learning_rate": 0.0001630073217726397, "loss": 0.4082343292236328, "step": 24000 }, { "epoch": 9.248554913294798, "eval_loss": 0.48157864809036255, "eval_runtime": 11.85, "eval_samples_per_second": 1603.707, "eval_steps_per_second": 33.418, "step": 24000 }, { "epoch": 9.267822736030828, "grad_norm": 0.42944037914276123, "learning_rate": 0.00016293025048169557, "loss": 0.41132919311523436, "step": 24050 }, { "epoch": 9.267822736030828, "eval_loss": 0.46747833490371704, "eval_runtime": 11.8345, "eval_samples_per_second": 1605.81, "eval_steps_per_second": 33.461, "step": 24050 }, { "epoch": 9.28709055876686, "grad_norm": 0.35839805006980896, "learning_rate": 0.00016285317919075145, "loss": 0.41068252563476565, "step": 24100 }, { "epoch": 9.28709055876686, "eval_loss": 0.46876853704452515, "eval_runtime": 11.8346, "eval_samples_per_second": 1605.8, "eval_steps_per_second": 33.461, "step": 24100 }, { "epoch": 9.306358381502891, "grad_norm": 0.45749354362487793, "learning_rate": 0.00016277610789980734, "loss": 0.4038092041015625, "step": 24150 }, { "epoch": 9.306358381502891, "eval_loss": 0.4687473773956299, "eval_runtime": 11.8453, "eval_samples_per_second": 1604.35, "eval_steps_per_second": 33.431, "step": 24150 }, { "epoch": 9.32562620423892, "grad_norm": 0.4441518783569336, "learning_rate": 0.00016269903660886322, "loss": 0.4153568649291992, "step": 24200 }, { "epoch": 9.32562620423892, "eval_loss": 0.4687119722366333, "eval_runtime": 11.815, "eval_samples_per_second": 1608.459, "eval_steps_per_second": 33.517, "step": 24200 }, { "epoch": 9.344894026974952, "grad_norm": 0.47323504090309143, "learning_rate": 0.00016262196531791908, "loss": 0.4087883758544922, "step": 24250 }, { "epoch": 9.344894026974952, "eval_loss": 0.4738074839115143, "eval_runtime": 11.8317, "eval_samples_per_second": 1606.199, "eval_steps_per_second": 33.47, "step": 24250 }, { "epoch": 9.364161849710982, "grad_norm": 0.4212712347507477, "learning_rate": 0.00016254489402697496, "loss": 0.40656959533691406, "step": 24300 }, { "epoch": 9.364161849710982, "eval_loss": 0.4708568751811981, "eval_runtime": 11.8285, "eval_samples_per_second": 1606.629, "eval_steps_per_second": 33.478, "step": 24300 }, { "epoch": 9.383429672447013, "grad_norm": 0.4441240727901459, "learning_rate": 0.00016246782273603082, "loss": 0.41059867858886717, "step": 24350 }, { "epoch": 9.383429672447013, "eval_loss": 0.4619208574295044, "eval_runtime": 11.895, "eval_samples_per_second": 1597.64, "eval_steps_per_second": 33.291, "step": 24350 }, { "epoch": 9.402697495183045, "grad_norm": 0.4832783639431, "learning_rate": 0.0001623907514450867, "loss": 0.41122127532958985, "step": 24400 }, { "epoch": 9.402697495183045, "eval_loss": 0.4629591107368469, "eval_runtime": 11.8416, "eval_samples_per_second": 1604.845, "eval_steps_per_second": 33.441, "step": 24400 }, { "epoch": 9.421965317919074, "grad_norm": 0.4360071122646332, "learning_rate": 0.0001623136801541426, "loss": 0.4079976272583008, "step": 24450 }, { "epoch": 9.421965317919074, "eval_loss": 0.46647271513938904, "eval_runtime": 11.846, "eval_samples_per_second": 1604.249, "eval_steps_per_second": 33.429, "step": 24450 }, { "epoch": 9.441233140655106, "grad_norm": 0.4476543962955475, "learning_rate": 0.00016223660886319848, "loss": 0.41353984832763674, "step": 24500 }, { "epoch": 9.441233140655106, "eval_loss": 0.47116345167160034, "eval_runtime": 11.8365, "eval_samples_per_second": 1605.538, "eval_steps_per_second": 33.456, "step": 24500 }, { "epoch": 9.460500963391137, "grad_norm": 0.4163832664489746, "learning_rate": 0.00016215953757225436, "loss": 0.40807472229003905, "step": 24550 }, { "epoch": 9.460500963391137, "eval_loss": 0.46989259123802185, "eval_runtime": 11.8374, "eval_samples_per_second": 1605.426, "eval_steps_per_second": 33.453, "step": 24550 }, { "epoch": 9.479768786127167, "grad_norm": 0.38382259011268616, "learning_rate": 0.00016208246628131022, "loss": 0.4090992736816406, "step": 24600 }, { "epoch": 9.479768786127167, "eval_loss": 0.4787062704563141, "eval_runtime": 11.8251, "eval_samples_per_second": 1607.091, "eval_steps_per_second": 33.488, "step": 24600 }, { "epoch": 9.499036608863198, "grad_norm": 0.39221426844596863, "learning_rate": 0.00016200539499036608, "loss": 0.4091017150878906, "step": 24650 }, { "epoch": 9.499036608863198, "eval_loss": 0.4714788794517517, "eval_runtime": 11.8289, "eval_samples_per_second": 1606.577, "eval_steps_per_second": 33.477, "step": 24650 }, { "epoch": 9.51830443159923, "grad_norm": 0.44396892189979553, "learning_rate": 0.00016192832369942196, "loss": 0.4043901062011719, "step": 24700 }, { "epoch": 9.51830443159923, "eval_loss": 0.47141608595848083, "eval_runtime": 11.8486, "eval_samples_per_second": 1603.903, "eval_steps_per_second": 33.422, "step": 24700 }, { "epoch": 9.53757225433526, "grad_norm": 0.47057271003723145, "learning_rate": 0.00016185125240847785, "loss": 0.40705379486083987, "step": 24750 }, { "epoch": 9.53757225433526, "eval_loss": 0.4715209901332855, "eval_runtime": 11.822, "eval_samples_per_second": 1607.51, "eval_steps_per_second": 33.497, "step": 24750 }, { "epoch": 9.556840077071291, "grad_norm": 0.4481187164783478, "learning_rate": 0.00016177418111753373, "loss": 0.4067497253417969, "step": 24800 }, { "epoch": 9.556840077071291, "eval_loss": 0.46947142481803894, "eval_runtime": 11.819, "eval_samples_per_second": 1607.919, "eval_steps_per_second": 33.505, "step": 24800 }, { "epoch": 9.576107899807322, "grad_norm": 0.42132502794265747, "learning_rate": 0.00016169710982658962, "loss": 0.4080718231201172, "step": 24850 }, { "epoch": 9.576107899807322, "eval_loss": 0.46416646242141724, "eval_runtime": 11.8373, "eval_samples_per_second": 1605.431, "eval_steps_per_second": 33.454, "step": 24850 }, { "epoch": 9.595375722543352, "grad_norm": 0.46884071826934814, "learning_rate": 0.0001616200385356455, "loss": 0.41091442108154297, "step": 24900 }, { "epoch": 9.595375722543352, "eval_loss": 0.47043415904045105, "eval_runtime": 11.8334, "eval_samples_per_second": 1605.962, "eval_steps_per_second": 33.465, "step": 24900 }, { "epoch": 9.614643545279383, "grad_norm": 0.42876559495925903, "learning_rate": 0.00016154296724470136, "loss": 0.4116065979003906, "step": 24950 }, { "epoch": 9.614643545279383, "eval_loss": 0.4717128872871399, "eval_runtime": 11.8072, "eval_samples_per_second": 1609.524, "eval_steps_per_second": 33.539, "step": 24950 }, { "epoch": 9.633911368015415, "grad_norm": 0.45604047179222107, "learning_rate": 0.00016146589595375722, "loss": 0.41203697204589845, "step": 25000 }, { "epoch": 9.633911368015415, "eval_loss": 0.4801386296749115, "eval_runtime": 11.8238, "eval_samples_per_second": 1607.271, "eval_steps_per_second": 33.492, "step": 25000 }, { "epoch": 9.653179190751445, "grad_norm": 0.4222913384437561, "learning_rate": 0.0001613888246628131, "loss": 0.41199588775634766, "step": 25050 }, { "epoch": 9.653179190751445, "eval_loss": 0.470223993062973, "eval_runtime": 11.8333, "eval_samples_per_second": 1605.98, "eval_steps_per_second": 33.465, "step": 25050 }, { "epoch": 9.672447013487476, "grad_norm": 0.4078138470649719, "learning_rate": 0.00016131175337186899, "loss": 0.41151191711425783, "step": 25100 }, { "epoch": 9.672447013487476, "eval_loss": 0.46550899744033813, "eval_runtime": 11.8314, "eval_samples_per_second": 1606.236, "eval_steps_per_second": 33.47, "step": 25100 }, { "epoch": 9.691714836223507, "grad_norm": 0.3877038359642029, "learning_rate": 0.00016123468208092487, "loss": 0.41262237548828123, "step": 25150 }, { "epoch": 9.691714836223507, "eval_loss": 0.4639965295791626, "eval_runtime": 11.8248, "eval_samples_per_second": 1607.13, "eval_steps_per_second": 33.489, "step": 25150 }, { "epoch": 9.710982658959537, "grad_norm": 0.4259389042854309, "learning_rate": 0.00016115761078998076, "loss": 0.41023372650146483, "step": 25200 }, { "epoch": 9.710982658959537, "eval_loss": 0.4671492874622345, "eval_runtime": 11.8258, "eval_samples_per_second": 1606.997, "eval_steps_per_second": 33.486, "step": 25200 }, { "epoch": 9.730250481695569, "grad_norm": 0.42498835921287537, "learning_rate": 0.00016108053949903661, "loss": 0.40017219543457033, "step": 25250 }, { "epoch": 9.730250481695569, "eval_loss": 0.4663718640804291, "eval_runtime": 11.8276, "eval_samples_per_second": 1606.755, "eval_steps_per_second": 33.481, "step": 25250 }, { "epoch": 9.7495183044316, "grad_norm": 0.3901697099208832, "learning_rate": 0.0001610034682080925, "loss": 0.4024842834472656, "step": 25300 }, { "epoch": 9.7495183044316, "eval_loss": 0.47596317529678345, "eval_runtime": 11.8834, "eval_samples_per_second": 1599.209, "eval_steps_per_second": 33.324, "step": 25300 }, { "epoch": 9.76878612716763, "grad_norm": 0.45097339153289795, "learning_rate": 0.00016092639691714836, "loss": 0.410355339050293, "step": 25350 }, { "epoch": 9.76878612716763, "eval_loss": 0.46011248230934143, "eval_runtime": 11.8215, "eval_samples_per_second": 1607.578, "eval_steps_per_second": 33.498, "step": 25350 }, { "epoch": 9.788053949903661, "grad_norm": 0.3664410412311554, "learning_rate": 0.00016084932562620424, "loss": 0.41034603118896484, "step": 25400 }, { "epoch": 9.788053949903661, "eval_loss": 0.46679604053497314, "eval_runtime": 11.839, "eval_samples_per_second": 1605.208, "eval_steps_per_second": 33.449, "step": 25400 }, { "epoch": 9.807321772639693, "grad_norm": 0.37220489978790283, "learning_rate": 0.00016077225433526013, "loss": 0.4085308837890625, "step": 25450 }, { "epoch": 9.807321772639693, "eval_loss": 0.4587947130203247, "eval_runtime": 11.816, "eval_samples_per_second": 1608.322, "eval_steps_per_second": 33.514, "step": 25450 }, { "epoch": 9.826589595375722, "grad_norm": 0.4264661371707916, "learning_rate": 0.00016069518304431598, "loss": 0.4048243713378906, "step": 25500 }, { "epoch": 9.826589595375722, "eval_loss": 0.4682500958442688, "eval_runtime": 11.8237, "eval_samples_per_second": 1607.278, "eval_steps_per_second": 33.492, "step": 25500 }, { "epoch": 9.845857418111754, "grad_norm": 0.4014965891838074, "learning_rate": 0.00016061811175337187, "loss": 0.40574337005615235, "step": 25550 }, { "epoch": 9.845857418111754, "eval_loss": 0.46454569697380066, "eval_runtime": 11.8384, "eval_samples_per_second": 1605.28, "eval_steps_per_second": 33.45, "step": 25550 }, { "epoch": 9.865125240847783, "grad_norm": 0.4170167148113251, "learning_rate": 0.00016054104046242775, "loss": 0.4041874694824219, "step": 25600 }, { "epoch": 9.865125240847783, "eval_loss": 0.46875709295272827, "eval_runtime": 11.8462, "eval_samples_per_second": 1604.228, "eval_steps_per_second": 33.428, "step": 25600 }, { "epoch": 9.884393063583815, "grad_norm": 0.3853864371776581, "learning_rate": 0.00016046396917148364, "loss": 0.40357906341552735, "step": 25650 }, { "epoch": 9.884393063583815, "eval_loss": 0.4598325788974762, "eval_runtime": 11.8285, "eval_samples_per_second": 1606.635, "eval_steps_per_second": 33.479, "step": 25650 }, { "epoch": 9.903660886319846, "grad_norm": 0.4253461956977844, "learning_rate": 0.00016038689788053952, "loss": 0.4060938262939453, "step": 25700 }, { "epoch": 9.903660886319846, "eval_loss": 0.46498289704322815, "eval_runtime": 11.8101, "eval_samples_per_second": 1609.127, "eval_steps_per_second": 33.531, "step": 25700 }, { "epoch": 9.922928709055876, "grad_norm": 0.4131441116333008, "learning_rate": 0.00016030982658959538, "loss": 0.4084878158569336, "step": 25750 }, { "epoch": 9.922928709055876, "eval_loss": 0.4679907560348511, "eval_runtime": 11.8279, "eval_samples_per_second": 1606.709, "eval_steps_per_second": 33.48, "step": 25750 }, { "epoch": 9.942196531791907, "grad_norm": 0.4472554326057434, "learning_rate": 0.00016023275529865124, "loss": 0.40047290802001956, "step": 25800 }, { "epoch": 9.942196531791907, "eval_loss": 0.4614337384700775, "eval_runtime": 11.8335, "eval_samples_per_second": 1605.949, "eval_steps_per_second": 33.464, "step": 25800 }, { "epoch": 9.961464354527939, "grad_norm": 0.43350842595100403, "learning_rate": 0.00016015568400770712, "loss": 0.3982276916503906, "step": 25850 }, { "epoch": 9.961464354527939, "eval_loss": 0.46572205424308777, "eval_runtime": 11.8352, "eval_samples_per_second": 1605.714, "eval_steps_per_second": 33.459, "step": 25850 }, { "epoch": 9.980732177263969, "grad_norm": 0.4463820457458496, "learning_rate": 0.000160078612716763, "loss": 0.4048011016845703, "step": 25900 }, { "epoch": 9.980732177263969, "eval_loss": 0.4599461257457733, "eval_runtime": 11.8376, "eval_samples_per_second": 1605.387, "eval_steps_per_second": 33.453, "step": 25900 }, { "epoch": 10.0, "grad_norm": 0.4412059783935547, "learning_rate": 0.0001600015414258189, "loss": 0.40637882232666017, "step": 25950 }, { "epoch": 10.0, "eval_loss": 0.4636792242527008, "eval_runtime": 11.8449, "eval_samples_per_second": 1604.409, "eval_steps_per_second": 33.432, "step": 25950 }, { "epoch": 10.019267822736031, "grad_norm": 0.40187644958496094, "learning_rate": 0.00015992447013487478, "loss": 0.39861824035644533, "step": 26000 }, { "epoch": 10.019267822736031, "eval_loss": 0.4661506414413452, "eval_runtime": 11.8363, "eval_samples_per_second": 1605.568, "eval_steps_per_second": 33.456, "step": 26000 }, { "epoch": 10.038535645472061, "grad_norm": 0.382499635219574, "learning_rate": 0.00015984739884393066, "loss": 0.40142417907714845, "step": 26050 }, { "epoch": 10.038535645472061, "eval_loss": 0.47049078345298767, "eval_runtime": 11.8229, "eval_samples_per_second": 1607.388, "eval_steps_per_second": 33.494, "step": 26050 }, { "epoch": 10.057803468208093, "grad_norm": 0.45124533772468567, "learning_rate": 0.00015977032755298652, "loss": 0.40590728759765626, "step": 26100 }, { "epoch": 10.057803468208093, "eval_loss": 0.4672909080982208, "eval_runtime": 11.8539, "eval_samples_per_second": 1603.184, "eval_steps_per_second": 33.407, "step": 26100 }, { "epoch": 10.077071290944124, "grad_norm": 0.4059544503688812, "learning_rate": 0.00015969325626204238, "loss": 0.40190582275390624, "step": 26150 }, { "epoch": 10.077071290944124, "eval_loss": 0.46770283579826355, "eval_runtime": 11.8437, "eval_samples_per_second": 1604.565, "eval_steps_per_second": 33.435, "step": 26150 }, { "epoch": 10.096339113680154, "grad_norm": 0.39346814155578613, "learning_rate": 0.00015961618497109826, "loss": 0.40577289581298825, "step": 26200 }, { "epoch": 10.096339113680154, "eval_loss": 0.4672413766384125, "eval_runtime": 11.8358, "eval_samples_per_second": 1605.633, "eval_steps_per_second": 33.458, "step": 26200 }, { "epoch": 10.115606936416185, "grad_norm": 0.4337371289730072, "learning_rate": 0.00015953911368015415, "loss": 0.3984918212890625, "step": 26250 }, { "epoch": 10.115606936416185, "eval_loss": 0.4577603340148926, "eval_runtime": 11.8532, "eval_samples_per_second": 1603.284, "eval_steps_per_second": 33.409, "step": 26250 }, { "epoch": 10.134874759152217, "grad_norm": 0.4112143814563751, "learning_rate": 0.00015946204238921003, "loss": 0.40789505004882814, "step": 26300 }, { "epoch": 10.134874759152217, "eval_loss": 0.4621376097202301, "eval_runtime": 11.8331, "eval_samples_per_second": 1606.004, "eval_steps_per_second": 33.465, "step": 26300 }, { "epoch": 10.154142581888246, "grad_norm": 0.49528443813323975, "learning_rate": 0.00015938497109826592, "loss": 0.40205223083496094, "step": 26350 }, { "epoch": 10.154142581888246, "eval_loss": 0.4683309495449066, "eval_runtime": 11.8364, "eval_samples_per_second": 1605.561, "eval_steps_per_second": 33.456, "step": 26350 }, { "epoch": 10.173410404624278, "grad_norm": 0.4023872911930084, "learning_rate": 0.00015930789980732178, "loss": 0.4078504943847656, "step": 26400 }, { "epoch": 10.173410404624278, "eval_loss": 0.46704596281051636, "eval_runtime": 11.8283, "eval_samples_per_second": 1606.649, "eval_steps_per_second": 33.479, "step": 26400 }, { "epoch": 10.19267822736031, "grad_norm": 0.473865270614624, "learning_rate": 0.00015923082851637766, "loss": 0.40197998046875, "step": 26450 }, { "epoch": 10.19267822736031, "eval_loss": 0.4710236191749573, "eval_runtime": 11.8494, "eval_samples_per_second": 1603.8, "eval_steps_per_second": 33.42, "step": 26450 }, { "epoch": 10.211946050096339, "grad_norm": 0.4372361898422241, "learning_rate": 0.00015915375722543352, "loss": 0.4052556610107422, "step": 26500 }, { "epoch": 10.211946050096339, "eval_loss": 0.4604220986366272, "eval_runtime": 11.8497, "eval_samples_per_second": 1603.755, "eval_steps_per_second": 33.419, "step": 26500 }, { "epoch": 10.23121387283237, "grad_norm": 0.46354690194129944, "learning_rate": 0.0001590766859344894, "loss": 0.4015184020996094, "step": 26550 }, { "epoch": 10.23121387283237, "eval_loss": 0.4697933793067932, "eval_runtime": 11.8301, "eval_samples_per_second": 1606.414, "eval_steps_per_second": 33.474, "step": 26550 }, { "epoch": 10.2504816955684, "grad_norm": 0.3806550204753876, "learning_rate": 0.0001589996146435453, "loss": 0.40570980072021484, "step": 26600 }, { "epoch": 10.2504816955684, "eval_loss": 0.45122602581977844, "eval_runtime": 11.8422, "eval_samples_per_second": 1604.771, "eval_steps_per_second": 33.44, "step": 26600 }, { "epoch": 10.269749518304431, "grad_norm": 0.49780625104904175, "learning_rate": 0.00015892254335260117, "loss": 0.4019667816162109, "step": 26650 }, { "epoch": 10.269749518304431, "eval_loss": 0.46228358149528503, "eval_runtime": 11.8524, "eval_samples_per_second": 1603.391, "eval_steps_per_second": 33.411, "step": 26650 }, { "epoch": 10.289017341040463, "grad_norm": 0.4198426604270935, "learning_rate": 0.00015884547206165703, "loss": 0.4046209716796875, "step": 26700 }, { "epoch": 10.289017341040463, "eval_loss": 0.45990902185440063, "eval_runtime": 11.8407, "eval_samples_per_second": 1604.966, "eval_steps_per_second": 33.444, "step": 26700 }, { "epoch": 10.308285163776493, "grad_norm": 0.41785699129104614, "learning_rate": 0.00015876840077071292, "loss": 0.40707313537597656, "step": 26750 }, { "epoch": 10.308285163776493, "eval_loss": 0.4654827117919922, "eval_runtime": 11.8395, "eval_samples_per_second": 1605.134, "eval_steps_per_second": 33.447, "step": 26750 }, { "epoch": 10.327552986512524, "grad_norm": 0.443072110414505, "learning_rate": 0.0001586913294797688, "loss": 0.4066902160644531, "step": 26800 }, { "epoch": 10.327552986512524, "eval_loss": 0.4651105999946594, "eval_runtime": 11.8998, "eval_samples_per_second": 1596.995, "eval_steps_per_second": 33.278, "step": 26800 }, { "epoch": 10.346820809248555, "grad_norm": 0.4295455813407898, "learning_rate": 0.00015861425818882466, "loss": 0.40585357666015626, "step": 26850 }, { "epoch": 10.346820809248555, "eval_loss": 0.4560443162918091, "eval_runtime": 11.8359, "eval_samples_per_second": 1605.626, "eval_steps_per_second": 33.458, "step": 26850 }, { "epoch": 10.366088631984585, "grad_norm": 0.4484204649925232, "learning_rate": 0.00015853718689788054, "loss": 0.3990660858154297, "step": 26900 }, { "epoch": 10.366088631984585, "eval_loss": 0.4632647633552551, "eval_runtime": 11.8431, "eval_samples_per_second": 1604.645, "eval_steps_per_second": 33.437, "step": 26900 }, { "epoch": 10.385356454720617, "grad_norm": 0.4101542830467224, "learning_rate": 0.00015846011560693643, "loss": 0.40240936279296874, "step": 26950 }, { "epoch": 10.385356454720617, "eval_loss": 0.4672456979751587, "eval_runtime": 11.8338, "eval_samples_per_second": 1605.905, "eval_steps_per_second": 33.463, "step": 26950 }, { "epoch": 10.404624277456648, "grad_norm": 0.48129507899284363, "learning_rate": 0.0001583830443159923, "loss": 0.3985650253295898, "step": 27000 }, { "epoch": 10.404624277456648, "eval_loss": 0.4665919840335846, "eval_runtime": 11.8229, "eval_samples_per_second": 1607.391, "eval_steps_per_second": 33.494, "step": 27000 }, { "epoch": 10.423892100192678, "grad_norm": 0.4118266701698303, "learning_rate": 0.00015830597302504817, "loss": 0.40241561889648436, "step": 27050 }, { "epoch": 10.423892100192678, "eval_loss": 0.4593660831451416, "eval_runtime": 11.8297, "eval_samples_per_second": 1606.459, "eval_steps_per_second": 33.475, "step": 27050 }, { "epoch": 10.443159922928709, "grad_norm": 0.42279183864593506, "learning_rate": 0.00015822890173410406, "loss": 0.40591514587402344, "step": 27100 }, { "epoch": 10.443159922928709, "eval_loss": 0.4693402349948883, "eval_runtime": 11.8348, "eval_samples_per_second": 1605.775, "eval_steps_per_second": 33.461, "step": 27100 }, { "epoch": 10.46242774566474, "grad_norm": 0.49944305419921875, "learning_rate": 0.00015815183044315994, "loss": 0.3978759002685547, "step": 27150 }, { "epoch": 10.46242774566474, "eval_loss": 0.46136343479156494, "eval_runtime": 11.8463, "eval_samples_per_second": 1604.212, "eval_steps_per_second": 33.428, "step": 27150 }, { "epoch": 10.48169556840077, "grad_norm": 0.42541977763175964, "learning_rate": 0.0001580747591522158, "loss": 0.40185195922851563, "step": 27200 }, { "epoch": 10.48169556840077, "eval_loss": 0.451945036649704, "eval_runtime": 11.8525, "eval_samples_per_second": 1603.379, "eval_steps_per_second": 33.411, "step": 27200 }, { "epoch": 10.500963391136802, "grad_norm": 0.37363845109939575, "learning_rate": 0.00015799768786127168, "loss": 0.4022100067138672, "step": 27250 }, { "epoch": 10.500963391136802, "eval_loss": 0.456046462059021, "eval_runtime": 11.8455, "eval_samples_per_second": 1604.316, "eval_steps_per_second": 33.43, "step": 27250 }, { "epoch": 10.520231213872833, "grad_norm": 0.45829853415489197, "learning_rate": 0.00015792061657032754, "loss": 0.4011544418334961, "step": 27300 }, { "epoch": 10.520231213872833, "eval_loss": 0.471386581659317, "eval_runtime": 11.8372, "eval_samples_per_second": 1605.449, "eval_steps_per_second": 33.454, "step": 27300 }, { "epoch": 10.539499036608863, "grad_norm": 0.36931806802749634, "learning_rate": 0.00015784354527938343, "loss": 0.3984589767456055, "step": 27350 }, { "epoch": 10.539499036608863, "eval_loss": 0.4568248391151428, "eval_runtime": 11.8322, "eval_samples_per_second": 1606.123, "eval_steps_per_second": 33.468, "step": 27350 }, { "epoch": 10.558766859344894, "grad_norm": 0.3638107478618622, "learning_rate": 0.0001577664739884393, "loss": 0.3960812759399414, "step": 27400 }, { "epoch": 10.558766859344894, "eval_loss": 0.45025575160980225, "eval_runtime": 11.8255, "eval_samples_per_second": 1607.037, "eval_steps_per_second": 33.487, "step": 27400 }, { "epoch": 10.578034682080926, "grad_norm": 0.4933474063873291, "learning_rate": 0.0001576894026974952, "loss": 0.4001067352294922, "step": 27450 }, { "epoch": 10.578034682080926, "eval_loss": 0.4641176462173462, "eval_runtime": 11.8346, "eval_samples_per_second": 1605.798, "eval_steps_per_second": 33.461, "step": 27450 }, { "epoch": 10.597302504816955, "grad_norm": 0.4384477138519287, "learning_rate": 0.00015761233140655108, "loss": 0.40402820587158206, "step": 27500 }, { "epoch": 10.597302504816955, "eval_loss": 0.4567807912826538, "eval_runtime": 11.8451, "eval_samples_per_second": 1604.377, "eval_steps_per_second": 33.432, "step": 27500 }, { "epoch": 10.616570327552987, "grad_norm": 0.44720911979675293, "learning_rate": 0.00015753526011560697, "loss": 0.40342193603515625, "step": 27550 }, { "epoch": 10.616570327552987, "eval_loss": 0.4566930830478668, "eval_runtime": 11.8416, "eval_samples_per_second": 1604.846, "eval_steps_per_second": 33.441, "step": 27550 }, { "epoch": 10.635838150289018, "grad_norm": 0.42635756731033325, "learning_rate": 0.00015745818882466282, "loss": 0.4016175079345703, "step": 27600 }, { "epoch": 10.635838150289018, "eval_loss": 0.4609306752681732, "eval_runtime": 11.85, "eval_samples_per_second": 1603.718, "eval_steps_per_second": 33.418, "step": 27600 }, { "epoch": 10.655105973025048, "grad_norm": 0.4518902897834778, "learning_rate": 0.00015738111753371868, "loss": 0.400330696105957, "step": 27650 }, { "epoch": 10.655105973025048, "eval_loss": 0.44802841544151306, "eval_runtime": 11.8328, "eval_samples_per_second": 1606.043, "eval_steps_per_second": 33.466, "step": 27650 }, { "epoch": 10.67437379576108, "grad_norm": 0.4460395276546478, "learning_rate": 0.00015730404624277457, "loss": 0.39486454010009764, "step": 27700 }, { "epoch": 10.67437379576108, "eval_loss": 0.454080194234848, "eval_runtime": 11.8498, "eval_samples_per_second": 1603.734, "eval_steps_per_second": 33.418, "step": 27700 }, { "epoch": 10.693641618497109, "grad_norm": 0.3871495723724365, "learning_rate": 0.00015722697495183045, "loss": 0.40095870971679687, "step": 27750 }, { "epoch": 10.693641618497109, "eval_loss": 0.45006752014160156, "eval_runtime": 11.8485, "eval_samples_per_second": 1603.922, "eval_steps_per_second": 33.422, "step": 27750 }, { "epoch": 10.71290944123314, "grad_norm": 0.4391438364982605, "learning_rate": 0.00015714990366088634, "loss": 0.4006927871704102, "step": 27800 }, { "epoch": 10.71290944123314, "eval_loss": 0.4593578577041626, "eval_runtime": 11.8389, "eval_samples_per_second": 1605.218, "eval_steps_per_second": 33.449, "step": 27800 }, { "epoch": 10.732177263969172, "grad_norm": 0.49551302194595337, "learning_rate": 0.00015707283236994222, "loss": 0.4003697204589844, "step": 27850 }, { "epoch": 10.732177263969172, "eval_loss": 0.45575714111328125, "eval_runtime": 11.845, "eval_samples_per_second": 1604.383, "eval_steps_per_second": 33.432, "step": 27850 }, { "epoch": 10.751445086705202, "grad_norm": 0.4282897710800171, "learning_rate": 0.00015699576107899808, "loss": 0.3995050048828125, "step": 27900 }, { "epoch": 10.751445086705202, "eval_loss": 0.45492056012153625, "eval_runtime": 11.8366, "eval_samples_per_second": 1605.528, "eval_steps_per_second": 33.456, "step": 27900 }, { "epoch": 10.770712909441233, "grad_norm": 0.41727596521377563, "learning_rate": 0.00015691868978805396, "loss": 0.39996688842773437, "step": 27950 }, { "epoch": 10.770712909441233, "eval_loss": 0.4659992456436157, "eval_runtime": 11.8388, "eval_samples_per_second": 1605.236, "eval_steps_per_second": 33.449, "step": 27950 }, { "epoch": 10.789980732177264, "grad_norm": 0.3512781858444214, "learning_rate": 0.00015684161849710982, "loss": 0.4015879821777344, "step": 28000 }, { "epoch": 10.789980732177264, "eval_loss": 0.4524978697299957, "eval_runtime": 11.8207, "eval_samples_per_second": 1607.684, "eval_steps_per_second": 33.5, "step": 28000 }, { "epoch": 10.809248554913294, "grad_norm": 0.40848442912101746, "learning_rate": 0.0001567645472061657, "loss": 0.4001332855224609, "step": 28050 }, { "epoch": 10.809248554913294, "eval_loss": 0.454598993062973, "eval_runtime": 11.8452, "eval_samples_per_second": 1604.368, "eval_steps_per_second": 33.431, "step": 28050 }, { "epoch": 10.828516377649326, "grad_norm": 0.4510248899459839, "learning_rate": 0.0001566874759152216, "loss": 0.4017295837402344, "step": 28100 }, { "epoch": 10.828516377649326, "eval_loss": 0.45360085368156433, "eval_runtime": 11.8374, "eval_samples_per_second": 1605.417, "eval_steps_per_second": 33.453, "step": 28100 }, { "epoch": 10.847784200385357, "grad_norm": 0.38292381167411804, "learning_rate": 0.00015661040462427748, "loss": 0.40094417572021485, "step": 28150 }, { "epoch": 10.847784200385357, "eval_loss": 0.44876453280448914, "eval_runtime": 11.8414, "eval_samples_per_second": 1604.877, "eval_steps_per_second": 33.442, "step": 28150 }, { "epoch": 10.867052023121387, "grad_norm": 0.4419468641281128, "learning_rate": 0.00015653333333333333, "loss": 0.40042686462402344, "step": 28200 }, { "epoch": 10.867052023121387, "eval_loss": 0.44937989115715027, "eval_runtime": 11.8396, "eval_samples_per_second": 1605.122, "eval_steps_per_second": 33.447, "step": 28200 }, { "epoch": 10.886319845857418, "grad_norm": 0.41495048999786377, "learning_rate": 0.00015645626204238922, "loss": 0.39489967346191407, "step": 28250 }, { "epoch": 10.886319845857418, "eval_loss": 0.4553990960121155, "eval_runtime": 11.8634, "eval_samples_per_second": 1601.901, "eval_steps_per_second": 33.38, "step": 28250 }, { "epoch": 10.90558766859345, "grad_norm": 0.4330084025859833, "learning_rate": 0.0001563791907514451, "loss": 0.40193267822265627, "step": 28300 }, { "epoch": 10.90558766859345, "eval_loss": 0.44999366998672485, "eval_runtime": 11.8455, "eval_samples_per_second": 1604.329, "eval_steps_per_second": 33.431, "step": 28300 }, { "epoch": 10.92485549132948, "grad_norm": 0.3740021288394928, "learning_rate": 0.00015630211946050096, "loss": 0.3945880126953125, "step": 28350 }, { "epoch": 10.92485549132948, "eval_loss": 0.45337289571762085, "eval_runtime": 11.8669, "eval_samples_per_second": 1601.435, "eval_steps_per_second": 33.37, "step": 28350 }, { "epoch": 10.94412331406551, "grad_norm": 0.3932056725025177, "learning_rate": 0.00015622504816955685, "loss": 0.4012223434448242, "step": 28400 }, { "epoch": 10.94412331406551, "eval_loss": 0.4608412981033325, "eval_runtime": 11.8352, "eval_samples_per_second": 1605.725, "eval_steps_per_second": 33.46, "step": 28400 }, { "epoch": 10.963391136801542, "grad_norm": 0.341951847076416, "learning_rate": 0.00015614797687861273, "loss": 0.3941073226928711, "step": 28450 }, { "epoch": 10.963391136801542, "eval_loss": 0.46346718072891235, "eval_runtime": 11.85, "eval_samples_per_second": 1603.718, "eval_steps_per_second": 33.418, "step": 28450 }, { "epoch": 10.982658959537572, "grad_norm": 0.46905791759490967, "learning_rate": 0.0001560709055876686, "loss": 0.4006522750854492, "step": 28500 }, { "epoch": 10.982658959537572, "eval_loss": 0.4450742304325104, "eval_runtime": 11.8489, "eval_samples_per_second": 1603.863, "eval_steps_per_second": 33.421, "step": 28500 }, { "epoch": 11.001926782273603, "grad_norm": 0.4140762984752655, "learning_rate": 0.00015599383429672447, "loss": 0.3958004379272461, "step": 28550 }, { "epoch": 11.001926782273603, "eval_loss": 0.4575416147708893, "eval_runtime": 11.9272, "eval_samples_per_second": 1593.328, "eval_steps_per_second": 33.201, "step": 28550 }, { "epoch": 11.021194605009635, "grad_norm": 0.48300179839134216, "learning_rate": 0.00015591676300578036, "loss": 0.39445713043212893, "step": 28600 }, { "epoch": 11.021194605009635, "eval_loss": 0.4553254544734955, "eval_runtime": 11.8618, "eval_samples_per_second": 1602.112, "eval_steps_per_second": 33.384, "step": 28600 }, { "epoch": 11.040462427745664, "grad_norm": 0.39319923520088196, "learning_rate": 0.00015583969171483624, "loss": 0.3942254638671875, "step": 28650 }, { "epoch": 11.040462427745664, "eval_loss": 0.45360320806503296, "eval_runtime": 11.8484, "eval_samples_per_second": 1603.926, "eval_steps_per_second": 33.422, "step": 28650 }, { "epoch": 11.059730250481696, "grad_norm": 0.4254665672779083, "learning_rate": 0.0001557626204238921, "loss": 0.39918846130371094, "step": 28700 }, { "epoch": 11.059730250481696, "eval_loss": 0.4591009318828583, "eval_runtime": 11.8428, "eval_samples_per_second": 1604.686, "eval_steps_per_second": 33.438, "step": 28700 }, { "epoch": 11.078998073217726, "grad_norm": 0.3895827531814575, "learning_rate": 0.000155685549132948, "loss": 0.3960301971435547, "step": 28750 }, { "epoch": 11.078998073217726, "eval_loss": 0.4601430594921112, "eval_runtime": 11.8391, "eval_samples_per_second": 1605.195, "eval_steps_per_second": 33.449, "step": 28750 }, { "epoch": 11.098265895953757, "grad_norm": 0.35536444187164307, "learning_rate": 0.00015560847784200384, "loss": 0.3967865753173828, "step": 28800 }, { "epoch": 11.098265895953757, "eval_loss": 0.4600837528705597, "eval_runtime": 11.86, "eval_samples_per_second": 1602.364, "eval_steps_per_second": 33.39, "step": 28800 }, { "epoch": 11.117533718689788, "grad_norm": 0.3685913383960724, "learning_rate": 0.00015553140655105973, "loss": 0.3936845016479492, "step": 28850 }, { "epoch": 11.117533718689788, "eval_loss": 0.45744091272354126, "eval_runtime": 11.8414, "eval_samples_per_second": 1604.879, "eval_steps_per_second": 33.442, "step": 28850 }, { "epoch": 11.136801541425818, "grad_norm": 0.3967703580856323, "learning_rate": 0.00015545433526011561, "loss": 0.39264434814453125, "step": 28900 }, { "epoch": 11.136801541425818, "eval_loss": 0.46828731894493103, "eval_runtime": 11.8626, "eval_samples_per_second": 1602.011, "eval_steps_per_second": 33.382, "step": 28900 }, { "epoch": 11.15606936416185, "grad_norm": 0.43711912631988525, "learning_rate": 0.0001553772639691715, "loss": 0.39709461212158204, "step": 28950 }, { "epoch": 11.15606936416185, "eval_loss": 0.456818550825119, "eval_runtime": 11.8396, "eval_samples_per_second": 1605.123, "eval_steps_per_second": 33.447, "step": 28950 }, { "epoch": 11.175337186897881, "grad_norm": 0.43483173847198486, "learning_rate": 0.00015530019267822738, "loss": 0.39340446472167967, "step": 29000 }, { "epoch": 11.175337186897881, "eval_loss": 0.4510941803455353, "eval_runtime": 11.8413, "eval_samples_per_second": 1604.885, "eval_steps_per_second": 33.442, "step": 29000 }, { "epoch": 11.19460500963391, "grad_norm": 0.3896440863609314, "learning_rate": 0.00015522312138728324, "loss": 0.3936733627319336, "step": 29050 }, { "epoch": 11.19460500963391, "eval_loss": 0.4654483199119568, "eval_runtime": 11.8493, "eval_samples_per_second": 1603.805, "eval_steps_per_second": 33.42, "step": 29050 }, { "epoch": 11.213872832369942, "grad_norm": 0.3861600160598755, "learning_rate": 0.0001551460500963391, "loss": 0.3980462646484375, "step": 29100 }, { "epoch": 11.213872832369942, "eval_loss": 0.4543946385383606, "eval_runtime": 11.8475, "eval_samples_per_second": 1604.047, "eval_steps_per_second": 33.425, "step": 29100 }, { "epoch": 11.233140655105974, "grad_norm": 0.4006408154964447, "learning_rate": 0.00015506897880539498, "loss": 0.39612014770507814, "step": 29150 }, { "epoch": 11.233140655105974, "eval_loss": 0.4504699110984802, "eval_runtime": 11.8405, "eval_samples_per_second": 1605.003, "eval_steps_per_second": 33.445, "step": 29150 }, { "epoch": 11.252408477842003, "grad_norm": 0.36671844124794006, "learning_rate": 0.00015499190751445087, "loss": 0.39465267181396485, "step": 29200 }, { "epoch": 11.252408477842003, "eval_loss": 0.44224387407302856, "eval_runtime": 11.8606, "eval_samples_per_second": 1602.285, "eval_steps_per_second": 33.388, "step": 29200 }, { "epoch": 11.271676300578035, "grad_norm": 0.3842998147010803, "learning_rate": 0.00015491483622350675, "loss": 0.38602821350097655, "step": 29250 }, { "epoch": 11.271676300578035, "eval_loss": 0.44937044382095337, "eval_runtime": 11.847, "eval_samples_per_second": 1604.115, "eval_steps_per_second": 33.426, "step": 29250 }, { "epoch": 11.290944123314066, "grad_norm": 0.46864649653434753, "learning_rate": 0.00015483776493256264, "loss": 0.38969112396240235, "step": 29300 }, { "epoch": 11.290944123314066, "eval_loss": 0.44569677114486694, "eval_runtime": 11.8346, "eval_samples_per_second": 1605.803, "eval_steps_per_second": 33.461, "step": 29300 }, { "epoch": 11.310211946050096, "grad_norm": 0.4405817985534668, "learning_rate": 0.00015476069364161852, "loss": 0.3971552276611328, "step": 29350 }, { "epoch": 11.310211946050096, "eval_loss": 0.45203521847724915, "eval_runtime": 11.861, "eval_samples_per_second": 1602.228, "eval_steps_per_second": 33.387, "step": 29350 }, { "epoch": 11.329479768786127, "grad_norm": 0.41280612349510193, "learning_rate": 0.00015468362235067438, "loss": 0.3966523742675781, "step": 29400 }, { "epoch": 11.329479768786127, "eval_loss": 0.4507267475128174, "eval_runtime": 11.8492, "eval_samples_per_second": 1603.822, "eval_steps_per_second": 33.42, "step": 29400 }, { "epoch": 11.348747591522159, "grad_norm": 0.4651191532611847, "learning_rate": 0.00015460655105973027, "loss": 0.39272216796875, "step": 29450 }, { "epoch": 11.348747591522159, "eval_loss": 0.4574761688709259, "eval_runtime": 11.8485, "eval_samples_per_second": 1603.911, "eval_steps_per_second": 33.422, "step": 29450 }, { "epoch": 11.368015414258188, "grad_norm": 0.4049437940120697, "learning_rate": 0.00015452947976878612, "loss": 0.3885381317138672, "step": 29500 }, { "epoch": 11.368015414258188, "eval_loss": 0.4647764265537262, "eval_runtime": 11.8332, "eval_samples_per_second": 1605.993, "eval_steps_per_second": 33.465, "step": 29500 }, { "epoch": 11.38728323699422, "grad_norm": 0.5119481682777405, "learning_rate": 0.000154452408477842, "loss": 0.3961164855957031, "step": 29550 }, { "epoch": 11.38728323699422, "eval_loss": 0.4613955616950989, "eval_runtime": 11.8512, "eval_samples_per_second": 1603.546, "eval_steps_per_second": 33.414, "step": 29550 }, { "epoch": 11.406551059730251, "grad_norm": 0.426400750875473, "learning_rate": 0.0001543753371868979, "loss": 0.39475948333740235, "step": 29600 }, { "epoch": 11.406551059730251, "eval_loss": 0.45892465114593506, "eval_runtime": 11.8442, "eval_samples_per_second": 1604.498, "eval_steps_per_second": 33.434, "step": 29600 }, { "epoch": 11.425818882466281, "grad_norm": 0.44019651412963867, "learning_rate": 0.00015429826589595378, "loss": 0.39900634765625, "step": 29650 }, { "epoch": 11.425818882466281, "eval_loss": 0.45396846532821655, "eval_runtime": 11.8492, "eval_samples_per_second": 1603.827, "eval_steps_per_second": 33.42, "step": 29650 }, { "epoch": 11.445086705202312, "grad_norm": 0.421414852142334, "learning_rate": 0.00015422119460500964, "loss": 0.39043407440185546, "step": 29700 }, { "epoch": 11.445086705202312, "eval_loss": 0.4548008441925049, "eval_runtime": 11.8377, "eval_samples_per_second": 1605.383, "eval_steps_per_second": 33.453, "step": 29700 }, { "epoch": 11.464354527938344, "grad_norm": 0.4003957211971283, "learning_rate": 0.00015414412331406552, "loss": 0.39838233947753904, "step": 29750 }, { "epoch": 11.464354527938344, "eval_loss": 0.45841294527053833, "eval_runtime": 11.8416, "eval_samples_per_second": 1604.854, "eval_steps_per_second": 33.441, "step": 29750 }, { "epoch": 11.483622350674374, "grad_norm": 0.394020140171051, "learning_rate": 0.0001540670520231214, "loss": 0.39435256958007814, "step": 29800 }, { "epoch": 11.483622350674374, "eval_loss": 0.45162901282310486, "eval_runtime": 11.8392, "eval_samples_per_second": 1605.174, "eval_steps_per_second": 33.448, "step": 29800 }, { "epoch": 11.502890173410405, "grad_norm": 0.34160202741622925, "learning_rate": 0.00015398998073217726, "loss": 0.3918733978271484, "step": 29850 }, { "epoch": 11.502890173410405, "eval_loss": 0.4571681022644043, "eval_runtime": 11.8546, "eval_samples_per_second": 1603.092, "eval_steps_per_second": 33.405, "step": 29850 }, { "epoch": 11.522157996146435, "grad_norm": 0.4250192642211914, "learning_rate": 0.00015391290944123315, "loss": 0.39578784942626954, "step": 29900 }, { "epoch": 11.522157996146435, "eval_loss": 0.45914527773857117, "eval_runtime": 11.8618, "eval_samples_per_second": 1602.113, "eval_steps_per_second": 33.384, "step": 29900 }, { "epoch": 11.541425818882466, "grad_norm": 0.44942280650138855, "learning_rate": 0.00015383583815028903, "loss": 0.3898142242431641, "step": 29950 }, { "epoch": 11.541425818882466, "eval_loss": 0.45601025223731995, "eval_runtime": 11.8635, "eval_samples_per_second": 1601.89, "eval_steps_per_second": 33.38, "step": 29950 }, { "epoch": 11.560693641618498, "grad_norm": 0.38915881514549255, "learning_rate": 0.0001537587668593449, "loss": 0.3919211196899414, "step": 30000 }, { "epoch": 11.560693641618498, "eval_loss": 0.45641839504241943, "eval_runtime": 11.8624, "eval_samples_per_second": 1602.039, "eval_steps_per_second": 33.383, "step": 30000 }, { "epoch": 11.579961464354527, "grad_norm": 0.3840504288673401, "learning_rate": 0.00015368169556840078, "loss": 0.3906602096557617, "step": 30050 }, { "epoch": 11.579961464354527, "eval_loss": 0.4560301899909973, "eval_runtime": 11.8496, "eval_samples_per_second": 1603.762, "eval_steps_per_second": 33.419, "step": 30050 }, { "epoch": 11.599229287090559, "grad_norm": 0.380941241979599, "learning_rate": 0.00015360462427745666, "loss": 0.3914708709716797, "step": 30100 }, { "epoch": 11.599229287090559, "eval_loss": 0.4494810402393341, "eval_runtime": 11.8674, "eval_samples_per_second": 1601.363, "eval_steps_per_second": 33.369, "step": 30100 }, { "epoch": 11.61849710982659, "grad_norm": 0.37402382493019104, "learning_rate": 0.00015352755298651255, "loss": 0.38761459350585936, "step": 30150 }, { "epoch": 11.61849710982659, "eval_loss": 0.4522658586502075, "eval_runtime": 11.8433, "eval_samples_per_second": 1604.621, "eval_steps_per_second": 33.437, "step": 30150 }, { "epoch": 11.63776493256262, "grad_norm": 0.401523619890213, "learning_rate": 0.0001534504816955684, "loss": 0.3997273254394531, "step": 30200 }, { "epoch": 11.63776493256262, "eval_loss": 0.45182889699935913, "eval_runtime": 11.844, "eval_samples_per_second": 1604.53, "eval_steps_per_second": 33.435, "step": 30200 }, { "epoch": 11.657032755298651, "grad_norm": 0.3664919435977936, "learning_rate": 0.0001533734104046243, "loss": 0.39567520141601564, "step": 30250 }, { "epoch": 11.657032755298651, "eval_loss": 0.45935487747192383, "eval_runtime": 11.8461, "eval_samples_per_second": 1604.245, "eval_steps_per_second": 33.429, "step": 30250 }, { "epoch": 11.676300578034683, "grad_norm": 0.42537301778793335, "learning_rate": 0.00015329633911368015, "loss": 0.39372753143310546, "step": 30300 }, { "epoch": 11.676300578034683, "eval_loss": 0.46379411220550537, "eval_runtime": 11.8615, "eval_samples_per_second": 1602.153, "eval_steps_per_second": 33.385, "step": 30300 }, { "epoch": 11.695568400770712, "grad_norm": 0.4089376926422119, "learning_rate": 0.00015321926782273603, "loss": 0.3890171813964844, "step": 30350 }, { "epoch": 11.695568400770712, "eval_loss": 0.4595257341861725, "eval_runtime": 11.845, "eval_samples_per_second": 1604.387, "eval_steps_per_second": 33.432, "step": 30350 }, { "epoch": 11.714836223506744, "grad_norm": 0.46001991629600525, "learning_rate": 0.00015314219653179192, "loss": 0.3914097595214844, "step": 30400 }, { "epoch": 11.714836223506744, "eval_loss": 0.4690283536911011, "eval_runtime": 11.8491, "eval_samples_per_second": 1603.838, "eval_steps_per_second": 33.42, "step": 30400 }, { "epoch": 11.734104046242775, "grad_norm": 0.42713841795921326, "learning_rate": 0.0001530651252408478, "loss": 0.39358470916748045, "step": 30450 }, { "epoch": 11.734104046242775, "eval_loss": 0.4605637192726135, "eval_runtime": 11.9051, "eval_samples_per_second": 1596.285, "eval_steps_per_second": 33.263, "step": 30450 }, { "epoch": 11.753371868978805, "grad_norm": 0.342904269695282, "learning_rate": 0.0001529880539499037, "loss": 0.38637622833251956, "step": 30500 }, { "epoch": 11.753371868978805, "eval_loss": 0.46399441361427307, "eval_runtime": 11.8852, "eval_samples_per_second": 1598.965, "eval_steps_per_second": 33.319, "step": 30500 }, { "epoch": 11.772639691714836, "grad_norm": 0.4341523051261902, "learning_rate": 0.00015291098265895954, "loss": 0.39065208435058596, "step": 30550 }, { "epoch": 11.772639691714836, "eval_loss": 0.44729703664779663, "eval_runtime": 11.8475, "eval_samples_per_second": 1604.045, "eval_steps_per_second": 33.425, "step": 30550 }, { "epoch": 11.791907514450868, "grad_norm": 0.4358941912651062, "learning_rate": 0.0001528339113680154, "loss": 0.39062469482421874, "step": 30600 }, { "epoch": 11.791907514450868, "eval_loss": 0.45536816120147705, "eval_runtime": 11.8379, "eval_samples_per_second": 1605.358, "eval_steps_per_second": 33.452, "step": 30600 }, { "epoch": 11.811175337186897, "grad_norm": 0.3605685234069824, "learning_rate": 0.0001527568400770713, "loss": 0.39523555755615236, "step": 30650 }, { "epoch": 11.811175337186897, "eval_loss": 0.45379355549812317, "eval_runtime": 11.8604, "eval_samples_per_second": 1602.312, "eval_steps_per_second": 33.389, "step": 30650 }, { "epoch": 11.830443159922929, "grad_norm": 0.4231812357902527, "learning_rate": 0.00015267976878612717, "loss": 0.3934964752197266, "step": 30700 }, { "epoch": 11.830443159922929, "eval_loss": 0.449960321187973, "eval_runtime": 11.8528, "eval_samples_per_second": 1603.338, "eval_steps_per_second": 33.41, "step": 30700 }, { "epoch": 11.849710982658959, "grad_norm": 0.38512036204338074, "learning_rate": 0.00015260269749518306, "loss": 0.39371074676513673, "step": 30750 }, { "epoch": 11.849710982658959, "eval_loss": 0.4562492370605469, "eval_runtime": 11.8571, "eval_samples_per_second": 1602.756, "eval_steps_per_second": 33.398, "step": 30750 }, { "epoch": 11.86897880539499, "grad_norm": 0.44638869166374207, "learning_rate": 0.00015252562620423894, "loss": 0.3924036407470703, "step": 30800 }, { "epoch": 11.86897880539499, "eval_loss": 0.45969659090042114, "eval_runtime": 11.8732, "eval_samples_per_second": 1600.574, "eval_steps_per_second": 33.352, "step": 30800 }, { "epoch": 11.888246628131022, "grad_norm": 0.3954039514064789, "learning_rate": 0.0001524485549132948, "loss": 0.3921511459350586, "step": 30850 }, { "epoch": 11.888246628131022, "eval_loss": 0.4552130699157715, "eval_runtime": 11.8457, "eval_samples_per_second": 1604.294, "eval_steps_per_second": 33.43, "step": 30850 }, { "epoch": 11.907514450867051, "grad_norm": 0.42334675788879395, "learning_rate": 0.00015237148362235068, "loss": 0.3936252975463867, "step": 30900 }, { "epoch": 11.907514450867051, "eval_loss": 0.4496912658214569, "eval_runtime": 11.8677, "eval_samples_per_second": 1601.314, "eval_steps_per_second": 33.368, "step": 30900 }, { "epoch": 11.926782273603083, "grad_norm": 0.43492215871810913, "learning_rate": 0.00015229441233140654, "loss": 0.3980381393432617, "step": 30950 }, { "epoch": 11.926782273603083, "eval_loss": 0.4481752812862396, "eval_runtime": 11.8521, "eval_samples_per_second": 1603.428, "eval_steps_per_second": 33.412, "step": 30950 }, { "epoch": 11.946050096339114, "grad_norm": 0.49305716156959534, "learning_rate": 0.00015221734104046243, "loss": 0.3880479431152344, "step": 31000 }, { "epoch": 11.946050096339114, "eval_loss": 0.4459008276462555, "eval_runtime": 11.8427, "eval_samples_per_second": 1604.697, "eval_steps_per_second": 33.438, "step": 31000 }, { "epoch": 11.965317919075144, "grad_norm": 0.4362562298774719, "learning_rate": 0.0001521402697495183, "loss": 0.3901301574707031, "step": 31050 }, { "epoch": 11.965317919075144, "eval_loss": 0.45388954877853394, "eval_runtime": 11.8486, "eval_samples_per_second": 1603.901, "eval_steps_per_second": 33.422, "step": 31050 }, { "epoch": 11.984585741811175, "grad_norm": 0.4215191900730133, "learning_rate": 0.0001520631984585742, "loss": 0.39616291046142577, "step": 31100 }, { "epoch": 11.984585741811175, "eval_loss": 0.46040353178977966, "eval_runtime": 11.8378, "eval_samples_per_second": 1605.366, "eval_steps_per_second": 33.452, "step": 31100 }, { "epoch": 12.003853564547207, "grad_norm": 0.42114460468292236, "learning_rate": 0.00015198612716763006, "loss": 0.3977012634277344, "step": 31150 }, { "epoch": 12.003853564547207, "eval_loss": 0.46189120411872864, "eval_runtime": 11.8491, "eval_samples_per_second": 1603.841, "eval_steps_per_second": 33.42, "step": 31150 }, { "epoch": 12.023121387283236, "grad_norm": 0.4358437657356262, "learning_rate": 0.00015190905587668594, "loss": 0.39031387329101563, "step": 31200 }, { "epoch": 12.023121387283236, "eval_loss": 0.4499233663082123, "eval_runtime": 11.8586, "eval_samples_per_second": 1602.553, "eval_steps_per_second": 33.394, "step": 31200 }, { "epoch": 12.042389210019268, "grad_norm": 0.3714042901992798, "learning_rate": 0.00015183198458574182, "loss": 0.3923456573486328, "step": 31250 }, { "epoch": 12.042389210019268, "eval_loss": 0.4474877119064331, "eval_runtime": 11.8434, "eval_samples_per_second": 1604.603, "eval_steps_per_second": 33.436, "step": 31250 }, { "epoch": 12.0616570327553, "grad_norm": 0.4068152904510498, "learning_rate": 0.00015175491329479768, "loss": 0.3900384521484375, "step": 31300 }, { "epoch": 12.0616570327553, "eval_loss": 0.4478099048137665, "eval_runtime": 11.8507, "eval_samples_per_second": 1603.616, "eval_steps_per_second": 33.416, "step": 31300 }, { "epoch": 12.080924855491329, "grad_norm": 0.4034474492073059, "learning_rate": 0.00015167784200385357, "loss": 0.388704833984375, "step": 31350 }, { "epoch": 12.080924855491329, "eval_loss": 0.45014938712120056, "eval_runtime": 11.8517, "eval_samples_per_second": 1603.483, "eval_steps_per_second": 33.413, "step": 31350 }, { "epoch": 12.10019267822736, "grad_norm": 0.42279571294784546, "learning_rate": 0.00015160077071290945, "loss": 0.39206085205078123, "step": 31400 }, { "epoch": 12.10019267822736, "eval_loss": 0.454529732465744, "eval_runtime": 11.8475, "eval_samples_per_second": 1604.052, "eval_steps_per_second": 33.425, "step": 31400 }, { "epoch": 12.119460500963392, "grad_norm": 0.3484994173049927, "learning_rate": 0.0001515236994219653, "loss": 0.38950103759765625, "step": 31450 }, { "epoch": 12.119460500963392, "eval_loss": 0.4608440399169922, "eval_runtime": 11.8545, "eval_samples_per_second": 1603.106, "eval_steps_per_second": 33.405, "step": 31450 }, { "epoch": 12.138728323699421, "grad_norm": 0.43363094329833984, "learning_rate": 0.0001514466281310212, "loss": 0.393829345703125, "step": 31500 }, { "epoch": 12.138728323699421, "eval_loss": 0.45876017212867737, "eval_runtime": 11.8588, "eval_samples_per_second": 1602.518, "eval_steps_per_second": 33.393, "step": 31500 }, { "epoch": 12.157996146435453, "grad_norm": 0.4060385525226593, "learning_rate": 0.00015136955684007708, "loss": 0.39255508422851565, "step": 31550 }, { "epoch": 12.157996146435453, "eval_loss": 0.44654610753059387, "eval_runtime": 11.8537, "eval_samples_per_second": 1603.212, "eval_steps_per_second": 33.407, "step": 31550 }, { "epoch": 12.177263969171484, "grad_norm": 0.3990229666233063, "learning_rate": 0.00015129248554913296, "loss": 0.39042522430419924, "step": 31600 }, { "epoch": 12.177263969171484, "eval_loss": 0.44910523295402527, "eval_runtime": 11.8676, "eval_samples_per_second": 1601.331, "eval_steps_per_second": 33.368, "step": 31600 }, { "epoch": 12.196531791907514, "grad_norm": 0.4627830684185028, "learning_rate": 0.00015121541425818885, "loss": 0.39346435546875, "step": 31650 }, { "epoch": 12.196531791907514, "eval_loss": 0.45692846179008484, "eval_runtime": 11.8631, "eval_samples_per_second": 1601.945, "eval_steps_per_second": 33.381, "step": 31650 }, { "epoch": 12.215799614643545, "grad_norm": 0.3881038427352905, "learning_rate": 0.0001511383429672447, "loss": 0.390938835144043, "step": 31700 }, { "epoch": 12.215799614643545, "eval_loss": 0.4441341459751129, "eval_runtime": 11.8465, "eval_samples_per_second": 1604.184, "eval_steps_per_second": 33.428, "step": 31700 }, { "epoch": 12.235067437379577, "grad_norm": 0.3326747417449951, "learning_rate": 0.00015106127167630057, "loss": 0.38904930114746095, "step": 31750 }, { "epoch": 12.235067437379577, "eval_loss": 0.4630785584449768, "eval_runtime": 11.8543, "eval_samples_per_second": 1603.126, "eval_steps_per_second": 33.405, "step": 31750 }, { "epoch": 12.254335260115607, "grad_norm": 0.3989524245262146, "learning_rate": 0.00015098420038535645, "loss": 0.38979713439941405, "step": 31800 }, { "epoch": 12.254335260115607, "eval_loss": 0.46137335896492004, "eval_runtime": 11.859, "eval_samples_per_second": 1602.499, "eval_steps_per_second": 33.392, "step": 31800 }, { "epoch": 12.273603082851638, "grad_norm": 0.4047364592552185, "learning_rate": 0.00015090712909441234, "loss": 0.3857042694091797, "step": 31850 }, { "epoch": 12.273603082851638, "eval_loss": 0.4536493420600891, "eval_runtime": 11.8651, "eval_samples_per_second": 1601.672, "eval_steps_per_second": 33.375, "step": 31850 }, { "epoch": 12.29287090558767, "grad_norm": 0.39940065145492554, "learning_rate": 0.00015083005780346822, "loss": 0.38882492065429686, "step": 31900 }, { "epoch": 12.29287090558767, "eval_loss": 0.4474599063396454, "eval_runtime": 11.8697, "eval_samples_per_second": 1601.054, "eval_steps_per_second": 33.362, "step": 31900 }, { "epoch": 12.3121387283237, "grad_norm": 0.43288853764533997, "learning_rate": 0.0001507529865125241, "loss": 0.3913050842285156, "step": 31950 }, { "epoch": 12.3121387283237, "eval_loss": 0.4502812325954437, "eval_runtime": 11.8459, "eval_samples_per_second": 1604.274, "eval_steps_per_second": 33.429, "step": 31950 }, { "epoch": 12.33140655105973, "grad_norm": 0.37454158067703247, "learning_rate": 0.00015067591522158, "loss": 0.3859987640380859, "step": 32000 }, { "epoch": 12.33140655105973, "eval_loss": 0.4549870193004608, "eval_runtime": 11.8774, "eval_samples_per_second": 1600.019, "eval_steps_per_second": 33.341, "step": 32000 }, { "epoch": 12.35067437379576, "grad_norm": 0.41337496042251587, "learning_rate": 0.00015059884393063585, "loss": 0.39063976287841795, "step": 32050 }, { "epoch": 12.35067437379576, "eval_loss": 0.44127336144447327, "eval_runtime": 11.8474, "eval_samples_per_second": 1604.071, "eval_steps_per_second": 33.425, "step": 32050 }, { "epoch": 12.369942196531792, "grad_norm": 0.4536186754703522, "learning_rate": 0.0001505217726396917, "loss": 0.3904057312011719, "step": 32100 }, { "epoch": 12.369942196531792, "eval_loss": 0.4508403241634369, "eval_runtime": 11.8669, "eval_samples_per_second": 1601.43, "eval_steps_per_second": 33.37, "step": 32100 }, { "epoch": 12.389210019267823, "grad_norm": 0.4342248737812042, "learning_rate": 0.0001504447013487476, "loss": 0.3960081100463867, "step": 32150 }, { "epoch": 12.389210019267823, "eval_loss": 0.44524508714675903, "eval_runtime": 11.8671, "eval_samples_per_second": 1601.403, "eval_steps_per_second": 33.37, "step": 32150 }, { "epoch": 12.408477842003853, "grad_norm": 0.38431116938591003, "learning_rate": 0.00015036763005780348, "loss": 0.38363262176513674, "step": 32200 }, { "epoch": 12.408477842003853, "eval_loss": 0.4483093023300171, "eval_runtime": 11.914, "eval_samples_per_second": 1595.102, "eval_steps_per_second": 33.238, "step": 32200 }, { "epoch": 12.427745664739884, "grad_norm": 0.3924608528614044, "learning_rate": 0.00015029055876685936, "loss": 0.3831897735595703, "step": 32250 }, { "epoch": 12.427745664739884, "eval_loss": 0.4436962604522705, "eval_runtime": 11.8537, "eval_samples_per_second": 1603.208, "eval_steps_per_second": 33.407, "step": 32250 }, { "epoch": 12.447013487475916, "grad_norm": 0.47275248169898987, "learning_rate": 0.00015021348747591525, "loss": 0.3956794357299805, "step": 32300 }, { "epoch": 12.447013487475916, "eval_loss": 0.441049724817276, "eval_runtime": 11.8511, "eval_samples_per_second": 1603.569, "eval_steps_per_second": 33.415, "step": 32300 }, { "epoch": 12.466281310211945, "grad_norm": 0.3830760717391968, "learning_rate": 0.0001501364161849711, "loss": 0.3836980438232422, "step": 32350 }, { "epoch": 12.466281310211945, "eval_loss": 0.45635056495666504, "eval_runtime": 11.8464, "eval_samples_per_second": 1604.204, "eval_steps_per_second": 33.428, "step": 32350 }, { "epoch": 12.485549132947977, "grad_norm": 0.38848841190338135, "learning_rate": 0.000150059344894027, "loss": 0.38939682006835935, "step": 32400 }, { "epoch": 12.485549132947977, "eval_loss": 0.4614080488681793, "eval_runtime": 11.8521, "eval_samples_per_second": 1603.422, "eval_steps_per_second": 33.412, "step": 32400 }, { "epoch": 12.504816955684008, "grad_norm": 0.42707547545433044, "learning_rate": 0.00014998227360308285, "loss": 0.38625564575195315, "step": 32450 }, { "epoch": 12.504816955684008, "eval_loss": 0.45031875371932983, "eval_runtime": 11.853, "eval_samples_per_second": 1603.309, "eval_steps_per_second": 33.409, "step": 32450 }, { "epoch": 12.524084778420038, "grad_norm": 0.4026077389717102, "learning_rate": 0.00014990520231213873, "loss": 0.3844913864135742, "step": 32500 }, { "epoch": 12.524084778420038, "eval_loss": 0.4512634873390198, "eval_runtime": 11.8494, "eval_samples_per_second": 1603.798, "eval_steps_per_second": 33.419, "step": 32500 }, { "epoch": 12.54335260115607, "grad_norm": 0.41325244307518005, "learning_rate": 0.00014982813102119462, "loss": 0.3908930969238281, "step": 32550 }, { "epoch": 12.54335260115607, "eval_loss": 0.4504145383834839, "eval_runtime": 11.8839, "eval_samples_per_second": 1599.139, "eval_steps_per_second": 33.322, "step": 32550 }, { "epoch": 12.5626204238921, "grad_norm": 0.37914320826530457, "learning_rate": 0.0001497510597302505, "loss": 0.38340446472167966, "step": 32600 }, { "epoch": 12.5626204238921, "eval_loss": 0.4488254189491272, "eval_runtime": 11.847, "eval_samples_per_second": 1604.119, "eval_steps_per_second": 33.426, "step": 32600 }, { "epoch": 12.58188824662813, "grad_norm": 0.3523308336734772, "learning_rate": 0.00014967398843930636, "loss": 0.3887754440307617, "step": 32650 }, { "epoch": 12.58188824662813, "eval_loss": 0.44851309061050415, "eval_runtime": 11.8473, "eval_samples_per_second": 1604.072, "eval_steps_per_second": 33.425, "step": 32650 }, { "epoch": 12.601156069364162, "grad_norm": 0.43805384635925293, "learning_rate": 0.00014959691714836224, "loss": 0.3841027069091797, "step": 32700 }, { "epoch": 12.601156069364162, "eval_loss": 0.45910513401031494, "eval_runtime": 11.8468, "eval_samples_per_second": 1604.153, "eval_steps_per_second": 33.427, "step": 32700 }, { "epoch": 12.620423892100193, "grad_norm": 0.3909459114074707, "learning_rate": 0.00014951984585741813, "loss": 0.3883742904663086, "step": 32750 }, { "epoch": 12.620423892100193, "eval_loss": 0.4537789523601532, "eval_runtime": 11.8712, "eval_samples_per_second": 1600.848, "eval_steps_per_second": 33.358, "step": 32750 }, { "epoch": 12.639691714836223, "grad_norm": 0.421660840511322, "learning_rate": 0.00014944277456647399, "loss": 0.3869754409790039, "step": 32800 }, { "epoch": 12.639691714836223, "eval_loss": 0.4450753927230835, "eval_runtime": 11.8542, "eval_samples_per_second": 1603.147, "eval_steps_per_second": 33.406, "step": 32800 }, { "epoch": 12.658959537572255, "grad_norm": 0.3880141079425812, "learning_rate": 0.00014936570327552987, "loss": 0.3954215240478516, "step": 32850 }, { "epoch": 12.658959537572255, "eval_loss": 0.45030343532562256, "eval_runtime": 11.8733, "eval_samples_per_second": 1600.562, "eval_steps_per_second": 33.352, "step": 32850 }, { "epoch": 12.678227360308284, "grad_norm": 0.39543411135673523, "learning_rate": 0.00014928863198458576, "loss": 0.3931756591796875, "step": 32900 }, { "epoch": 12.678227360308284, "eval_loss": 0.4459887444972992, "eval_runtime": 11.855, "eval_samples_per_second": 1603.043, "eval_steps_per_second": 33.404, "step": 32900 }, { "epoch": 12.697495183044316, "grad_norm": 0.4634943902492523, "learning_rate": 0.0001492115606936416, "loss": 0.3891709136962891, "step": 32950 }, { "epoch": 12.697495183044316, "eval_loss": 0.44175538420677185, "eval_runtime": 11.8532, "eval_samples_per_second": 1603.285, "eval_steps_per_second": 33.409, "step": 32950 }, { "epoch": 12.716763005780347, "grad_norm": 0.34462836384773254, "learning_rate": 0.0001491344894026975, "loss": 0.3871088409423828, "step": 33000 }, { "epoch": 12.716763005780347, "eval_loss": 0.4502773880958557, "eval_runtime": 11.8505, "eval_samples_per_second": 1603.646, "eval_steps_per_second": 33.416, "step": 33000 }, { "epoch": 12.736030828516377, "grad_norm": 0.4960377812385559, "learning_rate": 0.00014905741811175338, "loss": 0.3916002655029297, "step": 33050 }, { "epoch": 12.736030828516377, "eval_loss": 0.450802206993103, "eval_runtime": 11.8688, "eval_samples_per_second": 1601.171, "eval_steps_per_second": 33.365, "step": 33050 }, { "epoch": 12.755298651252408, "grad_norm": 0.35653576254844666, "learning_rate": 0.00014898034682080927, "loss": 0.3922689819335938, "step": 33100 }, { "epoch": 12.755298651252408, "eval_loss": 0.4481211304664612, "eval_runtime": 11.8594, "eval_samples_per_second": 1602.438, "eval_steps_per_second": 33.391, "step": 33100 }, { "epoch": 12.77456647398844, "grad_norm": 0.396040141582489, "learning_rate": 0.00014890327552986513, "loss": 0.3826356506347656, "step": 33150 }, { "epoch": 12.77456647398844, "eval_loss": 0.4415667951107025, "eval_runtime": 11.8427, "eval_samples_per_second": 1604.695, "eval_steps_per_second": 33.438, "step": 33150 }, { "epoch": 12.79383429672447, "grad_norm": 0.4226914644241333, "learning_rate": 0.000148826204238921, "loss": 0.38808269500732423, "step": 33200 }, { "epoch": 12.79383429672447, "eval_loss": 0.44614118337631226, "eval_runtime": 11.8636, "eval_samples_per_second": 1601.877, "eval_steps_per_second": 33.379, "step": 33200 }, { "epoch": 12.8131021194605, "grad_norm": 0.39424431324005127, "learning_rate": 0.00014874913294797687, "loss": 0.3780611801147461, "step": 33250 }, { "epoch": 12.8131021194605, "eval_loss": 0.44576069712638855, "eval_runtime": 11.8793, "eval_samples_per_second": 1599.753, "eval_steps_per_second": 33.335, "step": 33250 }, { "epoch": 12.832369942196532, "grad_norm": 0.40800848603248596, "learning_rate": 0.00014867206165703275, "loss": 0.3877024459838867, "step": 33300 }, { "epoch": 12.832369942196532, "eval_loss": 0.4367907643318176, "eval_runtime": 11.858, "eval_samples_per_second": 1602.637, "eval_steps_per_second": 33.395, "step": 33300 }, { "epoch": 12.851637764932562, "grad_norm": 0.3747848868370056, "learning_rate": 0.00014859499036608864, "loss": 0.3832316207885742, "step": 33350 }, { "epoch": 12.851637764932562, "eval_loss": 0.4440147578716278, "eval_runtime": 11.8693, "eval_samples_per_second": 1601.107, "eval_steps_per_second": 33.363, "step": 33350 }, { "epoch": 12.870905587668593, "grad_norm": 0.4521227478981018, "learning_rate": 0.00014851791907514452, "loss": 0.3896062469482422, "step": 33400 }, { "epoch": 12.870905587668593, "eval_loss": 0.44130903482437134, "eval_runtime": 11.8472, "eval_samples_per_second": 1604.091, "eval_steps_per_second": 33.426, "step": 33400 }, { "epoch": 12.890173410404625, "grad_norm": 0.41999736428260803, "learning_rate": 0.0001484408477842004, "loss": 0.38482357025146485, "step": 33450 }, { "epoch": 12.890173410404625, "eval_loss": 0.4448047876358032, "eval_runtime": 11.88, "eval_samples_per_second": 1599.661, "eval_steps_per_second": 33.333, "step": 33450 }, { "epoch": 12.909441233140655, "grad_norm": 0.4279807507991791, "learning_rate": 0.00014836377649325627, "loss": 0.38372215270996096, "step": 33500 }, { "epoch": 12.909441233140655, "eval_loss": 0.44204211235046387, "eval_runtime": 11.8593, "eval_samples_per_second": 1602.455, "eval_steps_per_second": 33.392, "step": 33500 }, { "epoch": 12.928709055876686, "grad_norm": 0.3618074357509613, "learning_rate": 0.00014828670520231215, "loss": 0.38378875732421874, "step": 33550 }, { "epoch": 12.928709055876686, "eval_loss": 0.4423665702342987, "eval_runtime": 11.8639, "eval_samples_per_second": 1601.839, "eval_steps_per_second": 33.379, "step": 33550 }, { "epoch": 12.947976878612717, "grad_norm": 0.3646143674850464, "learning_rate": 0.000148209633911368, "loss": 0.3893615341186523, "step": 33600 }, { "epoch": 12.947976878612717, "eval_loss": 0.4359856843948364, "eval_runtime": 11.8817, "eval_samples_per_second": 1599.429, "eval_steps_per_second": 33.328, "step": 33600 }, { "epoch": 12.967244701348747, "grad_norm": 0.40546998381614685, "learning_rate": 0.0001481325626204239, "loss": 0.3880016326904297, "step": 33650 }, { "epoch": 12.967244701348747, "eval_loss": 0.4439350664615631, "eval_runtime": 11.8506, "eval_samples_per_second": 1603.631, "eval_steps_per_second": 33.416, "step": 33650 }, { "epoch": 12.986512524084779, "grad_norm": 0.45561444759368896, "learning_rate": 0.00014805549132947978, "loss": 0.3839890670776367, "step": 33700 }, { "epoch": 12.986512524084779, "eval_loss": 0.4347866475582123, "eval_runtime": 11.8437, "eval_samples_per_second": 1604.57, "eval_steps_per_second": 33.436, "step": 33700 }, { "epoch": 13.00578034682081, "grad_norm": 0.39561402797698975, "learning_rate": 0.00014797842003853566, "loss": 0.3843824768066406, "step": 33750 }, { "epoch": 13.00578034682081, "eval_loss": 0.4474296569824219, "eval_runtime": 11.8678, "eval_samples_per_second": 1601.304, "eval_steps_per_second": 33.368, "step": 33750 }, { "epoch": 13.02504816955684, "grad_norm": 0.42122897505760193, "learning_rate": 0.00014790134874759155, "loss": 0.3799915313720703, "step": 33800 }, { "epoch": 13.02504816955684, "eval_loss": 0.44946303963661194, "eval_runtime": 11.8698, "eval_samples_per_second": 1601.033, "eval_steps_per_second": 33.362, "step": 33800 }, { "epoch": 13.044315992292871, "grad_norm": 0.4144188463687897, "learning_rate": 0.0001478242774566474, "loss": 0.3832762908935547, "step": 33850 }, { "epoch": 13.044315992292871, "eval_loss": 0.4419473707675934, "eval_runtime": 11.871, "eval_samples_per_second": 1600.875, "eval_steps_per_second": 33.359, "step": 33850 }, { "epoch": 13.063583815028903, "grad_norm": 0.45908936858177185, "learning_rate": 0.0001477472061657033, "loss": 0.3925685119628906, "step": 33900 }, { "epoch": 13.063583815028903, "eval_loss": 0.44261232018470764, "eval_runtime": 11.8817, "eval_samples_per_second": 1599.437, "eval_steps_per_second": 33.329, "step": 33900 }, { "epoch": 13.082851637764932, "grad_norm": 0.3635522723197937, "learning_rate": 0.00014767013487475915, "loss": 0.3892251586914062, "step": 33950 }, { "epoch": 13.082851637764932, "eval_loss": 0.4354937970638275, "eval_runtime": 11.9547, "eval_samples_per_second": 1589.671, "eval_steps_per_second": 33.125, "step": 33950 }, { "epoch": 13.102119460500964, "grad_norm": 0.45413830876350403, "learning_rate": 0.00014759306358381503, "loss": 0.3857122802734375, "step": 34000 }, { "epoch": 13.102119460500964, "eval_loss": 0.43082794547080994, "eval_runtime": 11.8708, "eval_samples_per_second": 1600.898, "eval_steps_per_second": 33.359, "step": 34000 }, { "epoch": 13.121387283236995, "grad_norm": 0.4426018297672272, "learning_rate": 0.00014751599229287092, "loss": 0.38512870788574216, "step": 34050 }, { "epoch": 13.121387283236995, "eval_loss": 0.45098626613616943, "eval_runtime": 11.8537, "eval_samples_per_second": 1603.211, "eval_steps_per_second": 33.407, "step": 34050 }, { "epoch": 13.140655105973025, "grad_norm": 0.4461077153682709, "learning_rate": 0.0001474389210019268, "loss": 0.3892154312133789, "step": 34100 }, { "epoch": 13.140655105973025, "eval_loss": 0.4369465708732605, "eval_runtime": 11.8658, "eval_samples_per_second": 1601.583, "eval_steps_per_second": 33.373, "step": 34100 }, { "epoch": 13.159922928709056, "grad_norm": 0.3452952206134796, "learning_rate": 0.00014736184971098266, "loss": 0.38659713745117186, "step": 34150 }, { "epoch": 13.159922928709056, "eval_loss": 0.4415247440338135, "eval_runtime": 11.8571, "eval_samples_per_second": 1602.755, "eval_steps_per_second": 33.398, "step": 34150 }, { "epoch": 13.179190751445086, "grad_norm": 0.4412609338760376, "learning_rate": 0.00014728477842003855, "loss": 0.3871027374267578, "step": 34200 }, { "epoch": 13.179190751445086, "eval_loss": 0.4445304870605469, "eval_runtime": 11.8779, "eval_samples_per_second": 1599.953, "eval_steps_per_second": 33.339, "step": 34200 }, { "epoch": 13.198458574181117, "grad_norm": 0.4457407593727112, "learning_rate": 0.00014720770712909443, "loss": 0.3861553955078125, "step": 34250 }, { "epoch": 13.198458574181117, "eval_loss": 0.44165557622909546, "eval_runtime": 11.8592, "eval_samples_per_second": 1602.47, "eval_steps_per_second": 33.392, "step": 34250 }, { "epoch": 13.217726396917149, "grad_norm": 0.3584430515766144, "learning_rate": 0.0001471306358381503, "loss": 0.3779230880737305, "step": 34300 }, { "epoch": 13.217726396917149, "eval_loss": 0.44310125708580017, "eval_runtime": 11.858, "eval_samples_per_second": 1602.627, "eval_steps_per_second": 33.395, "step": 34300 }, { "epoch": 13.236994219653178, "grad_norm": 0.3485181927680969, "learning_rate": 0.00014705356454720617, "loss": 0.3854789733886719, "step": 34350 }, { "epoch": 13.236994219653178, "eval_loss": 0.447884738445282, "eval_runtime": 11.859, "eval_samples_per_second": 1602.493, "eval_steps_per_second": 33.392, "step": 34350 }, { "epoch": 13.25626204238921, "grad_norm": 0.409240186214447, "learning_rate": 0.00014697649325626206, "loss": 0.38491485595703123, "step": 34400 }, { "epoch": 13.25626204238921, "eval_loss": 0.4403080940246582, "eval_runtime": 11.8696, "eval_samples_per_second": 1601.067, "eval_steps_per_second": 33.363, "step": 34400 }, { "epoch": 13.275529865125241, "grad_norm": 0.45753994584083557, "learning_rate": 0.00014689942196531792, "loss": 0.3841815948486328, "step": 34450 }, { "epoch": 13.275529865125241, "eval_loss": 0.4473288357257843, "eval_runtime": 11.8558, "eval_samples_per_second": 1602.93, "eval_steps_per_second": 33.401, "step": 34450 }, { "epoch": 13.294797687861271, "grad_norm": 0.4172387421131134, "learning_rate": 0.0001468223506743738, "loss": 0.3820580291748047, "step": 34500 }, { "epoch": 13.294797687861271, "eval_loss": 0.4440932273864746, "eval_runtime": 11.8678, "eval_samples_per_second": 1601.306, "eval_steps_per_second": 33.368, "step": 34500 }, { "epoch": 13.314065510597302, "grad_norm": 0.3742911219596863, "learning_rate": 0.00014674527938342969, "loss": 0.3841078186035156, "step": 34550 }, { "epoch": 13.314065510597302, "eval_loss": 0.45247793197631836, "eval_runtime": 11.8599, "eval_samples_per_second": 1602.371, "eval_steps_per_second": 33.39, "step": 34550 }, { "epoch": 13.333333333333334, "grad_norm": 0.3443085849285126, "learning_rate": 0.00014666820809248557, "loss": 0.3845148468017578, "step": 34600 }, { "epoch": 13.333333333333334, "eval_loss": 0.44033533334732056, "eval_runtime": 11.8604, "eval_samples_per_second": 1602.308, "eval_steps_per_second": 33.388, "step": 34600 }, { "epoch": 13.352601156069364, "grad_norm": 0.4232437312602997, "learning_rate": 0.00014659113680154143, "loss": 0.38118759155273435, "step": 34650 }, { "epoch": 13.352601156069364, "eval_loss": 0.43646183609962463, "eval_runtime": 11.8548, "eval_samples_per_second": 1603.068, "eval_steps_per_second": 33.404, "step": 34650 }, { "epoch": 13.371868978805395, "grad_norm": 0.4170492887496948, "learning_rate": 0.0001465140655105973, "loss": 0.38123985290527346, "step": 34700 }, { "epoch": 13.371868978805395, "eval_loss": 0.4376121163368225, "eval_runtime": 11.8452, "eval_samples_per_second": 1604.359, "eval_steps_per_second": 33.431, "step": 34700 }, { "epoch": 13.391136801541426, "grad_norm": 0.4493910074234009, "learning_rate": 0.00014643699421965317, "loss": 0.3879561996459961, "step": 34750 }, { "epoch": 13.391136801541426, "eval_loss": 0.44145703315734863, "eval_runtime": 11.8757, "eval_samples_per_second": 1600.239, "eval_steps_per_second": 33.345, "step": 34750 }, { "epoch": 13.410404624277456, "grad_norm": 0.39392679929733276, "learning_rate": 0.00014635992292870906, "loss": 0.3875014495849609, "step": 34800 }, { "epoch": 13.410404624277456, "eval_loss": 0.44671136140823364, "eval_runtime": 11.8569, "eval_samples_per_second": 1602.781, "eval_steps_per_second": 33.398, "step": 34800 }, { "epoch": 13.429672447013488, "grad_norm": 0.38099411129951477, "learning_rate": 0.00014628285163776494, "loss": 0.38419761657714846, "step": 34850 }, { "epoch": 13.429672447013488, "eval_loss": 0.45085451006889343, "eval_runtime": 11.8672, "eval_samples_per_second": 1601.395, "eval_steps_per_second": 33.369, "step": 34850 }, { "epoch": 13.448940269749519, "grad_norm": 0.3834643065929413, "learning_rate": 0.00014620578034682083, "loss": 0.3812310028076172, "step": 34900 }, { "epoch": 13.448940269749519, "eval_loss": 0.4420259892940521, "eval_runtime": 11.849, "eval_samples_per_second": 1603.849, "eval_steps_per_second": 33.421, "step": 34900 }, { "epoch": 13.468208092485549, "grad_norm": 0.35009071230888367, "learning_rate": 0.0001461287090558767, "loss": 0.3843400573730469, "step": 34950 }, { "epoch": 13.468208092485549, "eval_loss": 0.44652843475341797, "eval_runtime": 11.8666, "eval_samples_per_second": 1601.473, "eval_steps_per_second": 33.371, "step": 34950 }, { "epoch": 13.48747591522158, "grad_norm": 0.447021484375, "learning_rate": 0.00014605163776493257, "loss": 0.385933837890625, "step": 35000 }, { "epoch": 13.48747591522158, "eval_loss": 0.44178229570388794, "eval_runtime": 11.8912, "eval_samples_per_second": 1598.155, "eval_steps_per_second": 33.302, "step": 35000 }, { "epoch": 13.50674373795761, "grad_norm": 0.4078798294067383, "learning_rate": 0.00014597456647398843, "loss": 0.3816157531738281, "step": 35050 }, { "epoch": 13.50674373795761, "eval_loss": 0.44384071230888367, "eval_runtime": 11.8711, "eval_samples_per_second": 1600.86, "eval_steps_per_second": 33.358, "step": 35050 }, { "epoch": 13.526011560693641, "grad_norm": 0.3858228623867035, "learning_rate": 0.0001458974951830443, "loss": 0.3807398986816406, "step": 35100 }, { "epoch": 13.526011560693641, "eval_loss": 0.44934239983558655, "eval_runtime": 11.8881, "eval_samples_per_second": 1598.58, "eval_steps_per_second": 33.311, "step": 35100 }, { "epoch": 13.545279383429673, "grad_norm": 0.4410717487335205, "learning_rate": 0.0001458204238921002, "loss": 0.38348075866699216, "step": 35150 }, { "epoch": 13.545279383429673, "eval_loss": 0.4434875249862671, "eval_runtime": 11.8637, "eval_samples_per_second": 1601.855, "eval_steps_per_second": 33.379, "step": 35150 }, { "epoch": 13.564547206165702, "grad_norm": 0.46155276894569397, "learning_rate": 0.00014574335260115608, "loss": 0.3851802062988281, "step": 35200 }, { "epoch": 13.564547206165702, "eval_loss": 0.4503213167190552, "eval_runtime": 11.8778, "eval_samples_per_second": 1599.966, "eval_steps_per_second": 33.34, "step": 35200 }, { "epoch": 13.583815028901734, "grad_norm": 0.4224143326282501, "learning_rate": 0.00014566628131021197, "loss": 0.38350261688232423, "step": 35250 }, { "epoch": 13.583815028901734, "eval_loss": 0.44244617223739624, "eval_runtime": 11.8736, "eval_samples_per_second": 1600.527, "eval_steps_per_second": 33.351, "step": 35250 }, { "epoch": 13.603082851637765, "grad_norm": 0.4418644905090332, "learning_rate": 0.00014558921001926782, "loss": 0.38030529022216797, "step": 35300 }, { "epoch": 13.603082851637765, "eval_loss": 0.4428344964981079, "eval_runtime": 11.8711, "eval_samples_per_second": 1600.861, "eval_steps_per_second": 33.358, "step": 35300 }, { "epoch": 13.622350674373795, "grad_norm": 0.35853660106658936, "learning_rate": 0.0001455121387283237, "loss": 0.3806134033203125, "step": 35350 }, { "epoch": 13.622350674373795, "eval_loss": 0.4355015456676483, "eval_runtime": 11.8568, "eval_samples_per_second": 1602.789, "eval_steps_per_second": 33.398, "step": 35350 }, { "epoch": 13.641618497109826, "grad_norm": 0.38472795486450195, "learning_rate": 0.00014543506743737957, "loss": 0.38445186614990234, "step": 35400 }, { "epoch": 13.641618497109826, "eval_loss": 0.4423065483570099, "eval_runtime": 11.8573, "eval_samples_per_second": 1602.724, "eval_steps_per_second": 33.397, "step": 35400 }, { "epoch": 13.660886319845858, "grad_norm": 0.3891620337963104, "learning_rate": 0.00014535799614643545, "loss": 0.3819063949584961, "step": 35450 }, { "epoch": 13.660886319845858, "eval_loss": 0.44763273000717163, "eval_runtime": 11.88, "eval_samples_per_second": 1599.661, "eval_steps_per_second": 33.333, "step": 35450 }, { "epoch": 13.680154142581888, "grad_norm": 0.49668535590171814, "learning_rate": 0.00014528092485549134, "loss": 0.38743789672851564, "step": 35500 }, { "epoch": 13.680154142581888, "eval_loss": 0.43931326270103455, "eval_runtime": 11.8663, "eval_samples_per_second": 1601.507, "eval_steps_per_second": 33.372, "step": 35500 }, { "epoch": 13.699421965317919, "grad_norm": 0.3873242139816284, "learning_rate": 0.00014520385356454722, "loss": 0.3822274398803711, "step": 35550 }, { "epoch": 13.699421965317919, "eval_loss": 0.44045722484588623, "eval_runtime": 11.9266, "eval_samples_per_second": 1593.418, "eval_steps_per_second": 33.203, "step": 35550 }, { "epoch": 13.71868978805395, "grad_norm": 0.4137391746044159, "learning_rate": 0.00014512678227360308, "loss": 0.3796673583984375, "step": 35600 }, { "epoch": 13.71868978805395, "eval_loss": 0.44419652223587036, "eval_runtime": 11.8905, "eval_samples_per_second": 1598.254, "eval_steps_per_second": 33.304, "step": 35600 }, { "epoch": 13.73795761078998, "grad_norm": 0.38313838839530945, "learning_rate": 0.00014504971098265896, "loss": 0.38700874328613283, "step": 35650 }, { "epoch": 13.73795761078998, "eval_loss": 0.44430670142173767, "eval_runtime": 11.9855, "eval_samples_per_second": 1585.588, "eval_steps_per_second": 33.04, "step": 35650 }, { "epoch": 13.757225433526012, "grad_norm": 0.4058935046195984, "learning_rate": 0.00014497263969171485, "loss": 0.38067874908447263, "step": 35700 }, { "epoch": 13.757225433526012, "eval_loss": 0.4400652050971985, "eval_runtime": 11.8671, "eval_samples_per_second": 1601.402, "eval_steps_per_second": 33.37, "step": 35700 }, { "epoch": 13.776493256262043, "grad_norm": 0.3526720106601715, "learning_rate": 0.00014489556840077073, "loss": 0.37816337585449217, "step": 35750 }, { "epoch": 13.776493256262043, "eval_loss": 0.4351047873497009, "eval_runtime": 11.8684, "eval_samples_per_second": 1601.222, "eval_steps_per_second": 33.366, "step": 35750 }, { "epoch": 13.795761078998073, "grad_norm": 0.38308560848236084, "learning_rate": 0.0001448184971098266, "loss": 0.38212928771972654, "step": 35800 }, { "epoch": 13.795761078998073, "eval_loss": 0.43916022777557373, "eval_runtime": 11.8766, "eval_samples_per_second": 1600.115, "eval_steps_per_second": 33.343, "step": 35800 }, { "epoch": 13.815028901734104, "grad_norm": 0.3698022961616516, "learning_rate": 0.00014474142581888248, "loss": 0.37908817291259767, "step": 35850 }, { "epoch": 13.815028901734104, "eval_loss": 0.44043493270874023, "eval_runtime": 11.8806, "eval_samples_per_second": 1599.583, "eval_steps_per_second": 33.332, "step": 35850 }, { "epoch": 13.834296724470136, "grad_norm": 0.3977256417274475, "learning_rate": 0.00014466435452793833, "loss": 0.38509559631347656, "step": 35900 }, { "epoch": 13.834296724470136, "eval_loss": 0.44282832741737366, "eval_runtime": 11.8964, "eval_samples_per_second": 1597.456, "eval_steps_per_second": 33.287, "step": 35900 }, { "epoch": 13.853564547206165, "grad_norm": 0.4107773005962372, "learning_rate": 0.00014458728323699422, "loss": 0.3797709274291992, "step": 35950 }, { "epoch": 13.853564547206165, "eval_loss": 0.4266767203807831, "eval_runtime": 11.877, "eval_samples_per_second": 1600.062, "eval_steps_per_second": 33.342, "step": 35950 }, { "epoch": 13.872832369942197, "grad_norm": 0.3417651057243347, "learning_rate": 0.0001445102119460501, "loss": 0.3813280487060547, "step": 36000 }, { "epoch": 13.872832369942197, "eval_loss": 0.4321760833263397, "eval_runtime": 11.8735, "eval_samples_per_second": 1600.544, "eval_steps_per_second": 33.352, "step": 36000 }, { "epoch": 13.892100192678228, "grad_norm": 0.40450921654701233, "learning_rate": 0.000144433140655106, "loss": 0.38295875549316405, "step": 36050 }, { "epoch": 13.892100192678228, "eval_loss": 0.4431474506855011, "eval_runtime": 11.9544, "eval_samples_per_second": 1589.714, "eval_steps_per_second": 33.126, "step": 36050 }, { "epoch": 13.911368015414258, "grad_norm": 0.40607213973999023, "learning_rate": 0.00014435606936416187, "loss": 0.37555423736572263, "step": 36100 }, { "epoch": 13.911368015414258, "eval_loss": 0.4343235492706299, "eval_runtime": 11.8576, "eval_samples_per_second": 1602.689, "eval_steps_per_second": 33.396, "step": 36100 }, { "epoch": 13.93063583815029, "grad_norm": 0.379294753074646, "learning_rate": 0.00014427899807321773, "loss": 0.3865922164916992, "step": 36150 }, { "epoch": 13.93063583815029, "eval_loss": 0.4355189800262451, "eval_runtime": 11.8811, "eval_samples_per_second": 1599.519, "eval_steps_per_second": 33.33, "step": 36150 }, { "epoch": 13.94990366088632, "grad_norm": 0.37016549706459045, "learning_rate": 0.0001442019267822736, "loss": 0.37695892333984377, "step": 36200 }, { "epoch": 13.94990366088632, "eval_loss": 0.44395947456359863, "eval_runtime": 11.8812, "eval_samples_per_second": 1599.499, "eval_steps_per_second": 33.33, "step": 36200 }, { "epoch": 13.96917148362235, "grad_norm": 0.39724665880203247, "learning_rate": 0.00014412485549132947, "loss": 0.3811039352416992, "step": 36250 }, { "epoch": 13.96917148362235, "eval_loss": 0.4306323528289795, "eval_runtime": 11.8799, "eval_samples_per_second": 1599.683, "eval_steps_per_second": 33.334, "step": 36250 }, { "epoch": 13.988439306358382, "grad_norm": 0.36283284425735474, "learning_rate": 0.00014404778420038536, "loss": 0.38348583221435545, "step": 36300 }, { "epoch": 13.988439306358382, "eval_loss": 0.440794438123703, "eval_runtime": 11.8777, "eval_samples_per_second": 1599.97, "eval_steps_per_second": 33.34, "step": 36300 }, { "epoch": 14.007707129094412, "grad_norm": 0.4344624876976013, "learning_rate": 0.00014397071290944124, "loss": 0.38422943115234376, "step": 36350 }, { "epoch": 14.007707129094412, "eval_loss": 0.44913098216056824, "eval_runtime": 11.8794, "eval_samples_per_second": 1599.741, "eval_steps_per_second": 33.335, "step": 36350 }, { "epoch": 14.026974951830443, "grad_norm": 0.3727020025253296, "learning_rate": 0.00014389364161849713, "loss": 0.38116233825683593, "step": 36400 }, { "epoch": 14.026974951830443, "eval_loss": 0.438638836145401, "eval_runtime": 11.8753, "eval_samples_per_second": 1600.301, "eval_steps_per_second": 33.347, "step": 36400 }, { "epoch": 14.046242774566474, "grad_norm": 0.5244733691215515, "learning_rate": 0.000143816570327553, "loss": 0.38202838897705077, "step": 36450 }, { "epoch": 14.046242774566474, "eval_loss": 0.4355951249599457, "eval_runtime": 11.8606, "eval_samples_per_second": 1602.281, "eval_steps_per_second": 33.388, "step": 36450 }, { "epoch": 14.065510597302504, "grad_norm": 0.4294710159301758, "learning_rate": 0.00014373949903660887, "loss": 0.3792076110839844, "step": 36500 }, { "epoch": 14.065510597302504, "eval_loss": 0.4515509009361267, "eval_runtime": 11.8641, "eval_samples_per_second": 1601.804, "eval_steps_per_second": 33.378, "step": 36500 }, { "epoch": 14.084778420038536, "grad_norm": 0.39455175399780273, "learning_rate": 0.00014366242774566473, "loss": 0.3719952392578125, "step": 36550 }, { "epoch": 14.084778420038536, "eval_loss": 0.44721341133117676, "eval_runtime": 11.8689, "eval_samples_per_second": 1601.162, "eval_steps_per_second": 33.365, "step": 36550 }, { "epoch": 14.104046242774567, "grad_norm": 0.3977121114730835, "learning_rate": 0.00014358535645472061, "loss": 0.3800851440429687, "step": 36600 }, { "epoch": 14.104046242774567, "eval_loss": 0.4458774924278259, "eval_runtime": 11.8764, "eval_samples_per_second": 1600.154, "eval_steps_per_second": 33.344, "step": 36600 }, { "epoch": 14.123314065510597, "grad_norm": 0.4647289216518402, "learning_rate": 0.0001435082851637765, "loss": 0.3805638122558594, "step": 36650 }, { "epoch": 14.123314065510597, "eval_loss": 0.4410754144191742, "eval_runtime": 11.8941, "eval_samples_per_second": 1597.771, "eval_steps_per_second": 33.294, "step": 36650 }, { "epoch": 14.142581888246628, "grad_norm": 0.45145711302757263, "learning_rate": 0.00014343121387283238, "loss": 0.3828955078125, "step": 36700 }, { "epoch": 14.142581888246628, "eval_loss": 0.43638166785240173, "eval_runtime": 11.8811, "eval_samples_per_second": 1599.522, "eval_steps_per_second": 33.33, "step": 36700 }, { "epoch": 14.16184971098266, "grad_norm": 0.4412001371383667, "learning_rate": 0.00014335414258188827, "loss": 0.38061290740966797, "step": 36750 }, { "epoch": 14.16184971098266, "eval_loss": 0.4303288459777832, "eval_runtime": 11.9008, "eval_samples_per_second": 1596.873, "eval_steps_per_second": 33.275, "step": 36750 }, { "epoch": 14.18111753371869, "grad_norm": 0.35142451524734497, "learning_rate": 0.00014327707129094413, "loss": 0.38024070739746096, "step": 36800 }, { "epoch": 14.18111753371869, "eval_loss": 0.42968130111694336, "eval_runtime": 11.8762, "eval_samples_per_second": 1600.172, "eval_steps_per_second": 33.344, "step": 36800 }, { "epoch": 14.20038535645472, "grad_norm": 0.3531448543071747, "learning_rate": 0.0001432, "loss": 0.37969287872314456, "step": 36850 }, { "epoch": 14.20038535645472, "eval_loss": 0.43430638313293457, "eval_runtime": 11.9052, "eval_samples_per_second": 1596.275, "eval_steps_per_second": 33.263, "step": 36850 }, { "epoch": 14.219653179190752, "grad_norm": 0.417361855506897, "learning_rate": 0.00014312292870905587, "loss": 0.3793895721435547, "step": 36900 }, { "epoch": 14.219653179190752, "eval_loss": 0.4432015120983124, "eval_runtime": 11.8801, "eval_samples_per_second": 1599.649, "eval_steps_per_second": 33.333, "step": 36900 }, { "epoch": 14.238921001926782, "grad_norm": 0.4530898630619049, "learning_rate": 0.00014304585741811175, "loss": 0.37840225219726564, "step": 36950 }, { "epoch": 14.238921001926782, "eval_loss": 0.4461728632450104, "eval_runtime": 11.8641, "eval_samples_per_second": 1601.81, "eval_steps_per_second": 33.378, "step": 36950 }, { "epoch": 14.258188824662813, "grad_norm": 0.40305784344673157, "learning_rate": 0.00014296878612716764, "loss": 0.38248989105224607, "step": 37000 }, { "epoch": 14.258188824662813, "eval_loss": 0.430886834859848, "eval_runtime": 11.8785, "eval_samples_per_second": 1599.869, "eval_steps_per_second": 33.338, "step": 37000 }, { "epoch": 14.277456647398845, "grad_norm": 0.3808079659938812, "learning_rate": 0.00014289171483622352, "loss": 0.3820143127441406, "step": 37050 }, { "epoch": 14.277456647398845, "eval_loss": 0.4320124089717865, "eval_runtime": 11.8903, "eval_samples_per_second": 1598.283, "eval_steps_per_second": 33.305, "step": 37050 }, { "epoch": 14.296724470134874, "grad_norm": 0.4340185821056366, "learning_rate": 0.00014281464354527938, "loss": 0.38225830078125, "step": 37100 }, { "epoch": 14.296724470134874, "eval_loss": 0.4346868097782135, "eval_runtime": 11.8808, "eval_samples_per_second": 1599.552, "eval_steps_per_second": 33.331, "step": 37100 }, { "epoch": 14.315992292870906, "grad_norm": 0.38903936743736267, "learning_rate": 0.00014273757225433527, "loss": 0.3738589096069336, "step": 37150 }, { "epoch": 14.315992292870906, "eval_loss": 0.43090885877609253, "eval_runtime": 11.8708, "eval_samples_per_second": 1600.903, "eval_steps_per_second": 33.359, "step": 37150 }, { "epoch": 14.335260115606937, "grad_norm": 0.43755948543548584, "learning_rate": 0.00014266050096339115, "loss": 0.37940147399902346, "step": 37200 }, { "epoch": 14.335260115606937, "eval_loss": 0.4312439560890198, "eval_runtime": 11.88, "eval_samples_per_second": 1599.663, "eval_steps_per_second": 33.333, "step": 37200 }, { "epoch": 14.354527938342967, "grad_norm": 0.3950025141239166, "learning_rate": 0.000142583429672447, "loss": 0.37618667602539063, "step": 37250 }, { "epoch": 14.354527938342967, "eval_loss": 0.42848825454711914, "eval_runtime": 11.8791, "eval_samples_per_second": 1599.778, "eval_steps_per_second": 33.336, "step": 37250 }, { "epoch": 14.373795761078998, "grad_norm": 0.4110814034938812, "learning_rate": 0.0001425063583815029, "loss": 0.38583080291748045, "step": 37300 }, { "epoch": 14.373795761078998, "eval_loss": 0.44013723731040955, "eval_runtime": 11.8743, "eval_samples_per_second": 1600.433, "eval_steps_per_second": 33.349, "step": 37300 }, { "epoch": 14.393063583815028, "grad_norm": 0.3987620174884796, "learning_rate": 0.00014242928709055878, "loss": 0.3769171142578125, "step": 37350 }, { "epoch": 14.393063583815028, "eval_loss": 0.4426681101322174, "eval_runtime": 11.8774, "eval_samples_per_second": 1600.014, "eval_steps_per_second": 33.341, "step": 37350 }, { "epoch": 14.41233140655106, "grad_norm": 0.3421357274055481, "learning_rate": 0.00014235221579961464, "loss": 0.3795610046386719, "step": 37400 }, { "epoch": 14.41233140655106, "eval_loss": 0.4427623450756073, "eval_runtime": 11.8769, "eval_samples_per_second": 1600.084, "eval_steps_per_second": 33.342, "step": 37400 }, { "epoch": 14.431599229287091, "grad_norm": 0.38396304845809937, "learning_rate": 0.00014227514450867052, "loss": 0.37682018280029295, "step": 37450 }, { "epoch": 14.431599229287091, "eval_loss": 0.43596968054771423, "eval_runtime": 11.8893, "eval_samples_per_second": 1598.411, "eval_steps_per_second": 33.307, "step": 37450 }, { "epoch": 14.45086705202312, "grad_norm": 0.37377476692199707, "learning_rate": 0.0001421980732177264, "loss": 0.3792469024658203, "step": 37500 }, { "epoch": 14.45086705202312, "eval_loss": 0.44341138005256653, "eval_runtime": 11.894, "eval_samples_per_second": 1597.78, "eval_steps_per_second": 33.294, "step": 37500 }, { "epoch": 14.470134874759152, "grad_norm": 0.42090049386024475, "learning_rate": 0.0001421210019267823, "loss": 0.3782391357421875, "step": 37550 }, { "epoch": 14.470134874759152, "eval_loss": 0.44600072503089905, "eval_runtime": 11.8826, "eval_samples_per_second": 1599.31, "eval_steps_per_second": 33.326, "step": 37550 }, { "epoch": 14.489402697495184, "grad_norm": 0.3489246368408203, "learning_rate": 0.00014204393063583818, "loss": 0.3764407348632812, "step": 37600 }, { "epoch": 14.489402697495184, "eval_loss": 0.4365929663181305, "eval_runtime": 11.8819, "eval_samples_per_second": 1599.414, "eval_steps_per_second": 33.328, "step": 37600 }, { "epoch": 14.508670520231213, "grad_norm": 0.48868289589881897, "learning_rate": 0.00014196685934489403, "loss": 0.37302169799804685, "step": 37650 }, { "epoch": 14.508670520231213, "eval_loss": 0.4403647482395172, "eval_runtime": 11.8722, "eval_samples_per_second": 1600.717, "eval_steps_per_second": 33.355, "step": 37650 }, { "epoch": 14.527938342967245, "grad_norm": 0.3843846619129181, "learning_rate": 0.0001418897880539499, "loss": 0.3758945846557617, "step": 37700 }, { "epoch": 14.527938342967245, "eval_loss": 0.4317283034324646, "eval_runtime": 11.8832, "eval_samples_per_second": 1599.238, "eval_steps_per_second": 33.324, "step": 37700 }, { "epoch": 14.547206165703276, "grad_norm": 0.4582479000091553, "learning_rate": 0.00014181271676300578, "loss": 0.3745765686035156, "step": 37750 }, { "epoch": 14.547206165703276, "eval_loss": 0.43730291724205017, "eval_runtime": 11.8793, "eval_samples_per_second": 1599.764, "eval_steps_per_second": 33.335, "step": 37750 }, { "epoch": 14.566473988439306, "grad_norm": 0.4013454020023346, "learning_rate": 0.00014173564547206166, "loss": 0.37971736907958986, "step": 37800 }, { "epoch": 14.566473988439306, "eval_loss": 0.4352198541164398, "eval_runtime": 11.894, "eval_samples_per_second": 1597.775, "eval_steps_per_second": 33.294, "step": 37800 }, { "epoch": 14.585741811175337, "grad_norm": 0.4029386639595032, "learning_rate": 0.00014165857418111755, "loss": 0.37762550354003904, "step": 37850 }, { "epoch": 14.585741811175337, "eval_loss": 0.43234410881996155, "eval_runtime": 11.8847, "eval_samples_per_second": 1599.035, "eval_steps_per_second": 33.32, "step": 37850 }, { "epoch": 14.605009633911369, "grad_norm": 0.3346317410469055, "learning_rate": 0.00014158150289017343, "loss": 0.38245803833007813, "step": 37900 }, { "epoch": 14.605009633911369, "eval_loss": 0.4329797625541687, "eval_runtime": 11.8809, "eval_samples_per_second": 1599.536, "eval_steps_per_second": 33.331, "step": 37900 }, { "epoch": 14.624277456647398, "grad_norm": 0.3934146761894226, "learning_rate": 0.00014150443159922932, "loss": 0.3765183639526367, "step": 37950 }, { "epoch": 14.624277456647398, "eval_loss": 0.42956236004829407, "eval_runtime": 11.876, "eval_samples_per_second": 1600.207, "eval_steps_per_second": 33.345, "step": 37950 }, { "epoch": 14.64354527938343, "grad_norm": 0.42616984248161316, "learning_rate": 0.00014142736030828517, "loss": 0.37680046081542967, "step": 38000 }, { "epoch": 14.64354527938343, "eval_loss": 0.4306581914424896, "eval_runtime": 11.8779, "eval_samples_per_second": 1599.946, "eval_steps_per_second": 33.339, "step": 38000 }, { "epoch": 14.662813102119461, "grad_norm": 0.4336816966533661, "learning_rate": 0.00014135028901734103, "loss": 0.3824346160888672, "step": 38050 }, { "epoch": 14.662813102119461, "eval_loss": 0.431427001953125, "eval_runtime": 11.8803, "eval_samples_per_second": 1599.62, "eval_steps_per_second": 33.332, "step": 38050 }, { "epoch": 14.68208092485549, "grad_norm": 0.39746788144111633, "learning_rate": 0.00014127321772639692, "loss": 0.3736263275146484, "step": 38100 }, { "epoch": 14.68208092485549, "eval_loss": 0.42889219522476196, "eval_runtime": 11.8879, "eval_samples_per_second": 1598.599, "eval_steps_per_second": 33.311, "step": 38100 }, { "epoch": 14.701348747591522, "grad_norm": 0.36066871881484985, "learning_rate": 0.0001411961464354528, "loss": 0.37296863555908205, "step": 38150 }, { "epoch": 14.701348747591522, "eval_loss": 0.4357042610645294, "eval_runtime": 11.889, "eval_samples_per_second": 1598.446, "eval_steps_per_second": 33.308, "step": 38150 }, { "epoch": 14.720616570327554, "grad_norm": 0.4162771701812744, "learning_rate": 0.0001411190751445087, "loss": 0.3744561386108398, "step": 38200 }, { "epoch": 14.720616570327554, "eval_loss": 0.43652740120887756, "eval_runtime": 11.8833, "eval_samples_per_second": 1599.226, "eval_steps_per_second": 33.324, "step": 38200 }, { "epoch": 14.739884393063583, "grad_norm": 0.38236865401268005, "learning_rate": 0.00014104200385356457, "loss": 0.3768023681640625, "step": 38250 }, { "epoch": 14.739884393063583, "eval_loss": 0.4331950843334198, "eval_runtime": 11.8745, "eval_samples_per_second": 1600.403, "eval_steps_per_second": 33.349, "step": 38250 }, { "epoch": 14.759152215799615, "grad_norm": 0.3618032932281494, "learning_rate": 0.00014096493256262043, "loss": 0.38070743560791015, "step": 38300 }, { "epoch": 14.759152215799615, "eval_loss": 0.4337736964225769, "eval_runtime": 11.8805, "eval_samples_per_second": 1599.595, "eval_steps_per_second": 33.332, "step": 38300 }, { "epoch": 14.778420038535646, "grad_norm": 0.4100356101989746, "learning_rate": 0.00014088786127167631, "loss": 0.371185302734375, "step": 38350 }, { "epoch": 14.778420038535646, "eval_loss": 0.44419893622398376, "eval_runtime": 11.8831, "eval_samples_per_second": 1599.245, "eval_steps_per_second": 33.325, "step": 38350 }, { "epoch": 14.797687861271676, "grad_norm": 0.32992130517959595, "learning_rate": 0.00014081078998073217, "loss": 0.3827793121337891, "step": 38400 }, { "epoch": 14.797687861271676, "eval_loss": 0.4368211627006531, "eval_runtime": 11.8649, "eval_samples_per_second": 1601.704, "eval_steps_per_second": 33.376, "step": 38400 }, { "epoch": 14.816955684007707, "grad_norm": 0.3754708468914032, "learning_rate": 0.00014073371868978806, "loss": 0.37352714538574217, "step": 38450 }, { "epoch": 14.816955684007707, "eval_loss": 0.43016132712364197, "eval_runtime": 11.873, "eval_samples_per_second": 1600.6, "eval_steps_per_second": 33.353, "step": 38450 }, { "epoch": 14.836223506743737, "grad_norm": 0.4609224796295166, "learning_rate": 0.00014065664739884394, "loss": 0.37465797424316405, "step": 38500 }, { "epoch": 14.836223506743737, "eval_loss": 0.4341723620891571, "eval_runtime": 11.8826, "eval_samples_per_second": 1599.309, "eval_steps_per_second": 33.326, "step": 38500 }, { "epoch": 14.855491329479769, "grad_norm": 0.44062355160713196, "learning_rate": 0.00014057957610789983, "loss": 0.3718267440795898, "step": 38550 }, { "epoch": 14.855491329479769, "eval_loss": 0.4427218735218048, "eval_runtime": 11.8677, "eval_samples_per_second": 1601.317, "eval_steps_per_second": 33.368, "step": 38550 }, { "epoch": 14.8747591522158, "grad_norm": 0.3496776521205902, "learning_rate": 0.00014050250481695568, "loss": 0.3780777740478516, "step": 38600 }, { "epoch": 14.8747591522158, "eval_loss": 0.4404853880405426, "eval_runtime": 11.8601, "eval_samples_per_second": 1602.353, "eval_steps_per_second": 33.389, "step": 38600 }, { "epoch": 14.89402697495183, "grad_norm": 0.3871830403804779, "learning_rate": 0.00014042543352601157, "loss": 0.3738559722900391, "step": 38650 }, { "epoch": 14.89402697495183, "eval_loss": 0.44334375858306885, "eval_runtime": 11.8645, "eval_samples_per_second": 1601.755, "eval_steps_per_second": 33.377, "step": 38650 }, { "epoch": 14.913294797687861, "grad_norm": 0.4026762545108795, "learning_rate": 0.00014034836223506745, "loss": 0.37500312805175784, "step": 38700 }, { "epoch": 14.913294797687861, "eval_loss": 0.43844181299209595, "eval_runtime": 11.8786, "eval_samples_per_second": 1599.855, "eval_steps_per_second": 33.337, "step": 38700 }, { "epoch": 14.932562620423893, "grad_norm": 0.3798488676548004, "learning_rate": 0.0001402712909441233, "loss": 0.3795243453979492, "step": 38750 }, { "epoch": 14.932562620423893, "eval_loss": 0.44505763053894043, "eval_runtime": 11.8927, "eval_samples_per_second": 1597.951, "eval_steps_per_second": 33.298, "step": 38750 }, { "epoch": 14.951830443159922, "grad_norm": 0.4112244248390198, "learning_rate": 0.0001401942196531792, "loss": 0.38222347259521483, "step": 38800 }, { "epoch": 14.951830443159922, "eval_loss": 0.4375055432319641, "eval_runtime": 11.8905, "eval_samples_per_second": 1598.249, "eval_steps_per_second": 33.304, "step": 38800 }, { "epoch": 14.971098265895954, "grad_norm": 0.3385605812072754, "learning_rate": 0.00014011714836223508, "loss": 0.3804395294189453, "step": 38850 }, { "epoch": 14.971098265895954, "eval_loss": 0.4361778199672699, "eval_runtime": 11.8938, "eval_samples_per_second": 1597.806, "eval_steps_per_second": 33.295, "step": 38850 }, { "epoch": 14.990366088631985, "grad_norm": 0.42839252948760986, "learning_rate": 0.00014004007707129094, "loss": 0.3759267807006836, "step": 38900 }, { "epoch": 14.990366088631985, "eval_loss": 0.44306430220603943, "eval_runtime": 11.874, "eval_samples_per_second": 1600.478, "eval_steps_per_second": 33.35, "step": 38900 }, { "epoch": 15.009633911368015, "grad_norm": 0.37814638018608093, "learning_rate": 0.00013996300578034682, "loss": 0.3811022186279297, "step": 38950 }, { "epoch": 15.009633911368015, "eval_loss": 0.43825221061706543, "eval_runtime": 11.8778, "eval_samples_per_second": 1599.96, "eval_steps_per_second": 33.34, "step": 38950 }, { "epoch": 15.028901734104046, "grad_norm": 0.3653123676776886, "learning_rate": 0.0001398859344894027, "loss": 0.36930694580078127, "step": 39000 }, { "epoch": 15.028901734104046, "eval_loss": 0.4372926950454712, "eval_runtime": 11.8871, "eval_samples_per_second": 1598.713, "eval_steps_per_second": 33.314, "step": 39000 }, { "epoch": 15.048169556840078, "grad_norm": 0.46872478723526, "learning_rate": 0.0001398088631984586, "loss": 0.3763685607910156, "step": 39050 }, { "epoch": 15.048169556840078, "eval_loss": 0.4245768189430237, "eval_runtime": 11.8887, "eval_samples_per_second": 1598.489, "eval_steps_per_second": 33.309, "step": 39050 }, { "epoch": 15.067437379576107, "grad_norm": 0.3406349718570709, "learning_rate": 0.00013973179190751445, "loss": 0.3775032043457031, "step": 39100 }, { "epoch": 15.067437379576107, "eval_loss": 0.42978355288505554, "eval_runtime": 11.8836, "eval_samples_per_second": 1599.18, "eval_steps_per_second": 33.323, "step": 39100 }, { "epoch": 15.086705202312139, "grad_norm": 0.387792706489563, "learning_rate": 0.00013965472061657034, "loss": 0.3743032073974609, "step": 39150 }, { "epoch": 15.086705202312139, "eval_loss": 0.43320927023887634, "eval_runtime": 11.8887, "eval_samples_per_second": 1598.487, "eval_steps_per_second": 33.309, "step": 39150 }, { "epoch": 15.10597302504817, "grad_norm": 0.3755277395248413, "learning_rate": 0.0001395776493256262, "loss": 0.37282238006591795, "step": 39200 }, { "epoch": 15.10597302504817, "eval_loss": 0.43271225690841675, "eval_runtime": 11.879, "eval_samples_per_second": 1599.805, "eval_steps_per_second": 33.336, "step": 39200 }, { "epoch": 15.1252408477842, "grad_norm": 0.36496102809906006, "learning_rate": 0.00013950057803468208, "loss": 0.37724536895751953, "step": 39250 }, { "epoch": 15.1252408477842, "eval_loss": 0.43432119488716125, "eval_runtime": 11.871, "eval_samples_per_second": 1600.877, "eval_steps_per_second": 33.359, "step": 39250 }, { "epoch": 15.144508670520231, "grad_norm": 0.3838697671890259, "learning_rate": 0.00013942350674373796, "loss": 0.37530208587646485, "step": 39300 }, { "epoch": 15.144508670520231, "eval_loss": 0.435111939907074, "eval_runtime": 11.8963, "eval_samples_per_second": 1597.475, "eval_steps_per_second": 33.288, "step": 39300 }, { "epoch": 15.163776493256263, "grad_norm": 0.3791313171386719, "learning_rate": 0.00013934643545279385, "loss": 0.3725900650024414, "step": 39350 }, { "epoch": 15.163776493256263, "eval_loss": 0.4309372305870056, "eval_runtime": 11.8912, "eval_samples_per_second": 1598.158, "eval_steps_per_second": 33.302, "step": 39350 }, { "epoch": 15.183044315992293, "grad_norm": 0.4660663604736328, "learning_rate": 0.00013926936416184973, "loss": 0.3757645797729492, "step": 39400 }, { "epoch": 15.183044315992293, "eval_loss": 0.4328477084636688, "eval_runtime": 11.8806, "eval_samples_per_second": 1599.578, "eval_steps_per_second": 33.332, "step": 39400 }, { "epoch": 15.202312138728324, "grad_norm": 0.5034533143043518, "learning_rate": 0.0001391922928709056, "loss": 0.37257965087890627, "step": 39450 }, { "epoch": 15.202312138728324, "eval_loss": 0.4282819628715515, "eval_runtime": 11.8895, "eval_samples_per_second": 1598.385, "eval_steps_per_second": 33.307, "step": 39450 }, { "epoch": 15.221579961464354, "grad_norm": 0.3505989611148834, "learning_rate": 0.00013911522157996145, "loss": 0.38132286071777344, "step": 39500 }, { "epoch": 15.221579961464354, "eval_loss": 0.4260183870792389, "eval_runtime": 11.8845, "eval_samples_per_second": 1599.057, "eval_steps_per_second": 33.321, "step": 39500 }, { "epoch": 15.240847784200385, "grad_norm": 0.37635183334350586, "learning_rate": 0.00013903815028901733, "loss": 0.3783117294311523, "step": 39550 }, { "epoch": 15.240847784200385, "eval_loss": 0.4350746273994446, "eval_runtime": 11.8818, "eval_samples_per_second": 1599.424, "eval_steps_per_second": 33.328, "step": 39550 }, { "epoch": 15.260115606936417, "grad_norm": 0.40330711007118225, "learning_rate": 0.00013896107899807322, "loss": 0.3761822509765625, "step": 39600 }, { "epoch": 15.260115606936417, "eval_loss": 0.42514094710350037, "eval_runtime": 11.8696, "eval_samples_per_second": 1601.059, "eval_steps_per_second": 33.362, "step": 39600 }, { "epoch": 15.279383429672446, "grad_norm": 0.4602627754211426, "learning_rate": 0.0001388840077071291, "loss": 0.38019187927246095, "step": 39650 }, { "epoch": 15.279383429672446, "eval_loss": 0.4334520101547241, "eval_runtime": 11.8991, "eval_samples_per_second": 1597.092, "eval_steps_per_second": 33.28, "step": 39650 }, { "epoch": 15.298651252408478, "grad_norm": 0.3472575545310974, "learning_rate": 0.000138806936416185, "loss": 0.37660491943359375, "step": 39700 }, { "epoch": 15.298651252408478, "eval_loss": 0.43516436219215393, "eval_runtime": 11.8815, "eval_samples_per_second": 1599.463, "eval_steps_per_second": 33.329, "step": 39700 }, { "epoch": 15.31791907514451, "grad_norm": 0.3947117328643799, "learning_rate": 0.00013872986512524087, "loss": 0.374868278503418, "step": 39750 }, { "epoch": 15.31791907514451, "eval_loss": 0.43646037578582764, "eval_runtime": 11.8834, "eval_samples_per_second": 1599.21, "eval_steps_per_second": 33.324, "step": 39750 }, { "epoch": 15.337186897880539, "grad_norm": 0.46189001202583313, "learning_rate": 0.00013865279383429673, "loss": 0.37289535522460937, "step": 39800 }, { "epoch": 15.337186897880539, "eval_loss": 0.43620336055755615, "eval_runtime": 11.8907, "eval_samples_per_second": 1598.22, "eval_steps_per_second": 33.303, "step": 39800 }, { "epoch": 15.35645472061657, "grad_norm": 0.3841976523399353, "learning_rate": 0.00013857572254335262, "loss": 0.3794962692260742, "step": 39850 }, { "epoch": 15.35645472061657, "eval_loss": 0.4291752278804779, "eval_runtime": 11.883, "eval_samples_per_second": 1599.259, "eval_steps_per_second": 33.325, "step": 39850 }, { "epoch": 15.375722543352602, "grad_norm": 0.3878290057182312, "learning_rate": 0.00013849865125240847, "loss": 0.3694114685058594, "step": 39900 }, { "epoch": 15.375722543352602, "eval_loss": 0.428070604801178, "eval_runtime": 11.8899, "eval_samples_per_second": 1598.328, "eval_steps_per_second": 33.306, "step": 39900 }, { "epoch": 15.394990366088631, "grad_norm": 0.37133467197418213, "learning_rate": 0.00013842157996146436, "loss": 0.3685879898071289, "step": 39950 }, { "epoch": 15.394990366088631, "eval_loss": 0.433061420917511, "eval_runtime": 11.8774, "eval_samples_per_second": 1600.019, "eval_steps_per_second": 33.341, "step": 39950 }, { "epoch": 15.414258188824663, "grad_norm": 0.39582914113998413, "learning_rate": 0.00013834450867052024, "loss": 0.3827000045776367, "step": 40000 }, { "epoch": 15.414258188824663, "eval_loss": 0.42989546060562134, "eval_runtime": 11.888, "eval_samples_per_second": 1598.589, "eval_steps_per_second": 33.311, "step": 40000 }, { "epoch": 15.433526011560694, "grad_norm": 0.33938807249069214, "learning_rate": 0.0001382674373795761, "loss": 0.374171142578125, "step": 40050 }, { "epoch": 15.433526011560694, "eval_loss": 0.4306272864341736, "eval_runtime": 11.9062, "eval_samples_per_second": 1596.147, "eval_steps_per_second": 33.26, "step": 40050 }, { "epoch": 15.452793834296724, "grad_norm": 0.36955955624580383, "learning_rate": 0.000138190366088632, "loss": 0.37653350830078125, "step": 40100 }, { "epoch": 15.452793834296724, "eval_loss": 0.42696696519851685, "eval_runtime": 11.8836, "eval_samples_per_second": 1599.178, "eval_steps_per_second": 33.323, "step": 40100 }, { "epoch": 15.472061657032755, "grad_norm": 0.4573238492012024, "learning_rate": 0.00013811329479768787, "loss": 0.37626487731933594, "step": 40150 }, { "epoch": 15.472061657032755, "eval_loss": 0.4341013431549072, "eval_runtime": 11.893, "eval_samples_per_second": 1597.92, "eval_steps_per_second": 33.297, "step": 40150 }, { "epoch": 15.491329479768787, "grad_norm": 0.4089154601097107, "learning_rate": 0.00013803622350674376, "loss": 0.37144874572753905, "step": 40200 }, { "epoch": 15.491329479768787, "eval_loss": 0.43066081404685974, "eval_runtime": 11.9081, "eval_samples_per_second": 1595.895, "eval_steps_per_second": 33.255, "step": 40200 }, { "epoch": 15.510597302504816, "grad_norm": 0.3453061282634735, "learning_rate": 0.00013795915221579961, "loss": 0.3749897003173828, "step": 40250 }, { "epoch": 15.510597302504816, "eval_loss": 0.44410696625709534, "eval_runtime": 11.9506, "eval_samples_per_second": 1590.218, "eval_steps_per_second": 33.137, "step": 40250 }, { "epoch": 15.529865125240848, "grad_norm": 0.4174209237098694, "learning_rate": 0.0001378820809248555, "loss": 0.3763295745849609, "step": 40300 }, { "epoch": 15.529865125240848, "eval_loss": 0.4337541162967682, "eval_runtime": 11.8836, "eval_samples_per_second": 1599.18, "eval_steps_per_second": 33.323, "step": 40300 }, { "epoch": 15.54913294797688, "grad_norm": 0.3294774889945984, "learning_rate": 0.00013780500963391136, "loss": 0.37364242553710936, "step": 40350 }, { "epoch": 15.54913294797688, "eval_loss": 0.4321858882904053, "eval_runtime": 11.8813, "eval_samples_per_second": 1599.486, "eval_steps_per_second": 33.33, "step": 40350 }, { "epoch": 15.568400770712909, "grad_norm": 0.34207165241241455, "learning_rate": 0.00013772793834296724, "loss": 0.37625545501708985, "step": 40400 }, { "epoch": 15.568400770712909, "eval_loss": 0.4468172788619995, "eval_runtime": 11.8886, "eval_samples_per_second": 1598.51, "eval_steps_per_second": 33.309, "step": 40400 }, { "epoch": 15.58766859344894, "grad_norm": 0.36835983395576477, "learning_rate": 0.00013765086705202313, "loss": 0.3699828338623047, "step": 40450 }, { "epoch": 15.58766859344894, "eval_loss": 0.4322862923145294, "eval_runtime": 11.8785, "eval_samples_per_second": 1599.86, "eval_steps_per_second": 33.337, "step": 40450 }, { "epoch": 15.606936416184972, "grad_norm": 0.3465955853462219, "learning_rate": 0.000137573795761079, "loss": 0.3785712432861328, "step": 40500 }, { "epoch": 15.606936416184972, "eval_loss": 0.42690226435661316, "eval_runtime": 11.8883, "eval_samples_per_second": 1598.544, "eval_steps_per_second": 33.31, "step": 40500 }, { "epoch": 15.626204238921002, "grad_norm": 0.41579464077949524, "learning_rate": 0.0001374967244701349, "loss": 0.37517547607421875, "step": 40550 }, { "epoch": 15.626204238921002, "eval_loss": 0.43364417552948, "eval_runtime": 11.8799, "eval_samples_per_second": 1599.673, "eval_steps_per_second": 33.334, "step": 40550 }, { "epoch": 15.645472061657033, "grad_norm": 0.40581342577934265, "learning_rate": 0.00013741965317919075, "loss": 0.37625831604003906, "step": 40600 }, { "epoch": 15.645472061657033, "eval_loss": 0.4265032410621643, "eval_runtime": 11.8952, "eval_samples_per_second": 1597.626, "eval_steps_per_second": 33.291, "step": 40600 }, { "epoch": 15.664739884393063, "grad_norm": 0.3297688066959381, "learning_rate": 0.0001373425818882466, "loss": 0.3771537780761719, "step": 40650 }, { "epoch": 15.664739884393063, "eval_loss": 0.433337539434433, "eval_runtime": 11.8773, "eval_samples_per_second": 1600.023, "eval_steps_per_second": 33.341, "step": 40650 }, { "epoch": 15.684007707129094, "grad_norm": 0.5016096234321594, "learning_rate": 0.0001372655105973025, "loss": 0.3686669921875, "step": 40700 }, { "epoch": 15.684007707129094, "eval_loss": 0.42902764678001404, "eval_runtime": 11.8794, "eval_samples_per_second": 1599.743, "eval_steps_per_second": 33.335, "step": 40700 }, { "epoch": 15.703275529865126, "grad_norm": 0.35741814970970154, "learning_rate": 0.00013718843930635838, "loss": 0.3745785140991211, "step": 40750 }, { "epoch": 15.703275529865126, "eval_loss": 0.42540350556373596, "eval_runtime": 11.9107, "eval_samples_per_second": 1595.545, "eval_steps_per_second": 33.248, "step": 40750 }, { "epoch": 15.722543352601155, "grad_norm": 0.39775365591049194, "learning_rate": 0.00013711136801541427, "loss": 0.3728546905517578, "step": 40800 }, { "epoch": 15.722543352601155, "eval_loss": 0.43384385108947754, "eval_runtime": 11.8836, "eval_samples_per_second": 1599.183, "eval_steps_per_second": 33.323, "step": 40800 }, { "epoch": 15.741811175337187, "grad_norm": 0.3606812655925751, "learning_rate": 0.00013703429672447015, "loss": 0.3712510299682617, "step": 40850 }, { "epoch": 15.741811175337187, "eval_loss": 0.4305489659309387, "eval_runtime": 11.8814, "eval_samples_per_second": 1599.47, "eval_steps_per_second": 33.329, "step": 40850 }, { "epoch": 15.761078998073218, "grad_norm": 0.3871927261352539, "learning_rate": 0.00013695722543352604, "loss": 0.3710041046142578, "step": 40900 }, { "epoch": 15.761078998073218, "eval_loss": 0.43041518330574036, "eval_runtime": 11.8828, "eval_samples_per_second": 1599.291, "eval_steps_per_second": 33.326, "step": 40900 }, { "epoch": 15.780346820809248, "grad_norm": 0.4051716923713684, "learning_rate": 0.0001368801541425819, "loss": 0.3748811340332031, "step": 40950 }, { "epoch": 15.780346820809248, "eval_loss": 0.43674615025520325, "eval_runtime": 12.0336, "eval_samples_per_second": 1579.244, "eval_steps_per_second": 32.908, "step": 40950 }, { "epoch": 15.79961464354528, "grad_norm": 0.37610912322998047, "learning_rate": 0.00013680308285163775, "loss": 0.373887825012207, "step": 41000 }, { "epoch": 15.79961464354528, "eval_loss": 0.4259445071220398, "eval_runtime": 12.2746, "eval_samples_per_second": 1548.241, "eval_steps_per_second": 32.262, "step": 41000 }, { "epoch": 15.81888246628131, "grad_norm": 0.3993060290813446, "learning_rate": 0.00013672601156069364, "loss": 0.3735411834716797, "step": 41050 }, { "epoch": 15.81888246628131, "eval_loss": 0.4301701784133911, "eval_runtime": 12.0746, "eval_samples_per_second": 1573.883, "eval_steps_per_second": 32.796, "step": 41050 }, { "epoch": 15.83815028901734, "grad_norm": 0.3452811539173126, "learning_rate": 0.00013664894026974952, "loss": 0.36894729614257815, "step": 41100 }, { "epoch": 15.83815028901734, "eval_loss": 0.4360896646976471, "eval_runtime": 12.7103, "eval_samples_per_second": 1495.166, "eval_steps_per_second": 31.156, "step": 41100 }, { "epoch": 15.857418111753372, "grad_norm": 0.375769704580307, "learning_rate": 0.0001365718689788054, "loss": 0.36541015625, "step": 41150 }, { "epoch": 15.857418111753372, "eval_loss": 0.4356953203678131, "eval_runtime": 13.4497, "eval_samples_per_second": 1412.973, "eval_steps_per_second": 29.443, "step": 41150 }, { "epoch": 15.876685934489403, "grad_norm": 0.34270644187927246, "learning_rate": 0.0001364947976878613, "loss": 0.37390499114990233, "step": 41200 }, { "epoch": 15.876685934489403, "eval_loss": 0.4285966157913208, "eval_runtime": 13.1696, "eval_samples_per_second": 1443.017, "eval_steps_per_second": 30.069, "step": 41200 }, { "epoch": 15.895953757225433, "grad_norm": 0.34214526414871216, "learning_rate": 0.00013641772639691715, "loss": 0.3765554428100586, "step": 41250 }, { "epoch": 15.895953757225433, "eval_loss": 0.43224433064460754, "eval_runtime": 13.5011, "eval_samples_per_second": 1407.588, "eval_steps_per_second": 29.331, "step": 41250 }, { "epoch": 15.915221579961464, "grad_norm": 0.35899817943573, "learning_rate": 0.00013634065510597303, "loss": 0.37418479919433595, "step": 41300 }, { "epoch": 15.915221579961464, "eval_loss": 0.43924105167388916, "eval_runtime": 13.5234, "eval_samples_per_second": 1405.272, "eval_steps_per_second": 29.283, "step": 41300 }, { "epoch": 15.934489402697496, "grad_norm": 0.4219180643558502, "learning_rate": 0.0001362635838150289, "loss": 0.3805815505981445, "step": 41350 }, { "epoch": 15.934489402697496, "eval_loss": 0.43903473019599915, "eval_runtime": 13.8904, "eval_samples_per_second": 1368.143, "eval_steps_per_second": 28.509, "step": 41350 }, { "epoch": 15.953757225433526, "grad_norm": 0.37344813346862793, "learning_rate": 0.00013618651252408478, "loss": 0.3752391815185547, "step": 41400 }, { "epoch": 15.953757225433526, "eval_loss": 0.42890122532844543, "eval_runtime": 14.1314, "eval_samples_per_second": 1344.807, "eval_steps_per_second": 28.023, "step": 41400 }, { "epoch": 15.973025048169557, "grad_norm": 0.4036698043346405, "learning_rate": 0.00013610944123314066, "loss": 0.3756585693359375, "step": 41450 }, { "epoch": 15.973025048169557, "eval_loss": 0.42962875962257385, "eval_runtime": 13.494, "eval_samples_per_second": 1408.328, "eval_steps_per_second": 29.346, "step": 41450 }, { "epoch": 15.992292870905588, "grad_norm": 0.47626256942749023, "learning_rate": 0.00013603236994219655, "loss": 0.3763775634765625, "step": 41500 }, { "epoch": 15.992292870905588, "eval_loss": 0.42747271060943604, "eval_runtime": 13.6321, "eval_samples_per_second": 1394.057, "eval_steps_per_second": 29.049, "step": 41500 }, { "epoch": 16.01156069364162, "grad_norm": 0.38398414850234985, "learning_rate": 0.0001359552986512524, "loss": 0.37544708251953124, "step": 41550 }, { "epoch": 16.01156069364162, "eval_loss": 0.432369589805603, "eval_runtime": 13.7131, "eval_samples_per_second": 1385.833, "eval_steps_per_second": 28.878, "step": 41550 }, { "epoch": 16.03082851637765, "grad_norm": 0.37174907326698303, "learning_rate": 0.0001358782273603083, "loss": 0.3774713134765625, "step": 41600 }, { "epoch": 16.03082851637765, "eval_loss": 0.43593958020210266, "eval_runtime": 13.9827, "eval_samples_per_second": 1359.11, "eval_steps_per_second": 28.321, "step": 41600 }, { "epoch": 16.05009633911368, "grad_norm": 0.34926530718803406, "learning_rate": 0.00013580115606936417, "loss": 0.37779754638671875, "step": 41650 }, { "epoch": 16.05009633911368, "eval_loss": 0.42556384205818176, "eval_runtime": 13.4068, "eval_samples_per_second": 1417.491, "eval_steps_per_second": 29.537, "step": 41650 }, { "epoch": 16.069364161849713, "grad_norm": 0.38328316807746887, "learning_rate": 0.00013572408477842006, "loss": 0.3739446258544922, "step": 41700 }, { "epoch": 16.069364161849713, "eval_loss": 0.4237867593765259, "eval_runtime": 14.319, "eval_samples_per_second": 1327.19, "eval_steps_per_second": 27.656, "step": 41700 }, { "epoch": 16.088631984585742, "grad_norm": 0.39713814854621887, "learning_rate": 0.00013564701348747592, "loss": 0.36322330474853515, "step": 41750 }, { "epoch": 16.088631984585742, "eval_loss": 0.42249420285224915, "eval_runtime": 14.1872, "eval_samples_per_second": 1339.515, "eval_steps_per_second": 27.912, "step": 41750 }, { "epoch": 16.107899807321772, "grad_norm": 0.41743654012680054, "learning_rate": 0.0001355699421965318, "loss": 0.3706114959716797, "step": 41800 }, { "epoch": 16.107899807321772, "eval_loss": 0.43168193101882935, "eval_runtime": 13.4506, "eval_samples_per_second": 1412.872, "eval_steps_per_second": 29.441, "step": 41800 }, { "epoch": 16.127167630057805, "grad_norm": 0.39899131655693054, "learning_rate": 0.00013549287090558766, "loss": 0.3676503753662109, "step": 41850 }, { "epoch": 16.127167630057805, "eval_loss": 0.43259158730506897, "eval_runtime": 13.3884, "eval_samples_per_second": 1419.438, "eval_steps_per_second": 29.578, "step": 41850 }, { "epoch": 16.146435452793835, "grad_norm": 0.38755011558532715, "learning_rate": 0.00013541579961464355, "loss": 0.37317375183105467, "step": 41900 }, { "epoch": 16.146435452793835, "eval_loss": 0.4338461756706238, "eval_runtime": 13.1722, "eval_samples_per_second": 1442.739, "eval_steps_per_second": 30.063, "step": 41900 }, { "epoch": 16.165703275529864, "grad_norm": 0.3416798412799835, "learning_rate": 0.00013533872832369943, "loss": 0.37428306579589843, "step": 41950 }, { "epoch": 16.165703275529864, "eval_loss": 0.42191070318222046, "eval_runtime": 13.5922, "eval_samples_per_second": 1398.151, "eval_steps_per_second": 29.134, "step": 41950 }, { "epoch": 16.184971098265898, "grad_norm": 0.3851470947265625, "learning_rate": 0.00013526165703275531, "loss": 0.37133007049560546, "step": 42000 }, { "epoch": 16.184971098265898, "eval_loss": 0.43319636583328247, "eval_runtime": 13.8835, "eval_samples_per_second": 1368.817, "eval_steps_per_second": 28.523, "step": 42000 }, { "epoch": 16.204238921001927, "grad_norm": 0.4078589081764221, "learning_rate": 0.0001351845857418112, "loss": 0.37536376953125, "step": 42050 }, { "epoch": 16.204238921001927, "eval_loss": 0.43929001688957214, "eval_runtime": 13.8985, "eval_samples_per_second": 1367.342, "eval_steps_per_second": 28.492, "step": 42050 }, { "epoch": 16.223506743737957, "grad_norm": 0.3911657929420471, "learning_rate": 0.00013510751445086706, "loss": 0.36996772766113284, "step": 42100 }, { "epoch": 16.223506743737957, "eval_loss": 0.439253032207489, "eval_runtime": 13.8205, "eval_samples_per_second": 1375.058, "eval_steps_per_second": 28.653, "step": 42100 }, { "epoch": 16.24277456647399, "grad_norm": 0.3930109739303589, "learning_rate": 0.00013503044315992292, "loss": 0.37351905822753906, "step": 42150 }, { "epoch": 16.24277456647399, "eval_loss": 0.4297162592411041, "eval_runtime": 13.6543, "eval_samples_per_second": 1391.8, "eval_steps_per_second": 29.002, "step": 42150 }, { "epoch": 16.26204238921002, "grad_norm": 0.4032129645347595, "learning_rate": 0.0001349533718689788, "loss": 0.3805409622192383, "step": 42200 }, { "epoch": 16.26204238921002, "eval_loss": 0.43879228830337524, "eval_runtime": 14.3874, "eval_samples_per_second": 1320.876, "eval_steps_per_second": 27.524, "step": 42200 }, { "epoch": 16.28131021194605, "grad_norm": 0.360784113407135, "learning_rate": 0.00013487630057803469, "loss": 0.3733913803100586, "step": 42250 }, { "epoch": 16.28131021194605, "eval_loss": 0.43226128816604614, "eval_runtime": 13.7107, "eval_samples_per_second": 1386.073, "eval_steps_per_second": 28.883, "step": 42250 }, { "epoch": 16.30057803468208, "grad_norm": 0.3691336214542389, "learning_rate": 0.00013479922928709057, "loss": 0.3696689987182617, "step": 42300 }, { "epoch": 16.30057803468208, "eval_loss": 0.4356008768081665, "eval_runtime": 13.5115, "eval_samples_per_second": 1406.508, "eval_steps_per_second": 29.308, "step": 42300 }, { "epoch": 16.319845857418112, "grad_norm": 0.45689713954925537, "learning_rate": 0.00013472215799614645, "loss": 0.3687092208862305, "step": 42350 }, { "epoch": 16.319845857418112, "eval_loss": 0.4309006631374359, "eval_runtime": 14.2593, "eval_samples_per_second": 1332.749, "eval_steps_per_second": 27.771, "step": 42350 }, { "epoch": 16.339113680154142, "grad_norm": 0.4264954626560211, "learning_rate": 0.00013464508670520234, "loss": 0.3683721923828125, "step": 42400 }, { "epoch": 16.339113680154142, "eval_loss": 0.4383135735988617, "eval_runtime": 13.8441, "eval_samples_per_second": 1372.713, "eval_steps_per_second": 28.604, "step": 42400 }, { "epoch": 16.358381502890172, "grad_norm": 0.3785674273967743, "learning_rate": 0.0001345680154142582, "loss": 0.3650739288330078, "step": 42450 }, { "epoch": 16.358381502890172, "eval_loss": 0.432735800743103, "eval_runtime": 13.7429, "eval_samples_per_second": 1382.824, "eval_steps_per_second": 28.815, "step": 42450 }, { "epoch": 16.377649325626205, "grad_norm": 0.4029272496700287, "learning_rate": 0.00013449094412331406, "loss": 0.3725774383544922, "step": 42500 }, { "epoch": 16.377649325626205, "eval_loss": 0.42395177483558655, "eval_runtime": 13.0262, "eval_samples_per_second": 1458.906, "eval_steps_per_second": 30.4, "step": 42500 }, { "epoch": 16.396917148362235, "grad_norm": 0.3576814532279968, "learning_rate": 0.00013441387283236994, "loss": 0.3735223388671875, "step": 42550 }, { "epoch": 16.396917148362235, "eval_loss": 0.4308323562145233, "eval_runtime": 13.5358, "eval_samples_per_second": 1403.986, "eval_steps_per_second": 29.256, "step": 42550 }, { "epoch": 16.416184971098264, "grad_norm": 0.49241942167282104, "learning_rate": 0.00013433680154142583, "loss": 0.3728443145751953, "step": 42600 }, { "epoch": 16.416184971098264, "eval_loss": 0.42887529730796814, "eval_runtime": 13.6202, "eval_samples_per_second": 1395.276, "eval_steps_per_second": 29.074, "step": 42600 }, { "epoch": 16.435452793834298, "grad_norm": 0.4092554748058319, "learning_rate": 0.0001342597302504817, "loss": 0.36810256958007814, "step": 42650 }, { "epoch": 16.435452793834298, "eval_loss": 0.4417731463909149, "eval_runtime": 13.6783, "eval_samples_per_second": 1389.356, "eval_steps_per_second": 28.951, "step": 42650 }, { "epoch": 16.454720616570327, "grad_norm": 0.37867680191993713, "learning_rate": 0.0001341826589595376, "loss": 0.3690605163574219, "step": 42700 }, { "epoch": 16.454720616570327, "eval_loss": 0.4286178946495056, "eval_runtime": 13.8104, "eval_samples_per_second": 1376.06, "eval_steps_per_second": 28.674, "step": 42700 }, { "epoch": 16.473988439306357, "grad_norm": 0.39600369334220886, "learning_rate": 0.00013410558766859345, "loss": 0.3694684600830078, "step": 42750 }, { "epoch": 16.473988439306357, "eval_loss": 0.4360545873641968, "eval_runtime": 13.1379, "eval_samples_per_second": 1446.502, "eval_steps_per_second": 30.142, "step": 42750 }, { "epoch": 16.49325626204239, "grad_norm": 0.33108261227607727, "learning_rate": 0.00013402851637764934, "loss": 0.37401611328125, "step": 42800 }, { "epoch": 16.49325626204239, "eval_loss": 0.428684264421463, "eval_runtime": 13.8291, "eval_samples_per_second": 1374.204, "eval_steps_per_second": 28.635, "step": 42800 }, { "epoch": 16.51252408477842, "grad_norm": 0.44455987215042114, "learning_rate": 0.0001339514450867052, "loss": 0.37491497039794924, "step": 42850 }, { "epoch": 16.51252408477842, "eval_loss": 0.433973491191864, "eval_runtime": 13.8954, "eval_samples_per_second": 1367.647, "eval_steps_per_second": 28.499, "step": 42850 }, { "epoch": 16.53179190751445, "grad_norm": 0.36617428064346313, "learning_rate": 0.00013387437379576108, "loss": 0.36596900939941407, "step": 42900 }, { "epoch": 16.53179190751445, "eval_loss": 0.4438251852989197, "eval_runtime": 13.7359, "eval_samples_per_second": 1383.526, "eval_steps_per_second": 28.83, "step": 42900 }, { "epoch": 16.551059730250483, "grad_norm": 0.36309975385665894, "learning_rate": 0.00013379730250481697, "loss": 0.37040008544921876, "step": 42950 }, { "epoch": 16.551059730250483, "eval_loss": 0.4257933497428894, "eval_runtime": 13.8589, "eval_samples_per_second": 1371.245, "eval_steps_per_second": 28.574, "step": 42950 }, { "epoch": 16.570327552986512, "grad_norm": 0.33942168951034546, "learning_rate": 0.00013372023121387285, "loss": 0.36859466552734377, "step": 43000 }, { "epoch": 16.570327552986512, "eval_loss": 0.4427822530269623, "eval_runtime": 13.5644, "eval_samples_per_second": 1401.022, "eval_steps_per_second": 29.194, "step": 43000 }, { "epoch": 16.589595375722542, "grad_norm": 0.35855618119239807, "learning_rate": 0.0001336431599229287, "loss": 0.3679096221923828, "step": 43050 }, { "epoch": 16.589595375722542, "eval_loss": 0.4357832968235016, "eval_runtime": 14.1078, "eval_samples_per_second": 1347.057, "eval_steps_per_second": 28.07, "step": 43050 }, { "epoch": 16.608863198458575, "grad_norm": 0.36088114976882935, "learning_rate": 0.0001335660886319846, "loss": 0.37289794921875, "step": 43100 }, { "epoch": 16.608863198458575, "eval_loss": 0.4309292137622833, "eval_runtime": 13.9138, "eval_samples_per_second": 1365.837, "eval_steps_per_second": 28.461, "step": 43100 }, { "epoch": 16.628131021194605, "grad_norm": 0.4330693483352661, "learning_rate": 0.00013348901734104048, "loss": 0.3688077545166016, "step": 43150 }, { "epoch": 16.628131021194605, "eval_loss": 0.431353360414505, "eval_runtime": 13.4979, "eval_samples_per_second": 1407.92, "eval_steps_per_second": 29.338, "step": 43150 }, { "epoch": 16.647398843930635, "grad_norm": 0.4000328779220581, "learning_rate": 0.00013341194605009634, "loss": 0.37062530517578124, "step": 43200 }, { "epoch": 16.647398843930635, "eval_loss": 0.4407671093940735, "eval_runtime": 13.5697, "eval_samples_per_second": 1400.477, "eval_steps_per_second": 29.183, "step": 43200 }, { "epoch": 16.666666666666668, "grad_norm": 0.35663479566574097, "learning_rate": 0.00013333487475915222, "loss": 0.37038330078125, "step": 43250 }, { "epoch": 16.666666666666668, "eval_loss": 0.4331173896789551, "eval_runtime": 13.6728, "eval_samples_per_second": 1389.913, "eval_steps_per_second": 28.963, "step": 43250 }, { "epoch": 16.685934489402698, "grad_norm": 0.3871069848537445, "learning_rate": 0.0001332578034682081, "loss": 0.3695670700073242, "step": 43300 }, { "epoch": 16.685934489402698, "eval_loss": 0.42923298478126526, "eval_runtime": 13.3286, "eval_samples_per_second": 1425.802, "eval_steps_per_second": 29.71, "step": 43300 }, { "epoch": 16.705202312138727, "grad_norm": 0.3629269599914551, "learning_rate": 0.00013318073217726396, "loss": 0.37100929260253906, "step": 43350 }, { "epoch": 16.705202312138727, "eval_loss": 0.43239057064056396, "eval_runtime": 13.4557, "eval_samples_per_second": 1412.334, "eval_steps_per_second": 29.43, "step": 43350 }, { "epoch": 16.72447013487476, "grad_norm": 0.3986419141292572, "learning_rate": 0.00013310366088631985, "loss": 0.368055419921875, "step": 43400 }, { "epoch": 16.72447013487476, "eval_loss": 0.43369826674461365, "eval_runtime": 13.6092, "eval_samples_per_second": 1396.411, "eval_steps_per_second": 29.098, "step": 43400 }, { "epoch": 16.74373795761079, "grad_norm": 0.4003140926361084, "learning_rate": 0.00013302658959537573, "loss": 0.36834556579589844, "step": 43450 }, { "epoch": 16.74373795761079, "eval_loss": 0.4296095371246338, "eval_runtime": 13.5154, "eval_samples_per_second": 1406.096, "eval_steps_per_second": 29.3, "step": 43450 }, { "epoch": 16.76300578034682, "grad_norm": 0.41678041219711304, "learning_rate": 0.00013294951830443162, "loss": 0.37308555603027344, "step": 43500 }, { "epoch": 16.76300578034682, "eval_loss": 0.4341019093990326, "eval_runtime": 13.0995, "eval_samples_per_second": 1450.747, "eval_steps_per_second": 30.23, "step": 43500 }, { "epoch": 16.782273603082853, "grad_norm": 0.3736550807952881, "learning_rate": 0.00013287244701348748, "loss": 0.3714396286010742, "step": 43550 }, { "epoch": 16.782273603082853, "eval_loss": 0.4261009693145752, "eval_runtime": 13.4049, "eval_samples_per_second": 1417.695, "eval_steps_per_second": 29.542, "step": 43550 }, { "epoch": 16.801541425818883, "grad_norm": 0.41450366377830505, "learning_rate": 0.00013279537572254336, "loss": 0.36949851989746096, "step": 43600 }, { "epoch": 16.801541425818883, "eval_loss": 0.43403947353363037, "eval_runtime": 13.9237, "eval_samples_per_second": 1364.87, "eval_steps_per_second": 28.441, "step": 43600 }, { "epoch": 16.820809248554912, "grad_norm": 0.3693954050540924, "learning_rate": 0.00013271830443159922, "loss": 0.36407970428466796, "step": 43650 }, { "epoch": 16.820809248554912, "eval_loss": 0.4251818060874939, "eval_runtime": 13.5474, "eval_samples_per_second": 1402.779, "eval_steps_per_second": 29.231, "step": 43650 }, { "epoch": 16.840077071290946, "grad_norm": 0.36591336131095886, "learning_rate": 0.0001326412331406551, "loss": 0.3660692596435547, "step": 43700 }, { "epoch": 16.840077071290946, "eval_loss": 0.4234589636325836, "eval_runtime": 13.6192, "eval_samples_per_second": 1395.381, "eval_steps_per_second": 29.077, "step": 43700 }, { "epoch": 16.859344894026975, "grad_norm": 0.3535354733467102, "learning_rate": 0.000132564161849711, "loss": 0.37185432434082033, "step": 43750 }, { "epoch": 16.859344894026975, "eval_loss": 0.432957261800766, "eval_runtime": 13.8306, "eval_samples_per_second": 1374.058, "eval_steps_per_second": 28.632, "step": 43750 }, { "epoch": 16.878612716763005, "grad_norm": 0.4135783314704895, "learning_rate": 0.00013248709055876687, "loss": 0.3621230697631836, "step": 43800 }, { "epoch": 16.878612716763005, "eval_loss": 0.4323079288005829, "eval_runtime": 13.6729, "eval_samples_per_second": 1389.904, "eval_steps_per_second": 28.962, "step": 43800 }, { "epoch": 16.897880539499038, "grad_norm": 0.38264596462249756, "learning_rate": 0.00013241001926782276, "loss": 0.3785008239746094, "step": 43850 }, { "epoch": 16.897880539499038, "eval_loss": 0.4330856204032898, "eval_runtime": 14.3267, "eval_samples_per_second": 1326.479, "eval_steps_per_second": 27.641, "step": 43850 }, { "epoch": 16.917148362235068, "grad_norm": 0.3724619150161743, "learning_rate": 0.00013233294797687864, "loss": 0.36570465087890625, "step": 43900 }, { "epoch": 16.917148362235068, "eval_loss": 0.4288078546524048, "eval_runtime": 13.799, "eval_samples_per_second": 1377.198, "eval_steps_per_second": 28.698, "step": 43900 }, { "epoch": 16.936416184971097, "grad_norm": 0.37341591715812683, "learning_rate": 0.0001322558766859345, "loss": 0.37107051849365236, "step": 43950 }, { "epoch": 16.936416184971097, "eval_loss": 0.42794784903526306, "eval_runtime": 12.8115, "eval_samples_per_second": 1483.357, "eval_steps_per_second": 30.91, "step": 43950 }, { "epoch": 16.95568400770713, "grad_norm": 0.38808193802833557, "learning_rate": 0.00013217880539499036, "loss": 0.3675981903076172, "step": 44000 }, { "epoch": 16.95568400770713, "eval_loss": 0.4329322278499603, "eval_runtime": 13.4415, "eval_samples_per_second": 1413.835, "eval_steps_per_second": 29.461, "step": 44000 }, { "epoch": 16.97495183044316, "grad_norm": 0.5143356919288635, "learning_rate": 0.00013210173410404624, "loss": 0.3720729827880859, "step": 44050 }, { "epoch": 16.97495183044316, "eval_loss": 0.43611136078834534, "eval_runtime": 13.6809, "eval_samples_per_second": 1389.087, "eval_steps_per_second": 28.945, "step": 44050 }, { "epoch": 16.99421965317919, "grad_norm": 0.40137407183647156, "learning_rate": 0.00013202466281310213, "loss": 0.3734406280517578, "step": 44100 }, { "epoch": 16.99421965317919, "eval_loss": 0.4358988404273987, "eval_runtime": 13.5413, "eval_samples_per_second": 1403.41, "eval_steps_per_second": 29.244, "step": 44100 }, { "epoch": 17.013487475915223, "grad_norm": 0.3703709840774536, "learning_rate": 0.000131947591522158, "loss": 0.36584407806396485, "step": 44150 }, { "epoch": 17.013487475915223, "eval_loss": 0.42761895060539246, "eval_runtime": 13.6506, "eval_samples_per_second": 1392.169, "eval_steps_per_second": 29.01, "step": 44150 }, { "epoch": 17.032755298651253, "grad_norm": 0.49579018354415894, "learning_rate": 0.0001318705202312139, "loss": 0.3708392333984375, "step": 44200 }, { "epoch": 17.032755298651253, "eval_loss": 0.43107619881629944, "eval_runtime": 13.8118, "eval_samples_per_second": 1375.926, "eval_steps_per_second": 28.671, "step": 44200 }, { "epoch": 17.052023121387283, "grad_norm": 0.383868545293808, "learning_rate": 0.00013179344894026976, "loss": 0.36859573364257814, "step": 44250 }, { "epoch": 17.052023121387283, "eval_loss": 0.43457165360450745, "eval_runtime": 14.6139, "eval_samples_per_second": 1300.409, "eval_steps_per_second": 27.098, "step": 44250 }, { "epoch": 17.071290944123316, "grad_norm": 0.38016825914382935, "learning_rate": 0.00013171637764932564, "loss": 0.3706692886352539, "step": 44300 }, { "epoch": 17.071290944123316, "eval_loss": 0.4332432448863983, "eval_runtime": 13.6961, "eval_samples_per_second": 1387.545, "eval_steps_per_second": 28.913, "step": 44300 }, { "epoch": 17.090558766859345, "grad_norm": 0.34186485409736633, "learning_rate": 0.0001316393063583815, "loss": 0.3760959243774414, "step": 44350 }, { "epoch": 17.090558766859345, "eval_loss": 0.43865352869033813, "eval_runtime": 13.9652, "eval_samples_per_second": 1360.807, "eval_steps_per_second": 28.356, "step": 44350 }, { "epoch": 17.109826589595375, "grad_norm": 0.38422954082489014, "learning_rate": 0.00013156223506743738, "loss": 0.3688961029052734, "step": 44400 }, { "epoch": 17.109826589595375, "eval_loss": 0.4344821274280548, "eval_runtime": 13.7971, "eval_samples_per_second": 1377.386, "eval_steps_per_second": 28.702, "step": 44400 }, { "epoch": 17.129094412331405, "grad_norm": 0.34002965688705444, "learning_rate": 0.00013148516377649327, "loss": 0.36843368530273435, "step": 44450 }, { "epoch": 17.129094412331405, "eval_loss": 0.42719700932502747, "eval_runtime": 13.6193, "eval_samples_per_second": 1395.372, "eval_steps_per_second": 29.076, "step": 44450 }, { "epoch": 17.148362235067438, "grad_norm": 0.361782044172287, "learning_rate": 0.00013140809248554915, "loss": 0.3677081298828125, "step": 44500 }, { "epoch": 17.148362235067438, "eval_loss": 0.43045005202293396, "eval_runtime": 13.7511, "eval_samples_per_second": 1381.996, "eval_steps_per_second": 28.798, "step": 44500 }, { "epoch": 17.167630057803468, "grad_norm": 0.3567553758621216, "learning_rate": 0.000131331021194605, "loss": 0.37139427185058593, "step": 44550 }, { "epoch": 17.167630057803468, "eval_loss": 0.4320901036262512, "eval_runtime": 13.483, "eval_samples_per_second": 1409.474, "eval_steps_per_second": 29.37, "step": 44550 }, { "epoch": 17.186897880539497, "grad_norm": 0.3501719534397125, "learning_rate": 0.0001312539499036609, "loss": 0.36335906982421873, "step": 44600 }, { "epoch": 17.186897880539497, "eval_loss": 0.42585986852645874, "eval_runtime": 13.5575, "eval_samples_per_second": 1401.731, "eval_steps_per_second": 29.209, "step": 44600 }, { "epoch": 17.20616570327553, "grad_norm": 0.3611781895160675, "learning_rate": 0.00013117687861271678, "loss": 0.37115131378173827, "step": 44650 }, { "epoch": 17.20616570327553, "eval_loss": 0.43602967262268066, "eval_runtime": 13.566, "eval_samples_per_second": 1400.857, "eval_steps_per_second": 29.191, "step": 44650 }, { "epoch": 17.22543352601156, "grad_norm": 0.4206189513206482, "learning_rate": 0.00013109980732177264, "loss": 0.37216102600097656, "step": 44700 }, { "epoch": 17.22543352601156, "eval_loss": 0.43211933970451355, "eval_runtime": 13.4921, "eval_samples_per_second": 1408.523, "eval_steps_per_second": 29.35, "step": 44700 }, { "epoch": 17.24470134874759, "grad_norm": 0.3878670632839203, "learning_rate": 0.00013102273603082852, "loss": 0.3648579788208008, "step": 44750 }, { "epoch": 17.24470134874759, "eval_loss": 0.4294303059577942, "eval_runtime": 13.2888, "eval_samples_per_second": 1430.077, "eval_steps_per_second": 29.8, "step": 44750 }, { "epoch": 17.263969171483623, "grad_norm": 0.3707346022129059, "learning_rate": 0.00013094566473988438, "loss": 0.3734614562988281, "step": 44800 }, { "epoch": 17.263969171483623, "eval_loss": 0.42817074060440063, "eval_runtime": 13.6031, "eval_samples_per_second": 1397.037, "eval_steps_per_second": 29.111, "step": 44800 }, { "epoch": 17.283236994219653, "grad_norm": 0.37440598011016846, "learning_rate": 0.00013086859344894027, "loss": 0.37040218353271487, "step": 44850 }, { "epoch": 17.283236994219653, "eval_loss": 0.423378586769104, "eval_runtime": 13.463, "eval_samples_per_second": 1411.571, "eval_steps_per_second": 29.414, "step": 44850 }, { "epoch": 17.302504816955683, "grad_norm": 0.3622802197933197, "learning_rate": 0.00013079152215799615, "loss": 0.36451484680175783, "step": 44900 }, { "epoch": 17.302504816955683, "eval_loss": 0.4297627806663513, "eval_runtime": 13.0869, "eval_samples_per_second": 1452.144, "eval_steps_per_second": 30.259, "step": 44900 }, { "epoch": 17.321772639691716, "grad_norm": 0.4774889647960663, "learning_rate": 0.00013071445086705204, "loss": 0.36693408966064456, "step": 44950 }, { "epoch": 17.321772639691716, "eval_loss": 0.426893025636673, "eval_runtime": 13.6077, "eval_samples_per_second": 1396.558, "eval_steps_per_second": 29.101, "step": 44950 }, { "epoch": 17.341040462427745, "grad_norm": 0.33426615595817566, "learning_rate": 0.00013063737957610792, "loss": 0.360343017578125, "step": 45000 }, { "epoch": 17.341040462427745, "eval_loss": 0.43193116784095764, "eval_runtime": 13.5668, "eval_samples_per_second": 1400.777, "eval_steps_per_second": 29.189, "step": 45000 }, { "epoch": 17.360308285163775, "grad_norm": 0.37792202830314636, "learning_rate": 0.00013056030828516378, "loss": 0.3688396453857422, "step": 45050 }, { "epoch": 17.360308285163775, "eval_loss": 0.4293178617954254, "eval_runtime": 13.2388, "eval_samples_per_second": 1435.483, "eval_steps_per_second": 29.912, "step": 45050 }, { "epoch": 17.37957610789981, "grad_norm": 0.33859407901763916, "learning_rate": 0.00013048323699421964, "loss": 0.36768997192382813, "step": 45100 }, { "epoch": 17.37957610789981, "eval_loss": 0.4376179873943329, "eval_runtime": 12.9272, "eval_samples_per_second": 1470.075, "eval_steps_per_second": 30.633, "step": 45100 }, { "epoch": 17.398843930635838, "grad_norm": 0.38350751996040344, "learning_rate": 0.00013040616570327552, "loss": 0.3670475006103516, "step": 45150 }, { "epoch": 17.398843930635838, "eval_loss": 0.4281247556209564, "eval_runtime": 13.3868, "eval_samples_per_second": 1419.61, "eval_steps_per_second": 29.581, "step": 45150 }, { "epoch": 17.418111753371868, "grad_norm": 0.3675788938999176, "learning_rate": 0.0001303290944123314, "loss": 0.3665888595581055, "step": 45200 }, { "epoch": 17.418111753371868, "eval_loss": 0.42549774050712585, "eval_runtime": 13.545, "eval_samples_per_second": 1403.032, "eval_steps_per_second": 29.236, "step": 45200 }, { "epoch": 17.4373795761079, "grad_norm": 0.3872062563896179, "learning_rate": 0.0001302520231213873, "loss": 0.35975521087646484, "step": 45250 }, { "epoch": 17.4373795761079, "eval_loss": 0.42082735896110535, "eval_runtime": 13.5208, "eval_samples_per_second": 1405.543, "eval_steps_per_second": 29.288, "step": 45250 }, { "epoch": 17.45664739884393, "grad_norm": 0.3560031056404114, "learning_rate": 0.00013017495183044318, "loss": 0.37006210327148437, "step": 45300 }, { "epoch": 17.45664739884393, "eval_loss": 0.41533511877059937, "eval_runtime": 13.4273, "eval_samples_per_second": 1415.325, "eval_steps_per_second": 29.492, "step": 45300 }, { "epoch": 17.47591522157996, "grad_norm": 0.4397311508655548, "learning_rate": 0.00013009788053949906, "loss": 0.3701839828491211, "step": 45350 }, { "epoch": 17.47591522157996, "eval_loss": 0.42810550332069397, "eval_runtime": 13.461, "eval_samples_per_second": 1411.781, "eval_steps_per_second": 29.418, "step": 45350 }, { "epoch": 17.495183044315993, "grad_norm": 0.42181652784347534, "learning_rate": 0.00013002080924855492, "loss": 0.37228874206542967, "step": 45400 }, { "epoch": 17.495183044315993, "eval_loss": 0.42436346411705017, "eval_runtime": 13.2663, "eval_samples_per_second": 1432.507, "eval_steps_per_second": 29.85, "step": 45400 }, { "epoch": 17.514450867052023, "grad_norm": 0.3349694013595581, "learning_rate": 0.00012994373795761078, "loss": 0.3585169219970703, "step": 45450 }, { "epoch": 17.514450867052023, "eval_loss": 0.4305500388145447, "eval_runtime": 13.4353, "eval_samples_per_second": 1414.48, "eval_steps_per_second": 29.475, "step": 45450 }, { "epoch": 17.533718689788053, "grad_norm": 0.52252197265625, "learning_rate": 0.00012986666666666666, "loss": 0.3727501678466797, "step": 45500 }, { "epoch": 17.533718689788053, "eval_loss": 0.4225037693977356, "eval_runtime": 13.6257, "eval_samples_per_second": 1394.721, "eval_steps_per_second": 29.063, "step": 45500 }, { "epoch": 17.552986512524086, "grad_norm": 0.41588500142097473, "learning_rate": 0.00012978959537572255, "loss": 0.360313720703125, "step": 45550 }, { "epoch": 17.552986512524086, "eval_loss": 0.4286901652812958, "eval_runtime": 13.9537, "eval_samples_per_second": 1361.934, "eval_steps_per_second": 28.38, "step": 45550 }, { "epoch": 17.572254335260116, "grad_norm": 0.3576446771621704, "learning_rate": 0.00012971252408477843, "loss": 0.3643194580078125, "step": 45600 }, { "epoch": 17.572254335260116, "eval_loss": 0.43395912647247314, "eval_runtime": 12.8367, "eval_samples_per_second": 1480.439, "eval_steps_per_second": 30.849, "step": 45600 }, { "epoch": 17.591522157996145, "grad_norm": 0.33994391560554504, "learning_rate": 0.00012963545279383432, "loss": 0.3635233306884766, "step": 45650 }, { "epoch": 17.591522157996145, "eval_loss": 0.43229004740715027, "eval_runtime": 12.1705, "eval_samples_per_second": 1561.48, "eval_steps_per_second": 32.538, "step": 45650 }, { "epoch": 17.61078998073218, "grad_norm": 0.34782856702804565, "learning_rate": 0.00012955838150289017, "loss": 0.36920364379882814, "step": 45700 }, { "epoch": 17.61078998073218, "eval_loss": 0.43526697158813477, "eval_runtime": 12.1326, "eval_samples_per_second": 1566.356, "eval_steps_per_second": 32.639, "step": 45700 }, { "epoch": 17.63005780346821, "grad_norm": 0.3460638225078583, "learning_rate": 0.00012948131021194606, "loss": 0.36825904846191404, "step": 45750 }, { "epoch": 17.63005780346821, "eval_loss": 0.4201161861419678, "eval_runtime": 12.3252, "eval_samples_per_second": 1541.881, "eval_steps_per_second": 32.129, "step": 45750 }, { "epoch": 17.649325626204238, "grad_norm": 0.3529464602470398, "learning_rate": 0.00012940423892100194, "loss": 0.3697012329101563, "step": 45800 }, { "epoch": 17.649325626204238, "eval_loss": 0.42190828919410706, "eval_runtime": 12.023, "eval_samples_per_second": 1580.643, "eval_steps_per_second": 32.937, "step": 45800 }, { "epoch": 17.66859344894027, "grad_norm": 0.5239589810371399, "learning_rate": 0.0001293271676300578, "loss": 0.36318321228027345, "step": 45850 }, { "epoch": 17.66859344894027, "eval_loss": 0.41692692041397095, "eval_runtime": 12.1104, "eval_samples_per_second": 1569.223, "eval_steps_per_second": 32.699, "step": 45850 }, { "epoch": 17.6878612716763, "grad_norm": 0.38257190585136414, "learning_rate": 0.00012925009633911369, "loss": 0.3661704254150391, "step": 45900 }, { "epoch": 17.6878612716763, "eval_loss": 0.4283389151096344, "eval_runtime": 12.352, "eval_samples_per_second": 1538.539, "eval_steps_per_second": 32.06, "step": 45900 }, { "epoch": 17.70712909441233, "grad_norm": 0.3859211504459381, "learning_rate": 0.00012917302504816957, "loss": 0.37084178924560546, "step": 45950 }, { "epoch": 17.70712909441233, "eval_loss": 0.41809800267219543, "eval_runtime": 12.1716, "eval_samples_per_second": 1561.341, "eval_steps_per_second": 32.535, "step": 45950 }, { "epoch": 17.726396917148364, "grad_norm": 0.3455635607242584, "learning_rate": 0.00012909595375722543, "loss": 0.36633934020996095, "step": 46000 }, { "epoch": 17.726396917148364, "eval_loss": 0.42823803424835205, "eval_runtime": 12.2453, "eval_samples_per_second": 1551.943, "eval_steps_per_second": 32.339, "step": 46000 }, { "epoch": 17.745664739884393, "grad_norm": 0.3533863127231598, "learning_rate": 0.00012901888246628131, "loss": 0.36382144927978516, "step": 46050 }, { "epoch": 17.745664739884393, "eval_loss": 0.42004576325416565, "eval_runtime": 12.0841, "eval_samples_per_second": 1572.645, "eval_steps_per_second": 32.77, "step": 46050 }, { "epoch": 17.764932562620423, "grad_norm": 0.3684418797492981, "learning_rate": 0.0001289418111753372, "loss": 0.3674930191040039, "step": 46100 }, { "epoch": 17.764932562620423, "eval_loss": 0.4288540482521057, "eval_runtime": 12.0589, "eval_samples_per_second": 1575.932, "eval_steps_per_second": 32.839, "step": 46100 }, { "epoch": 17.784200385356456, "grad_norm": 0.4553726315498352, "learning_rate": 0.00012886473988439308, "loss": 0.3695876312255859, "step": 46150 }, { "epoch": 17.784200385356456, "eval_loss": 0.42821556329727173, "eval_runtime": 12.1924, "eval_samples_per_second": 1558.68, "eval_steps_per_second": 32.479, "step": 46150 }, { "epoch": 17.803468208092486, "grad_norm": 0.38256749510765076, "learning_rate": 0.00012878766859344894, "loss": 0.36089866638183593, "step": 46200 }, { "epoch": 17.803468208092486, "eval_loss": 0.43202799558639526, "eval_runtime": 12.0929, "eval_samples_per_second": 1571.499, "eval_steps_per_second": 32.746, "step": 46200 }, { "epoch": 17.822736030828516, "grad_norm": 0.41206926107406616, "learning_rate": 0.00012871059730250483, "loss": 0.36745758056640626, "step": 46250 }, { "epoch": 17.822736030828516, "eval_loss": 0.42816153168678284, "eval_runtime": 12.0847, "eval_samples_per_second": 1572.567, "eval_steps_per_second": 32.769, "step": 46250 }, { "epoch": 17.84200385356455, "grad_norm": 0.34915268421173096, "learning_rate": 0.00012863352601156068, "loss": 0.3637145233154297, "step": 46300 }, { "epoch": 17.84200385356455, "eval_loss": 0.42589157819747925, "eval_runtime": 12.0299, "eval_samples_per_second": 1579.736, "eval_steps_per_second": 32.918, "step": 46300 }, { "epoch": 17.86127167630058, "grad_norm": 0.43560901284217834, "learning_rate": 0.00012855645472061657, "loss": 0.3640867233276367, "step": 46350 }, { "epoch": 17.86127167630058, "eval_loss": 0.4223552942276001, "eval_runtime": 12.0379, "eval_samples_per_second": 1578.687, "eval_steps_per_second": 32.896, "step": 46350 }, { "epoch": 17.880539499036608, "grad_norm": 0.3560962677001953, "learning_rate": 0.00012847938342967245, "loss": 0.3646985626220703, "step": 46400 }, { "epoch": 17.880539499036608, "eval_loss": 0.4282087981700897, "eval_runtime": 12.1206, "eval_samples_per_second": 1567.905, "eval_steps_per_second": 32.672, "step": 46400 }, { "epoch": 17.89980732177264, "grad_norm": 0.3754415214061737, "learning_rate": 0.00012840231213872834, "loss": 0.37040702819824217, "step": 46450 }, { "epoch": 17.89980732177264, "eval_loss": 0.42970162630081177, "eval_runtime": 12.1826, "eval_samples_per_second": 1559.927, "eval_steps_per_second": 32.505, "step": 46450 }, { "epoch": 17.91907514450867, "grad_norm": 0.37331798672676086, "learning_rate": 0.00012832524084778422, "loss": 0.36863452911376954, "step": 46500 }, { "epoch": 17.91907514450867, "eval_loss": 0.4258643388748169, "eval_runtime": 12.1637, "eval_samples_per_second": 1562.352, "eval_steps_per_second": 32.556, "step": 46500 }, { "epoch": 17.9383429672447, "grad_norm": 0.3738945424556732, "learning_rate": 0.00012824816955684008, "loss": 0.3678428649902344, "step": 46550 }, { "epoch": 17.9383429672447, "eval_loss": 0.423272043466568, "eval_runtime": 12.0522, "eval_samples_per_second": 1576.814, "eval_steps_per_second": 32.857, "step": 46550 }, { "epoch": 17.95761078998073, "grad_norm": 0.3526585102081299, "learning_rate": 0.00012817109826589594, "loss": 0.37026630401611327, "step": 46600 }, { "epoch": 17.95761078998073, "eval_loss": 0.43189677596092224, "eval_runtime": 12.0147, "eval_samples_per_second": 1581.729, "eval_steps_per_second": 32.96, "step": 46600 }, { "epoch": 17.976878612716764, "grad_norm": 0.37575283646583557, "learning_rate": 0.00012809402697495182, "loss": 0.36950260162353515, "step": 46650 }, { "epoch": 17.976878612716764, "eval_loss": 0.4225762188434601, "eval_runtime": 12.0138, "eval_samples_per_second": 1581.845, "eval_steps_per_second": 32.962, "step": 46650 }, { "epoch": 17.996146435452793, "grad_norm": 0.3989676833152771, "learning_rate": 0.0001280169556840077, "loss": 0.3680271530151367, "step": 46700 }, { "epoch": 17.996146435452793, "eval_loss": 0.4222710430622101, "eval_runtime": 12.1972, "eval_samples_per_second": 1558.067, "eval_steps_per_second": 32.467, "step": 46700 }, { "epoch": 18.015414258188823, "grad_norm": 0.3823298513889313, "learning_rate": 0.0001279398843930636, "loss": 0.37191024780273435, "step": 46750 }, { "epoch": 18.015414258188823, "eval_loss": 0.42618614435195923, "eval_runtime": 12.1412, "eval_samples_per_second": 1565.25, "eval_steps_per_second": 32.616, "step": 46750 }, { "epoch": 18.034682080924856, "grad_norm": 0.3642926514148712, "learning_rate": 0.00012786281310211948, "loss": 0.37019695281982423, "step": 46800 }, { "epoch": 18.034682080924856, "eval_loss": 0.4217292368412018, "eval_runtime": 12.2093, "eval_samples_per_second": 1556.52, "eval_steps_per_second": 32.434, "step": 46800 }, { "epoch": 18.053949903660886, "grad_norm": 0.389198899269104, "learning_rate": 0.00012778574181117536, "loss": 0.36612682342529296, "step": 46850 }, { "epoch": 18.053949903660886, "eval_loss": 0.43379056453704834, "eval_runtime": 12.2961, "eval_samples_per_second": 1545.529, "eval_steps_per_second": 32.205, "step": 46850 }, { "epoch": 18.073217726396916, "grad_norm": 0.4314260482788086, "learning_rate": 0.00012770867052023122, "loss": 0.3591333770751953, "step": 46900 }, { "epoch": 18.073217726396916, "eval_loss": 0.42030394077301025, "eval_runtime": 12.1893, "eval_samples_per_second": 1559.068, "eval_steps_per_second": 32.487, "step": 46900 }, { "epoch": 18.09248554913295, "grad_norm": 0.4394644796848297, "learning_rate": 0.00012763159922928708, "loss": 0.3643639373779297, "step": 46950 }, { "epoch": 18.09248554913295, "eval_loss": 0.4249099791049957, "eval_runtime": 12.0713, "eval_samples_per_second": 1574.313, "eval_steps_per_second": 32.805, "step": 46950 }, { "epoch": 18.11175337186898, "grad_norm": 0.3679722547531128, "learning_rate": 0.00012755452793834296, "loss": 0.3668488311767578, "step": 47000 }, { "epoch": 18.11175337186898, "eval_loss": 0.4316596984863281, "eval_runtime": 12.1546, "eval_samples_per_second": 1563.518, "eval_steps_per_second": 32.58, "step": 47000 }, { "epoch": 18.131021194605008, "grad_norm": 0.3955745995044708, "learning_rate": 0.00012747745664739885, "loss": 0.3646841812133789, "step": 47050 }, { "epoch": 18.131021194605008, "eval_loss": 0.4274338185787201, "eval_runtime": 12.1241, "eval_samples_per_second": 1567.455, "eval_steps_per_second": 32.662, "step": 47050 }, { "epoch": 18.15028901734104, "grad_norm": 0.3466762602329254, "learning_rate": 0.00012740038535645473, "loss": 0.3687885665893555, "step": 47100 }, { "epoch": 18.15028901734104, "eval_loss": 0.429480642080307, "eval_runtime": 12.0668, "eval_samples_per_second": 1574.899, "eval_steps_per_second": 32.817, "step": 47100 }, { "epoch": 18.16955684007707, "grad_norm": 0.3598794639110565, "learning_rate": 0.00012732331406551062, "loss": 0.3669497299194336, "step": 47150 }, { "epoch": 18.16955684007707, "eval_loss": 0.42260411381721497, "eval_runtime": 12.0645, "eval_samples_per_second": 1575.197, "eval_steps_per_second": 32.824, "step": 47150 }, { "epoch": 18.1888246628131, "grad_norm": 0.39547863602638245, "learning_rate": 0.00012724624277456648, "loss": 0.3647163391113281, "step": 47200 }, { "epoch": 18.1888246628131, "eval_loss": 0.42760169506073, "eval_runtime": 12.0642, "eval_samples_per_second": 1575.24, "eval_steps_per_second": 32.824, "step": 47200 }, { "epoch": 18.208092485549134, "grad_norm": 0.3201744854450226, "learning_rate": 0.00012716917148362236, "loss": 0.365451545715332, "step": 47250 }, { "epoch": 18.208092485549134, "eval_loss": 0.43334344029426575, "eval_runtime": 12.0729, "eval_samples_per_second": 1574.101, "eval_steps_per_second": 32.801, "step": 47250 }, { "epoch": 18.227360308285164, "grad_norm": 0.4164421260356903, "learning_rate": 0.00012709210019267822, "loss": 0.3654734039306641, "step": 47300 }, { "epoch": 18.227360308285164, "eval_loss": 0.4199920892715454, "eval_runtime": 12.0832, "eval_samples_per_second": 1572.768, "eval_steps_per_second": 32.773, "step": 47300 }, { "epoch": 18.246628131021193, "grad_norm": 0.3777416944503784, "learning_rate": 0.0001270150289017341, "loss": 0.3654653167724609, "step": 47350 }, { "epoch": 18.246628131021193, "eval_loss": 0.42851874232292175, "eval_runtime": 12.0719, "eval_samples_per_second": 1574.232, "eval_steps_per_second": 32.803, "step": 47350 }, { "epoch": 18.265895953757227, "grad_norm": 0.36835476756095886, "learning_rate": 0.00012693795761079, "loss": 0.36379295349121094, "step": 47400 }, { "epoch": 18.265895953757227, "eval_loss": 0.4297223389148712, "eval_runtime": 12.1549, "eval_samples_per_second": 1563.489, "eval_steps_per_second": 32.58, "step": 47400 }, { "epoch": 18.285163776493256, "grad_norm": 0.3683689534664154, "learning_rate": 0.00012686088631984587, "loss": 0.36588115692138673, "step": 47450 }, { "epoch": 18.285163776493256, "eval_loss": 0.4336344003677368, "eval_runtime": 12.0835, "eval_samples_per_second": 1572.719, "eval_steps_per_second": 32.772, "step": 47450 }, { "epoch": 18.304431599229286, "grad_norm": 0.3920043706893921, "learning_rate": 0.00012678381502890173, "loss": 0.3653715515136719, "step": 47500 }, { "epoch": 18.304431599229286, "eval_loss": 0.4182981550693512, "eval_runtime": 12.1347, "eval_samples_per_second": 1566.093, "eval_steps_per_second": 32.634, "step": 47500 }, { "epoch": 18.32369942196532, "grad_norm": 0.3784172534942627, "learning_rate": 0.00012670674373795762, "loss": 0.36684211730957034, "step": 47550 }, { "epoch": 18.32369942196532, "eval_loss": 0.43084821105003357, "eval_runtime": 12.0785, "eval_samples_per_second": 1573.375, "eval_steps_per_second": 32.786, "step": 47550 }, { "epoch": 18.34296724470135, "grad_norm": 0.3103810250759125, "learning_rate": 0.0001266296724470135, "loss": 0.3618255615234375, "step": 47600 }, { "epoch": 18.34296724470135, "eval_loss": 0.43056800961494446, "eval_runtime": 12.0775, "eval_samples_per_second": 1573.507, "eval_steps_per_second": 32.788, "step": 47600 }, { "epoch": 18.36223506743738, "grad_norm": 0.37391647696495056, "learning_rate": 0.00012655260115606936, "loss": 0.36443649291992186, "step": 47650 }, { "epoch": 18.36223506743738, "eval_loss": 0.42415741086006165, "eval_runtime": 12.1361, "eval_samples_per_second": 1565.91, "eval_steps_per_second": 32.63, "step": 47650 }, { "epoch": 18.38150289017341, "grad_norm": 0.32233235239982605, "learning_rate": 0.00012647552986512524, "loss": 0.36931964874267575, "step": 47700 }, { "epoch": 18.38150289017341, "eval_loss": 0.4240269362926483, "eval_runtime": 12.0864, "eval_samples_per_second": 1572.351, "eval_steps_per_second": 32.764, "step": 47700 }, { "epoch": 18.40077071290944, "grad_norm": 0.36460956931114197, "learning_rate": 0.00012639845857418113, "loss": 0.36482097625732424, "step": 47750 }, { "epoch": 18.40077071290944, "eval_loss": 0.42896297574043274, "eval_runtime": 12.0874, "eval_samples_per_second": 1572.212, "eval_steps_per_second": 32.761, "step": 47750 }, { "epoch": 18.42003853564547, "grad_norm": 0.3561745584011078, "learning_rate": 0.000126321387283237, "loss": 0.3599088668823242, "step": 47800 }, { "epoch": 18.42003853564547, "eval_loss": 0.42100024223327637, "eval_runtime": 12.0822, "eval_samples_per_second": 1572.891, "eval_steps_per_second": 32.775, "step": 47800 }, { "epoch": 18.439306358381504, "grad_norm": 0.4389243721961975, "learning_rate": 0.00012624431599229287, "loss": 0.3711346435546875, "step": 47850 }, { "epoch": 18.439306358381504, "eval_loss": 0.42498719692230225, "eval_runtime": 12.075, "eval_samples_per_second": 1573.826, "eval_steps_per_second": 32.795, "step": 47850 }, { "epoch": 18.458574181117534, "grad_norm": 0.3740912973880768, "learning_rate": 0.00012616724470134876, "loss": 0.359480094909668, "step": 47900 }, { "epoch": 18.458574181117534, "eval_loss": 0.4188101291656494, "eval_runtime": 12.1209, "eval_samples_per_second": 1567.868, "eval_steps_per_second": 32.671, "step": 47900 }, { "epoch": 18.477842003853564, "grad_norm": 0.32928267121315, "learning_rate": 0.00012609017341040464, "loss": 0.3637599563598633, "step": 47950 }, { "epoch": 18.477842003853564, "eval_loss": 0.4207070767879486, "eval_runtime": 12.0982, "eval_samples_per_second": 1570.808, "eval_steps_per_second": 32.732, "step": 47950 }, { "epoch": 18.497109826589597, "grad_norm": 0.35234931111335754, "learning_rate": 0.00012601310211946053, "loss": 0.36348739624023435, "step": 48000 }, { "epoch": 18.497109826589597, "eval_loss": 0.4078846573829651, "eval_runtime": 12.1071, "eval_samples_per_second": 1569.652, "eval_steps_per_second": 32.708, "step": 48000 }, { "epoch": 18.516377649325626, "grad_norm": 0.36954763531684875, "learning_rate": 0.00012593603082851638, "loss": 0.36401866912841796, "step": 48050 }, { "epoch": 18.516377649325626, "eval_loss": 0.4232370853424072, "eval_runtime": 12.0174, "eval_samples_per_second": 1581.369, "eval_steps_per_second": 32.952, "step": 48050 }, { "epoch": 18.535645472061656, "grad_norm": 0.36345550417900085, "learning_rate": 0.00012585895953757224, "loss": 0.3645285034179688, "step": 48100 }, { "epoch": 18.535645472061656, "eval_loss": 0.42691993713378906, "eval_runtime": 12.0465, "eval_samples_per_second": 1577.549, "eval_steps_per_second": 32.873, "step": 48100 }, { "epoch": 18.55491329479769, "grad_norm": 0.4006262421607971, "learning_rate": 0.00012578188824662813, "loss": 0.3608406066894531, "step": 48150 }, { "epoch": 18.55491329479769, "eval_loss": 0.42338675260543823, "eval_runtime": 12.022, "eval_samples_per_second": 1580.762, "eval_steps_per_second": 32.939, "step": 48150 }, { "epoch": 18.57418111753372, "grad_norm": 0.39063921570777893, "learning_rate": 0.000125704816955684, "loss": 0.3693534851074219, "step": 48200 }, { "epoch": 18.57418111753372, "eval_loss": 0.41848573088645935, "eval_runtime": 12.0919, "eval_samples_per_second": 1571.629, "eval_steps_per_second": 32.749, "step": 48200 }, { "epoch": 18.59344894026975, "grad_norm": 0.3227307200431824, "learning_rate": 0.0001256277456647399, "loss": 0.3591547393798828, "step": 48250 }, { "epoch": 18.59344894026975, "eval_loss": 0.42389559745788574, "eval_runtime": 12.0163, "eval_samples_per_second": 1581.515, "eval_steps_per_second": 32.955, "step": 48250 }, { "epoch": 18.612716763005782, "grad_norm": 0.41088858246803284, "learning_rate": 0.00012555067437379578, "loss": 0.3635719299316406, "step": 48300 }, { "epoch": 18.612716763005782, "eval_loss": 0.426102876663208, "eval_runtime": 12.1048, "eval_samples_per_second": 1569.95, "eval_steps_per_second": 32.714, "step": 48300 }, { "epoch": 18.63198458574181, "grad_norm": 0.3715960681438446, "learning_rate": 0.00012547360308285167, "loss": 0.36206062316894533, "step": 48350 }, { "epoch": 18.63198458574181, "eval_loss": 0.431070476770401, "eval_runtime": 12.2721, "eval_samples_per_second": 1548.556, "eval_steps_per_second": 32.268, "step": 48350 }, { "epoch": 18.65125240847784, "grad_norm": 0.3734889328479767, "learning_rate": 0.00012539653179190752, "loss": 0.3691383361816406, "step": 48400 }, { "epoch": 18.65125240847784, "eval_loss": 0.4222260117530823, "eval_runtime": 12.0853, "eval_samples_per_second": 1572.484, "eval_steps_per_second": 32.767, "step": 48400 }, { "epoch": 18.670520231213874, "grad_norm": 0.3833114206790924, "learning_rate": 0.00012531946050096338, "loss": 0.36653182983398436, "step": 48450 }, { "epoch": 18.670520231213874, "eval_loss": 0.4197717607021332, "eval_runtime": 12.0719, "eval_samples_per_second": 1574.235, "eval_steps_per_second": 32.803, "step": 48450 }, { "epoch": 18.689788053949904, "grad_norm": 0.34855183959007263, "learning_rate": 0.00012524238921001927, "loss": 0.358399658203125, "step": 48500 }, { "epoch": 18.689788053949904, "eval_loss": 0.43210676312446594, "eval_runtime": 12.0677, "eval_samples_per_second": 1574.781, "eval_steps_per_second": 32.815, "step": 48500 }, { "epoch": 18.709055876685934, "grad_norm": 0.3430051803588867, "learning_rate": 0.00012516531791907515, "loss": 0.3618324661254883, "step": 48550 }, { "epoch": 18.709055876685934, "eval_loss": 0.4190230071544647, "eval_runtime": 12.112, "eval_samples_per_second": 1569.028, "eval_steps_per_second": 32.695, "step": 48550 }, { "epoch": 18.728323699421964, "grad_norm": 0.3718356788158417, "learning_rate": 0.00012508824662813104, "loss": 0.363167724609375, "step": 48600 }, { "epoch": 18.728323699421964, "eval_loss": 0.4228369891643524, "eval_runtime": 12.1178, "eval_samples_per_second": 1568.269, "eval_steps_per_second": 32.679, "step": 48600 }, { "epoch": 18.747591522157997, "grad_norm": 0.3649858236312866, "learning_rate": 0.00012501117533718692, "loss": 0.36519851684570315, "step": 48650 }, { "epoch": 18.747591522157997, "eval_loss": 0.42005136609077454, "eval_runtime": 12.0768, "eval_samples_per_second": 1573.6, "eval_steps_per_second": 32.79, "step": 48650 }, { "epoch": 18.766859344894026, "grad_norm": 0.341705858707428, "learning_rate": 0.00012493410404624278, "loss": 0.3641293334960938, "step": 48700 }, { "epoch": 18.766859344894026, "eval_loss": 0.42272719740867615, "eval_runtime": 12.0093, "eval_samples_per_second": 1582.441, "eval_steps_per_second": 32.974, "step": 48700 }, { "epoch": 18.786127167630056, "grad_norm": 0.40158915519714355, "learning_rate": 0.00012485703275529866, "loss": 0.36475181579589844, "step": 48750 }, { "epoch": 18.786127167630056, "eval_loss": 0.4243825376033783, "eval_runtime": 12.0472, "eval_samples_per_second": 1577.461, "eval_steps_per_second": 32.871, "step": 48750 }, { "epoch": 18.80539499036609, "grad_norm": 0.41746851801872253, "learning_rate": 0.00012477996146435452, "loss": 0.35660495758056643, "step": 48800 }, { "epoch": 18.80539499036609, "eval_loss": 0.4344119429588318, "eval_runtime": 12.0213, "eval_samples_per_second": 1580.867, "eval_steps_per_second": 32.942, "step": 48800 }, { "epoch": 18.82466281310212, "grad_norm": 0.3560159206390381, "learning_rate": 0.0001247028901734104, "loss": 0.35801063537597655, "step": 48850 }, { "epoch": 18.82466281310212, "eval_loss": 0.42284905910491943, "eval_runtime": 12.0188, "eval_samples_per_second": 1581.186, "eval_steps_per_second": 32.948, "step": 48850 }, { "epoch": 18.84393063583815, "grad_norm": 0.3729179799556732, "learning_rate": 0.0001246258188824663, "loss": 0.3652605438232422, "step": 48900 }, { "epoch": 18.84393063583815, "eval_loss": 0.4178811013698578, "eval_runtime": 12.0031, "eval_samples_per_second": 1583.261, "eval_steps_per_second": 32.992, "step": 48900 }, { "epoch": 18.863198458574182, "grad_norm": 0.4274574816226959, "learning_rate": 0.00012454874759152218, "loss": 0.36356178283691404, "step": 48950 }, { "epoch": 18.863198458574182, "eval_loss": 0.420963853597641, "eval_runtime": 12.0868, "eval_samples_per_second": 1572.293, "eval_steps_per_second": 32.763, "step": 48950 }, { "epoch": 18.88246628131021, "grad_norm": 0.3944108784198761, "learning_rate": 0.00012447167630057803, "loss": 0.3626186752319336, "step": 49000 }, { "epoch": 18.88246628131021, "eval_loss": 0.42054593563079834, "eval_runtime": 12.0245, "eval_samples_per_second": 1580.442, "eval_steps_per_second": 32.933, "step": 49000 }, { "epoch": 18.90173410404624, "grad_norm": 0.31847572326660156, "learning_rate": 0.00012439460500963392, "loss": 0.35749794006347657, "step": 49050 }, { "epoch": 18.90173410404624, "eval_loss": 0.4252181351184845, "eval_runtime": 12.0876, "eval_samples_per_second": 1572.188, "eval_steps_per_second": 32.761, "step": 49050 }, { "epoch": 18.921001926782274, "grad_norm": 0.3433185815811157, "learning_rate": 0.0001243175337186898, "loss": 0.36524330139160155, "step": 49100 }, { "epoch": 18.921001926782274, "eval_loss": 0.41775238513946533, "eval_runtime": 12.0779, "eval_samples_per_second": 1573.455, "eval_steps_per_second": 32.787, "step": 49100 }, { "epoch": 18.940269749518304, "grad_norm": 0.3972156047821045, "learning_rate": 0.00012424046242774566, "loss": 0.3593850326538086, "step": 49150 }, { "epoch": 18.940269749518304, "eval_loss": 0.4195556640625, "eval_runtime": 12.2436, "eval_samples_per_second": 1552.16, "eval_steps_per_second": 32.343, "step": 49150 }, { "epoch": 18.959537572254334, "grad_norm": 0.38025814294815063, "learning_rate": 0.00012416339113680155, "loss": 0.3641579818725586, "step": 49200 }, { "epoch": 18.959537572254334, "eval_loss": 0.42620575428009033, "eval_runtime": 12.0804, "eval_samples_per_second": 1573.121, "eval_steps_per_second": 32.78, "step": 49200 }, { "epoch": 18.978805394990367, "grad_norm": 0.3225485682487488, "learning_rate": 0.00012408631984585743, "loss": 0.3620465850830078, "step": 49250 }, { "epoch": 18.978805394990367, "eval_loss": 0.41535431146621704, "eval_runtime": 12.0476, "eval_samples_per_second": 1577.415, "eval_steps_per_second": 32.87, "step": 49250 }, { "epoch": 18.998073217726397, "grad_norm": 0.3681804835796356, "learning_rate": 0.0001240092485549133, "loss": 0.36272052764892576, "step": 49300 }, { "epoch": 18.998073217726397, "eval_loss": 0.42640623450279236, "eval_runtime": 12.0159, "eval_samples_per_second": 1581.574, "eval_steps_per_second": 32.956, "step": 49300 }, { "epoch": 19.017341040462426, "grad_norm": 0.33260998129844666, "learning_rate": 0.00012393217726396917, "loss": 0.36292152404785155, "step": 49350 }, { "epoch": 19.017341040462426, "eval_loss": 0.43364062905311584, "eval_runtime": 12.0261, "eval_samples_per_second": 1580.233, "eval_steps_per_second": 32.928, "step": 49350 }, { "epoch": 19.03660886319846, "grad_norm": 0.3346443176269531, "learning_rate": 0.00012385510597302506, "loss": 0.3579365539550781, "step": 49400 }, { "epoch": 19.03660886319846, "eval_loss": 0.43031108379364014, "eval_runtime": 12.0221, "eval_samples_per_second": 1580.752, "eval_steps_per_second": 32.939, "step": 49400 }, { "epoch": 19.05587668593449, "grad_norm": 0.3682253360748291, "learning_rate": 0.00012377803468208094, "loss": 0.3607744598388672, "step": 49450 }, { "epoch": 19.05587668593449, "eval_loss": 0.4342081844806671, "eval_runtime": 12.0693, "eval_samples_per_second": 1574.579, "eval_steps_per_second": 32.811, "step": 49450 }, { "epoch": 19.07514450867052, "grad_norm": 0.30341458320617676, "learning_rate": 0.0001237009633911368, "loss": 0.3623856735229492, "step": 49500 }, { "epoch": 19.07514450867052, "eval_loss": 0.4235397279262543, "eval_runtime": 12.109, "eval_samples_per_second": 1569.413, "eval_steps_per_second": 32.703, "step": 49500 }, { "epoch": 19.094412331406552, "grad_norm": 0.312283456325531, "learning_rate": 0.00012362389210019266, "loss": 0.3634869384765625, "step": 49550 }, { "epoch": 19.094412331406552, "eval_loss": 0.42309293150901794, "eval_runtime": 12.051, "eval_samples_per_second": 1576.962, "eval_steps_per_second": 32.86, "step": 49550 }, { "epoch": 19.113680154142582, "grad_norm": 0.37398943305015564, "learning_rate": 0.00012354682080924854, "loss": 0.36343677520751955, "step": 49600 }, { "epoch": 19.113680154142582, "eval_loss": 0.42625221610069275, "eval_runtime": 12.0864, "eval_samples_per_second": 1572.342, "eval_steps_per_second": 32.764, "step": 49600 }, { "epoch": 19.13294797687861, "grad_norm": 0.39395788311958313, "learning_rate": 0.00012346974951830443, "loss": 0.3574723815917969, "step": 49650 }, { "epoch": 19.13294797687861, "eval_loss": 0.4246785342693329, "eval_runtime": 12.0534, "eval_samples_per_second": 1576.648, "eval_steps_per_second": 32.854, "step": 49650 }, { "epoch": 19.152215799614645, "grad_norm": 0.3660305440425873, "learning_rate": 0.00012339267822736031, "loss": 0.3633097457885742, "step": 49700 }, { "epoch": 19.152215799614645, "eval_loss": 0.42504313588142395, "eval_runtime": 12.1342, "eval_samples_per_second": 1566.149, "eval_steps_per_second": 32.635, "step": 49700 }, { "epoch": 19.171483622350674, "grad_norm": 0.3595553934574127, "learning_rate": 0.0001233156069364162, "loss": 0.3611714172363281, "step": 49750 }, { "epoch": 19.171483622350674, "eval_loss": 0.4178341031074524, "eval_runtime": 12.0986, "eval_samples_per_second": 1570.761, "eval_steps_per_second": 32.731, "step": 49750 }, { "epoch": 19.190751445086704, "grad_norm": 0.4375123381614685, "learning_rate": 0.00012323853564547208, "loss": 0.35857017517089845, "step": 49800 }, { "epoch": 19.190751445086704, "eval_loss": 0.4235744774341583, "eval_runtime": 12.0712, "eval_samples_per_second": 1574.33, "eval_steps_per_second": 32.805, "step": 49800 }, { "epoch": 19.210019267822737, "grad_norm": 0.3434578776359558, "learning_rate": 0.00012316146435452794, "loss": 0.3598847961425781, "step": 49850 }, { "epoch": 19.210019267822737, "eval_loss": 0.4280802011489868, "eval_runtime": 12.1341, "eval_samples_per_second": 1566.163, "eval_steps_per_second": 32.635, "step": 49850 }, { "epoch": 19.229287090558767, "grad_norm": 0.3477235436439514, "learning_rate": 0.00012308439306358383, "loss": 0.35865833282470705, "step": 49900 }, { "epoch": 19.229287090558767, "eval_loss": 0.43154215812683105, "eval_runtime": 12.0452, "eval_samples_per_second": 1577.728, "eval_steps_per_second": 32.876, "step": 49900 }, { "epoch": 19.248554913294797, "grad_norm": 0.3456646203994751, "learning_rate": 0.00012300732177263968, "loss": 0.35913307189941407, "step": 49950 }, { "epoch": 19.248554913294797, "eval_loss": 0.4257902204990387, "eval_runtime": 12.1507, "eval_samples_per_second": 1564.019, "eval_steps_per_second": 32.591, "step": 49950 }, { "epoch": 19.26782273603083, "grad_norm": 0.4136947691440582, "learning_rate": 0.00012293025048169557, "loss": 0.3623935317993164, "step": 50000 }, { "epoch": 19.26782273603083, "eval_loss": 0.4237935543060303, "eval_runtime": 12.1552, "eval_samples_per_second": 1563.45, "eval_steps_per_second": 32.579, "step": 50000 }, { "epoch": 19.28709055876686, "grad_norm": 0.31283634901046753, "learning_rate": 0.00012285317919075145, "loss": 0.3640547561645508, "step": 50050 }, { "epoch": 19.28709055876686, "eval_loss": 0.42223528027534485, "eval_runtime": 12.0411, "eval_samples_per_second": 1578.257, "eval_steps_per_second": 32.887, "step": 50050 }, { "epoch": 19.30635838150289, "grad_norm": 0.3363703191280365, "learning_rate": 0.00012277610789980734, "loss": 0.35911605834960936, "step": 50100 }, { "epoch": 19.30635838150289, "eval_loss": 0.4291505217552185, "eval_runtime": 12.0903, "eval_samples_per_second": 1571.838, "eval_steps_per_second": 32.754, "step": 50100 }, { "epoch": 19.325626204238922, "grad_norm": 0.3916914761066437, "learning_rate": 0.0001226990366088632, "loss": 0.36371875762939454, "step": 50150 }, { "epoch": 19.325626204238922, "eval_loss": 0.42211446166038513, "eval_runtime": 12.014, "eval_samples_per_second": 1581.827, "eval_steps_per_second": 32.962, "step": 50150 }, { "epoch": 19.344894026974952, "grad_norm": 0.32586127519607544, "learning_rate": 0.00012262196531791908, "loss": 0.3599825668334961, "step": 50200 }, { "epoch": 19.344894026974952, "eval_loss": 0.42099514603614807, "eval_runtime": 12.0471, "eval_samples_per_second": 1577.47, "eval_steps_per_second": 32.871, "step": 50200 }, { "epoch": 19.36416184971098, "grad_norm": 0.38298705220222473, "learning_rate": 0.00012254489402697497, "loss": 0.36114139556884767, "step": 50250 }, { "epoch": 19.36416184971098, "eval_loss": 0.4180140793323517, "eval_runtime": 12.0957, "eval_samples_per_second": 1571.141, "eval_steps_per_second": 32.739, "step": 50250 }, { "epoch": 19.383429672447015, "grad_norm": 0.8411246538162231, "learning_rate": 0.00012246782273603082, "loss": 0.36475074768066407, "step": 50300 }, { "epoch": 19.383429672447015, "eval_loss": 0.42804837226867676, "eval_runtime": 12.0997, "eval_samples_per_second": 1570.615, "eval_steps_per_second": 32.728, "step": 50300 }, { "epoch": 19.402697495183045, "grad_norm": 0.39634594321250916, "learning_rate": 0.0001223907514450867, "loss": 0.3670159149169922, "step": 50350 }, { "epoch": 19.402697495183045, "eval_loss": 0.4158008396625519, "eval_runtime": 12.0857, "eval_samples_per_second": 1572.438, "eval_steps_per_second": 32.766, "step": 50350 }, { "epoch": 19.421965317919074, "grad_norm": 0.3939979374408722, "learning_rate": 0.0001223136801541426, "loss": 0.3654576110839844, "step": 50400 }, { "epoch": 19.421965317919074, "eval_loss": 0.4260885417461395, "eval_runtime": 12.0889, "eval_samples_per_second": 1572.026, "eval_steps_per_second": 32.757, "step": 50400 }, { "epoch": 19.441233140655108, "grad_norm": 0.3997485339641571, "learning_rate": 0.00012223660886319845, "loss": 0.3570298385620117, "step": 50450 }, { "epoch": 19.441233140655108, "eval_loss": 0.41741740703582764, "eval_runtime": 12.0853, "eval_samples_per_second": 1572.487, "eval_steps_per_second": 32.767, "step": 50450 }, { "epoch": 19.460500963391137, "grad_norm": 0.3188541531562805, "learning_rate": 0.00012215953757225434, "loss": 0.3572777557373047, "step": 50500 }, { "epoch": 19.460500963391137, "eval_loss": 0.41808047890663147, "eval_runtime": 12.122, "eval_samples_per_second": 1567.725, "eval_steps_per_second": 32.668, "step": 50500 }, { "epoch": 19.479768786127167, "grad_norm": 0.3340902030467987, "learning_rate": 0.00012208246628131022, "loss": 0.3600804901123047, "step": 50550 }, { "epoch": 19.479768786127167, "eval_loss": 0.42954519391059875, "eval_runtime": 12.1319, "eval_samples_per_second": 1566.448, "eval_steps_per_second": 32.641, "step": 50550 }, { "epoch": 19.4990366088632, "grad_norm": 0.4256540834903717, "learning_rate": 0.0001220053949903661, "loss": 0.3554991149902344, "step": 50600 }, { "epoch": 19.4990366088632, "eval_loss": 0.4197840988636017, "eval_runtime": 12.142, "eval_samples_per_second": 1565.14, "eval_steps_per_second": 32.614, "step": 50600 }, { "epoch": 19.51830443159923, "grad_norm": 0.3161504566669464, "learning_rate": 0.00012192832369942198, "loss": 0.35390022277832034, "step": 50650 }, { "epoch": 19.51830443159923, "eval_loss": 0.42888227105140686, "eval_runtime": 12.0754, "eval_samples_per_second": 1573.776, "eval_steps_per_second": 32.794, "step": 50650 }, { "epoch": 19.53757225433526, "grad_norm": 0.44025853276252747, "learning_rate": 0.00012185125240847785, "loss": 0.35846355438232425, "step": 50700 }, { "epoch": 19.53757225433526, "eval_loss": 0.41795915365219116, "eval_runtime": 12.2461, "eval_samples_per_second": 1551.846, "eval_steps_per_second": 32.337, "step": 50700 }, { "epoch": 19.556840077071293, "grad_norm": 0.3636535406112671, "learning_rate": 0.00012177418111753372, "loss": 0.35820960998535156, "step": 50750 }, { "epoch": 19.556840077071293, "eval_loss": 0.4251055419445038, "eval_runtime": 12.1725, "eval_samples_per_second": 1561.227, "eval_steps_per_second": 32.532, "step": 50750 }, { "epoch": 19.576107899807322, "grad_norm": 0.38904717564582825, "learning_rate": 0.00012169710982658959, "loss": 0.35741390228271486, "step": 50800 }, { "epoch": 19.576107899807322, "eval_loss": 0.4245252311229706, "eval_runtime": 12.1117, "eval_samples_per_second": 1569.063, "eval_steps_per_second": 32.696, "step": 50800 }, { "epoch": 19.595375722543352, "grad_norm": 0.35792076587677, "learning_rate": 0.00012162003853564548, "loss": 0.3619580078125, "step": 50850 }, { "epoch": 19.595375722543352, "eval_loss": 0.41335391998291016, "eval_runtime": 12.0945, "eval_samples_per_second": 1571.296, "eval_steps_per_second": 32.742, "step": 50850 }, { "epoch": 19.614643545279385, "grad_norm": 0.3174975514411926, "learning_rate": 0.00012154296724470136, "loss": 0.3633633041381836, "step": 50900 }, { "epoch": 19.614643545279385, "eval_loss": 0.4214610159397125, "eval_runtime": 12.2022, "eval_samples_per_second": 1557.427, "eval_steps_per_second": 32.453, "step": 50900 }, { "epoch": 19.633911368015415, "grad_norm": 0.393169641494751, "learning_rate": 0.00012146589595375723, "loss": 0.36145820617675783, "step": 50950 }, { "epoch": 19.633911368015415, "eval_loss": 0.40997618436813354, "eval_runtime": 12.1687, "eval_samples_per_second": 1561.708, "eval_steps_per_second": 32.542, "step": 50950 }, { "epoch": 19.653179190751445, "grad_norm": 0.3564261198043823, "learning_rate": 0.00012138882466281312, "loss": 0.3660158920288086, "step": 51000 }, { "epoch": 19.653179190751445, "eval_loss": 0.42227858304977417, "eval_runtime": 12.193, "eval_samples_per_second": 1558.597, "eval_steps_per_second": 32.478, "step": 51000 }, { "epoch": 19.672447013487474, "grad_norm": 0.3980845510959625, "learning_rate": 0.00012131175337186898, "loss": 0.36169197082519533, "step": 51050 }, { "epoch": 19.672447013487474, "eval_loss": 0.4209466278553009, "eval_runtime": 12.1713, "eval_samples_per_second": 1561.375, "eval_steps_per_second": 32.535, "step": 51050 }, { "epoch": 19.691714836223507, "grad_norm": 0.3733876347541809, "learning_rate": 0.00012123468208092486, "loss": 0.3570684051513672, "step": 51100 }, { "epoch": 19.691714836223507, "eval_loss": 0.4155440330505371, "eval_runtime": 12.2389, "eval_samples_per_second": 1552.752, "eval_steps_per_second": 32.356, "step": 51100 }, { "epoch": 19.710982658959537, "grad_norm": 0.380935937166214, "learning_rate": 0.00012115761078998073, "loss": 0.3577065277099609, "step": 51150 }, { "epoch": 19.710982658959537, "eval_loss": 0.4171113967895508, "eval_runtime": 12.0206, "eval_samples_per_second": 1580.953, "eval_steps_per_second": 32.943, "step": 51150 }, { "epoch": 19.730250481695567, "grad_norm": 0.3778274655342102, "learning_rate": 0.00012108053949903662, "loss": 0.36009002685546876, "step": 51200 }, { "epoch": 19.730250481695567, "eval_loss": 0.4108974039554596, "eval_runtime": 12.1191, "eval_samples_per_second": 1568.106, "eval_steps_per_second": 32.676, "step": 51200 }, { "epoch": 19.7495183044316, "grad_norm": 0.39145633578300476, "learning_rate": 0.0001210034682080925, "loss": 0.3628722763061523, "step": 51250 }, { "epoch": 19.7495183044316, "eval_loss": 0.4186082184314728, "eval_runtime": 12.0595, "eval_samples_per_second": 1575.857, "eval_steps_per_second": 32.837, "step": 51250 }, { "epoch": 19.76878612716763, "grad_norm": 0.3722204267978668, "learning_rate": 0.00012092639691714837, "loss": 0.36310630798339844, "step": 51300 }, { "epoch": 19.76878612716763, "eval_loss": 0.4244997799396515, "eval_runtime": 12.141, "eval_samples_per_second": 1565.275, "eval_steps_per_second": 32.617, "step": 51300 }, { "epoch": 19.78805394990366, "grad_norm": 0.3483392596244812, "learning_rate": 0.00012084932562620423, "loss": 0.3588508987426758, "step": 51350 }, { "epoch": 19.78805394990366, "eval_loss": 0.41594281792640686, "eval_runtime": 12.0221, "eval_samples_per_second": 1580.753, "eval_steps_per_second": 32.939, "step": 51350 }, { "epoch": 19.807321772639693, "grad_norm": 0.4394356906414032, "learning_rate": 0.00012077225433526012, "loss": 0.3582759857177734, "step": 51400 }, { "epoch": 19.807321772639693, "eval_loss": 0.41960832476615906, "eval_runtime": 12.1905, "eval_samples_per_second": 1558.914, "eval_steps_per_second": 32.484, "step": 51400 }, { "epoch": 19.826589595375722, "grad_norm": 0.3257826566696167, "learning_rate": 0.000120695183044316, "loss": 0.3618727493286133, "step": 51450 }, { "epoch": 19.826589595375722, "eval_loss": 0.4270622730255127, "eval_runtime": 12.2296, "eval_samples_per_second": 1553.932, "eval_steps_per_second": 32.38, "step": 51450 }, { "epoch": 19.845857418111752, "grad_norm": 0.33393871784210205, "learning_rate": 0.00012061811175337187, "loss": 0.3621659851074219, "step": 51500 }, { "epoch": 19.845857418111752, "eval_loss": 0.4190393090248108, "eval_runtime": 12.1381, "eval_samples_per_second": 1565.645, "eval_steps_per_second": 32.624, "step": 51500 }, { "epoch": 19.865125240847785, "grad_norm": 0.38052570819854736, "learning_rate": 0.00012054104046242776, "loss": 0.3648019027709961, "step": 51550 }, { "epoch": 19.865125240847785, "eval_loss": 0.4177800118923187, "eval_runtime": 12.1456, "eval_samples_per_second": 1564.688, "eval_steps_per_second": 32.605, "step": 51550 }, { "epoch": 19.884393063583815, "grad_norm": 0.35202693939208984, "learning_rate": 0.00012046396917148364, "loss": 0.362895622253418, "step": 51600 }, { "epoch": 19.884393063583815, "eval_loss": 0.4199948012828827, "eval_runtime": 12.0258, "eval_samples_per_second": 1580.268, "eval_steps_per_second": 32.929, "step": 51600 }, { "epoch": 19.903660886319845, "grad_norm": 0.35126644372940063, "learning_rate": 0.0001203868978805395, "loss": 0.35955322265625, "step": 51650 }, { "epoch": 19.903660886319845, "eval_loss": 0.4205935299396515, "eval_runtime": 12.0181, "eval_samples_per_second": 1581.278, "eval_steps_per_second": 32.95, "step": 51650 }, { "epoch": 19.922928709055878, "grad_norm": 0.3639732301235199, "learning_rate": 0.00012030982658959537, "loss": 0.3564771270751953, "step": 51700 }, { "epoch": 19.922928709055878, "eval_loss": 0.4160435199737549, "eval_runtime": 12.0348, "eval_samples_per_second": 1579.086, "eval_steps_per_second": 32.905, "step": 51700 }, { "epoch": 19.942196531791907, "grad_norm": 0.41602951288223267, "learning_rate": 0.00012023275529865126, "loss": 0.36025821685791015, "step": 51750 }, { "epoch": 19.942196531791907, "eval_loss": 0.4170264005661011, "eval_runtime": 12.0699, "eval_samples_per_second": 1574.494, "eval_steps_per_second": 32.809, "step": 51750 }, { "epoch": 19.961464354527937, "grad_norm": 0.3439793586730957, "learning_rate": 0.00012015568400770714, "loss": 0.35454124450683594, "step": 51800 }, { "epoch": 19.961464354527937, "eval_loss": 0.42058616876602173, "eval_runtime": 12.1104, "eval_samples_per_second": 1569.224, "eval_steps_per_second": 32.699, "step": 51800 }, { "epoch": 19.98073217726397, "grad_norm": 0.4180435836315155, "learning_rate": 0.00012007861271676301, "loss": 0.36064483642578127, "step": 51850 }, { "epoch": 19.98073217726397, "eval_loss": 0.41555726528167725, "eval_runtime": 12.2087, "eval_samples_per_second": 1556.594, "eval_steps_per_second": 32.436, "step": 51850 }, { "epoch": 20.0, "grad_norm": 0.3975246548652649, "learning_rate": 0.0001200015414258189, "loss": 0.36031639099121093, "step": 51900 }, { "epoch": 20.0, "eval_loss": 0.41387155652046204, "eval_runtime": 12.1766, "eval_samples_per_second": 1560.701, "eval_steps_per_second": 32.521, "step": 51900 }, { "epoch": 20.01926782273603, "grad_norm": 0.3329463601112366, "learning_rate": 0.00011992447013487476, "loss": 0.35928672790527344, "step": 51950 }, { "epoch": 20.01926782273603, "eval_loss": 0.42389217019081116, "eval_runtime": 12.1965, "eval_samples_per_second": 1558.154, "eval_steps_per_second": 32.468, "step": 51950 }, { "epoch": 20.038535645472063, "grad_norm": 0.3999747037887573, "learning_rate": 0.00011984739884393064, "loss": 0.3572080612182617, "step": 52000 }, { "epoch": 20.038535645472063, "eval_loss": 0.40875688195228577, "eval_runtime": 12.178, "eval_samples_per_second": 1560.519, "eval_steps_per_second": 32.518, "step": 52000 }, { "epoch": 20.057803468208093, "grad_norm": 0.38784340023994446, "learning_rate": 0.00011977032755298651, "loss": 0.3599992370605469, "step": 52050 }, { "epoch": 20.057803468208093, "eval_loss": 0.4158271253108978, "eval_runtime": 12.1538, "eval_samples_per_second": 1563.63, "eval_steps_per_second": 32.582, "step": 52050 }, { "epoch": 20.077071290944122, "grad_norm": 0.3226776123046875, "learning_rate": 0.0001196932562620424, "loss": 0.36180633544921875, "step": 52100 }, { "epoch": 20.077071290944122, "eval_loss": 0.41981297731399536, "eval_runtime": 12.1744, "eval_samples_per_second": 1560.978, "eval_steps_per_second": 32.527, "step": 52100 }, { "epoch": 20.096339113680155, "grad_norm": 0.49342647194862366, "learning_rate": 0.00011961618497109828, "loss": 0.35459587097167966, "step": 52150 }, { "epoch": 20.096339113680155, "eval_loss": 0.42088526487350464, "eval_runtime": 12.1936, "eval_samples_per_second": 1558.517, "eval_steps_per_second": 32.476, "step": 52150 }, { "epoch": 20.115606936416185, "grad_norm": 0.3595350384712219, "learning_rate": 0.00011953911368015415, "loss": 0.3529925537109375, "step": 52200 }, { "epoch": 20.115606936416185, "eval_loss": 0.42012855410575867, "eval_runtime": 12.0432, "eval_samples_per_second": 1577.983, "eval_steps_per_second": 32.882, "step": 52200 }, { "epoch": 20.134874759152215, "grad_norm": 0.38102030754089355, "learning_rate": 0.00011946204238921001, "loss": 0.3612397766113281, "step": 52250 }, { "epoch": 20.134874759152215, "eval_loss": 0.42699456214904785, "eval_runtime": 12.115, "eval_samples_per_second": 1568.629, "eval_steps_per_second": 32.687, "step": 52250 }, { "epoch": 20.154142581888248, "grad_norm": 0.37754902243614197, "learning_rate": 0.0001193849710982659, "loss": 0.35381710052490234, "step": 52300 }, { "epoch": 20.154142581888248, "eval_loss": 0.4168192744255066, "eval_runtime": 12.0351, "eval_samples_per_second": 1579.05, "eval_steps_per_second": 32.904, "step": 52300 }, { "epoch": 20.173410404624278, "grad_norm": 0.3398839831352234, "learning_rate": 0.00011930789980732178, "loss": 0.35640880584716794, "step": 52350 }, { "epoch": 20.173410404624278, "eval_loss": 0.4166288375854492, "eval_runtime": 12.133, "eval_samples_per_second": 1566.303, "eval_steps_per_second": 32.638, "step": 52350 }, { "epoch": 20.192678227360307, "grad_norm": 0.37457823753356934, "learning_rate": 0.00011923082851637765, "loss": 0.355689697265625, "step": 52400 }, { "epoch": 20.192678227360307, "eval_loss": 0.4156523644924164, "eval_runtime": 12.1384, "eval_samples_per_second": 1565.614, "eval_steps_per_second": 32.624, "step": 52400 }, { "epoch": 20.21194605009634, "grad_norm": 0.3379766047000885, "learning_rate": 0.00011915375722543354, "loss": 0.357186279296875, "step": 52450 }, { "epoch": 20.21194605009634, "eval_loss": 0.41222673654556274, "eval_runtime": 12.1234, "eval_samples_per_second": 1567.546, "eval_steps_per_second": 32.664, "step": 52450 }, { "epoch": 20.23121387283237, "grad_norm": 0.41455620527267456, "learning_rate": 0.00011907668593448942, "loss": 0.35832695007324217, "step": 52500 }, { "epoch": 20.23121387283237, "eval_loss": 0.4205981194972992, "eval_runtime": 12.0519, "eval_samples_per_second": 1576.844, "eval_steps_per_second": 32.858, "step": 52500 }, { "epoch": 20.2504816955684, "grad_norm": 0.3102385699748993, "learning_rate": 0.00011899961464354528, "loss": 0.35893932342529294, "step": 52550 }, { "epoch": 20.2504816955684, "eval_loss": 0.41143661737442017, "eval_runtime": 12.0913, "eval_samples_per_second": 1571.714, "eval_steps_per_second": 32.751, "step": 52550 }, { "epoch": 20.269749518304433, "grad_norm": 0.36088940501213074, "learning_rate": 0.00011892254335260115, "loss": 0.35765403747558594, "step": 52600 }, { "epoch": 20.269749518304433, "eval_loss": 0.40982508659362793, "eval_runtime": 12.1032, "eval_samples_per_second": 1570.161, "eval_steps_per_second": 32.719, "step": 52600 }, { "epoch": 20.289017341040463, "grad_norm": 0.3188185691833496, "learning_rate": 0.00011884547206165704, "loss": 0.3556815719604492, "step": 52650 }, { "epoch": 20.289017341040463, "eval_loss": 0.4124082028865814, "eval_runtime": 12.0366, "eval_samples_per_second": 1578.854, "eval_steps_per_second": 32.9, "step": 52650 }, { "epoch": 20.308285163776493, "grad_norm": 0.29560866951942444, "learning_rate": 0.00011876840077071292, "loss": 0.3513153839111328, "step": 52700 }, { "epoch": 20.308285163776493, "eval_loss": 0.4092627167701721, "eval_runtime": 12.0783, "eval_samples_per_second": 1573.396, "eval_steps_per_second": 32.786, "step": 52700 }, { "epoch": 20.327552986512526, "grad_norm": 0.34917014837265015, "learning_rate": 0.00011869132947976879, "loss": 0.35949459075927737, "step": 52750 }, { "epoch": 20.327552986512526, "eval_loss": 0.4150541424751282, "eval_runtime": 12.1896, "eval_samples_per_second": 1559.032, "eval_steps_per_second": 32.487, "step": 52750 }, { "epoch": 20.346820809248555, "grad_norm": 0.4206242859363556, "learning_rate": 0.00011861425818882468, "loss": 0.3619451141357422, "step": 52800 }, { "epoch": 20.346820809248555, "eval_loss": 0.4206058382987976, "eval_runtime": 12.1802, "eval_samples_per_second": 1560.233, "eval_steps_per_second": 32.512, "step": 52800 }, { "epoch": 20.366088631984585, "grad_norm": 0.3719010353088379, "learning_rate": 0.00011853718689788053, "loss": 0.35890453338623046, "step": 52850 }, { "epoch": 20.366088631984585, "eval_loss": 0.41743627190589905, "eval_runtime": 12.2116, "eval_samples_per_second": 1556.23, "eval_steps_per_second": 32.428, "step": 52850 }, { "epoch": 20.38535645472062, "grad_norm": 0.3581315875053406, "learning_rate": 0.00011846011560693642, "loss": 0.35322731018066406, "step": 52900 }, { "epoch": 20.38535645472062, "eval_loss": 0.4166901707649231, "eval_runtime": 12.4096, "eval_samples_per_second": 1531.397, "eval_steps_per_second": 31.911, "step": 52900 }, { "epoch": 20.404624277456648, "grad_norm": 0.4088922441005707, "learning_rate": 0.0001183830443159923, "loss": 0.3640705490112305, "step": 52950 }, { "epoch": 20.404624277456648, "eval_loss": 0.4155498445034027, "eval_runtime": 12.4158, "eval_samples_per_second": 1530.636, "eval_steps_per_second": 31.895, "step": 52950 }, { "epoch": 20.423892100192678, "grad_norm": 0.37212249636650085, "learning_rate": 0.00011830597302504818, "loss": 0.36070545196533205, "step": 53000 }, { "epoch": 20.423892100192678, "eval_loss": 0.41370269656181335, "eval_runtime": 12.0819, "eval_samples_per_second": 1572.934, "eval_steps_per_second": 32.776, "step": 53000 }, { "epoch": 20.443159922928707, "grad_norm": 0.36552977561950684, "learning_rate": 0.00011822890173410406, "loss": 0.3591091918945313, "step": 53050 }, { "epoch": 20.443159922928707, "eval_loss": 0.4179646372795105, "eval_runtime": 12.0984, "eval_samples_per_second": 1570.785, "eval_steps_per_second": 32.732, "step": 53050 }, { "epoch": 20.46242774566474, "grad_norm": 0.4622708857059479, "learning_rate": 0.00011815183044315994, "loss": 0.3591180419921875, "step": 53100 }, { "epoch": 20.46242774566474, "eval_loss": 0.42042773962020874, "eval_runtime": 12.1245, "eval_samples_per_second": 1567.41, "eval_steps_per_second": 32.661, "step": 53100 }, { "epoch": 20.48169556840077, "grad_norm": 0.5514500737190247, "learning_rate": 0.0001180747591522158, "loss": 0.36044227600097656, "step": 53150 }, { "epoch": 20.48169556840077, "eval_loss": 0.41498902440071106, "eval_runtime": 12.0948, "eval_samples_per_second": 1571.251, "eval_steps_per_second": 32.741, "step": 53150 }, { "epoch": 20.5009633911368, "grad_norm": 0.35944485664367676, "learning_rate": 0.00011799768786127167, "loss": 0.3590814208984375, "step": 53200 }, { "epoch": 20.5009633911368, "eval_loss": 0.4181993007659912, "eval_runtime": 12.0381, "eval_samples_per_second": 1578.656, "eval_steps_per_second": 32.896, "step": 53200 }, { "epoch": 20.520231213872833, "grad_norm": 0.4540524482727051, "learning_rate": 0.00011792061657032756, "loss": 0.35715648651123044, "step": 53250 }, { "epoch": 20.520231213872833, "eval_loss": 0.43060198426246643, "eval_runtime": 12.0908, "eval_samples_per_second": 1571.769, "eval_steps_per_second": 32.752, "step": 53250 }, { "epoch": 20.539499036608863, "grad_norm": 0.3450084328651428, "learning_rate": 0.00011784354527938344, "loss": 0.3551230239868164, "step": 53300 }, { "epoch": 20.539499036608863, "eval_loss": 0.42615166306495667, "eval_runtime": 12.0713, "eval_samples_per_second": 1574.313, "eval_steps_per_second": 32.805, "step": 53300 }, { "epoch": 20.558766859344892, "grad_norm": 0.32798218727111816, "learning_rate": 0.00011776647398843932, "loss": 0.35243480682373046, "step": 53350 }, { "epoch": 20.558766859344892, "eval_loss": 0.4261520504951477, "eval_runtime": 12.685, "eval_samples_per_second": 1498.145, "eval_steps_per_second": 31.218, "step": 53350 }, { "epoch": 20.578034682080926, "grad_norm": 0.36507973074913025, "learning_rate": 0.0001176894026974952, "loss": 0.3606936645507812, "step": 53400 }, { "epoch": 20.578034682080926, "eval_loss": 0.4256114065647125, "eval_runtime": 12.2122, "eval_samples_per_second": 1556.145, "eval_steps_per_second": 32.427, "step": 53400 }, { "epoch": 20.597302504816955, "grad_norm": 0.31863436102867126, "learning_rate": 0.00011761233140655106, "loss": 0.3565537643432617, "step": 53450 }, { "epoch": 20.597302504816955, "eval_loss": 0.4184532165527344, "eval_runtime": 12.6694, "eval_samples_per_second": 1499.991, "eval_steps_per_second": 31.256, "step": 53450 }, { "epoch": 20.616570327552985, "grad_norm": 0.48235222697257996, "learning_rate": 0.00011753526011560694, "loss": 0.36095123291015624, "step": 53500 }, { "epoch": 20.616570327552985, "eval_loss": 0.4232397675514221, "eval_runtime": 12.9014, "eval_samples_per_second": 1473.015, "eval_steps_per_second": 30.694, "step": 53500 }, { "epoch": 20.63583815028902, "grad_norm": 0.36226576566696167, "learning_rate": 0.00011745818882466281, "loss": 0.35940895080566404, "step": 53550 }, { "epoch": 20.63583815028902, "eval_loss": 0.41963067650794983, "eval_runtime": 12.7977, "eval_samples_per_second": 1484.958, "eval_steps_per_second": 30.943, "step": 53550 }, { "epoch": 20.655105973025048, "grad_norm": 0.3587161600589752, "learning_rate": 0.0001173811175337187, "loss": 0.35505332946777346, "step": 53600 }, { "epoch": 20.655105973025048, "eval_loss": 0.41790419816970825, "eval_runtime": 12.7169, "eval_samples_per_second": 1494.388, "eval_steps_per_second": 31.14, "step": 53600 }, { "epoch": 20.674373795761078, "grad_norm": 0.3891097903251648, "learning_rate": 0.00011730404624277458, "loss": 0.3622920989990234, "step": 53650 }, { "epoch": 20.674373795761078, "eval_loss": 0.4160598814487457, "eval_runtime": 12.1199, "eval_samples_per_second": 1567.995, "eval_steps_per_second": 32.673, "step": 53650 }, { "epoch": 20.69364161849711, "grad_norm": 0.35480010509490967, "learning_rate": 0.00011722697495183046, "loss": 0.3603193664550781, "step": 53700 }, { "epoch": 20.69364161849711, "eval_loss": 0.42304351925849915, "eval_runtime": 12.0702, "eval_samples_per_second": 1574.454, "eval_steps_per_second": 32.808, "step": 53700 }, { "epoch": 20.71290944123314, "grad_norm": 0.3697350323200226, "learning_rate": 0.00011714990366088631, "loss": 0.3624173355102539, "step": 53750 }, { "epoch": 20.71290944123314, "eval_loss": 0.4196506142616272, "eval_runtime": 12.1451, "eval_samples_per_second": 1564.752, "eval_steps_per_second": 32.606, "step": 53750 }, { "epoch": 20.73217726396917, "grad_norm": 0.4681817293167114, "learning_rate": 0.0001170728323699422, "loss": 0.3630880355834961, "step": 53800 }, { "epoch": 20.73217726396917, "eval_loss": 0.42394939064979553, "eval_runtime": 12.1272, "eval_samples_per_second": 1567.055, "eval_steps_per_second": 32.654, "step": 53800 }, { "epoch": 20.751445086705203, "grad_norm": 0.3331587612628937, "learning_rate": 0.00011699576107899808, "loss": 0.35883819580078125, "step": 53850 }, { "epoch": 20.751445086705203, "eval_loss": 0.41702550649642944, "eval_runtime": 12.1056, "eval_samples_per_second": 1569.85, "eval_steps_per_second": 32.712, "step": 53850 }, { "epoch": 20.770712909441233, "grad_norm": 0.39932551980018616, "learning_rate": 0.00011691868978805395, "loss": 0.35637008666992187, "step": 53900 }, { "epoch": 20.770712909441233, "eval_loss": 0.42241984605789185, "eval_runtime": 12.1225, "eval_samples_per_second": 1567.657, "eval_steps_per_second": 32.666, "step": 53900 }, { "epoch": 20.789980732177263, "grad_norm": 0.34856000542640686, "learning_rate": 0.00011684161849710984, "loss": 0.3548444366455078, "step": 53950 }, { "epoch": 20.789980732177263, "eval_loss": 0.42359086871147156, "eval_runtime": 12.0308, "eval_samples_per_second": 1579.617, "eval_steps_per_second": 32.916, "step": 53950 }, { "epoch": 20.809248554913296, "grad_norm": 0.39819589257240295, "learning_rate": 0.00011676454720616572, "loss": 0.3586331558227539, "step": 54000 }, { "epoch": 20.809248554913296, "eval_loss": 0.41960608959198, "eval_runtime": 12.0509, "eval_samples_per_second": 1576.981, "eval_steps_per_second": 32.861, "step": 54000 }, { "epoch": 20.828516377649326, "grad_norm": 0.3683141767978668, "learning_rate": 0.00011668747591522158, "loss": 0.36463436126708987, "step": 54050 }, { "epoch": 20.828516377649326, "eval_loss": 0.4089110791683197, "eval_runtime": 12.0338, "eval_samples_per_second": 1579.218, "eval_steps_per_second": 32.907, "step": 54050 }, { "epoch": 20.847784200385355, "grad_norm": 0.360200971364975, "learning_rate": 0.00011661040462427745, "loss": 0.3558652877807617, "step": 54100 }, { "epoch": 20.847784200385355, "eval_loss": 0.41880515217781067, "eval_runtime": 12.1809, "eval_samples_per_second": 1560.15, "eval_steps_per_second": 32.51, "step": 54100 }, { "epoch": 20.86705202312139, "grad_norm": 0.3719461262226105, "learning_rate": 0.00011653333333333334, "loss": 0.3630248641967773, "step": 54150 }, { "epoch": 20.86705202312139, "eval_loss": 0.42059755325317383, "eval_runtime": 12.0949, "eval_samples_per_second": 1571.236, "eval_steps_per_second": 32.741, "step": 54150 }, { "epoch": 20.886319845857418, "grad_norm": 0.3723220229148865, "learning_rate": 0.00011645626204238922, "loss": 0.36102703094482425, "step": 54200 }, { "epoch": 20.886319845857418, "eval_loss": 0.4182237684726715, "eval_runtime": 12.0462, "eval_samples_per_second": 1577.591, "eval_steps_per_second": 32.873, "step": 54200 }, { "epoch": 20.905587668593448, "grad_norm": 0.353498250246048, "learning_rate": 0.0001163791907514451, "loss": 0.35755126953125, "step": 54250 }, { "epoch": 20.905587668593448, "eval_loss": 0.4024893045425415, "eval_runtime": 12.2093, "eval_samples_per_second": 1556.52, "eval_steps_per_second": 32.434, "step": 54250 }, { "epoch": 20.92485549132948, "grad_norm": 0.36358556151390076, "learning_rate": 0.00011630211946050098, "loss": 0.3583490753173828, "step": 54300 }, { "epoch": 20.92485549132948, "eval_loss": 0.41749289631843567, "eval_runtime": 12.1076, "eval_samples_per_second": 1569.596, "eval_steps_per_second": 32.707, "step": 54300 }, { "epoch": 20.94412331406551, "grad_norm": 0.362910658121109, "learning_rate": 0.00011622504816955684, "loss": 0.3542874145507813, "step": 54350 }, { "epoch": 20.94412331406551, "eval_loss": 0.4076080024242401, "eval_runtime": 12.2067, "eval_samples_per_second": 1556.85, "eval_steps_per_second": 32.441, "step": 54350 }, { "epoch": 20.96339113680154, "grad_norm": 0.3788284957408905, "learning_rate": 0.00011614797687861272, "loss": 0.3517755889892578, "step": 54400 }, { "epoch": 20.96339113680154, "eval_loss": 0.408950537443161, "eval_runtime": 12.3369, "eval_samples_per_second": 1540.418, "eval_steps_per_second": 32.099, "step": 54400 }, { "epoch": 20.982658959537574, "grad_norm": 0.3347620368003845, "learning_rate": 0.00011607090558766859, "loss": 0.35588798522949217, "step": 54450 }, { "epoch": 20.982658959537574, "eval_loss": 0.41775646805763245, "eval_runtime": 12.0845, "eval_samples_per_second": 1572.593, "eval_steps_per_second": 32.769, "step": 54450 }, { "epoch": 21.001926782273603, "grad_norm": 0.33074629306793213, "learning_rate": 0.00011599383429672448, "loss": 0.3537546539306641, "step": 54500 }, { "epoch": 21.001926782273603, "eval_loss": 0.4146362841129303, "eval_runtime": 12.2728, "eval_samples_per_second": 1548.466, "eval_steps_per_second": 32.267, "step": 54500 }, { "epoch": 21.021194605009633, "grad_norm": 0.33995765447616577, "learning_rate": 0.00011591676300578036, "loss": 0.3521453857421875, "step": 54550 }, { "epoch": 21.021194605009633, "eval_loss": 0.41872039437294006, "eval_runtime": 12.1976, "eval_samples_per_second": 1558.011, "eval_steps_per_second": 32.465, "step": 54550 }, { "epoch": 21.040462427745666, "grad_norm": 0.37845438718795776, "learning_rate": 0.00011583969171483622, "loss": 0.3542487716674805, "step": 54600 }, { "epoch": 21.040462427745666, "eval_loss": 0.4134826064109802, "eval_runtime": 12.0465, "eval_samples_per_second": 1577.555, "eval_steps_per_second": 32.873, "step": 54600 }, { "epoch": 21.059730250481696, "grad_norm": 0.41664963960647583, "learning_rate": 0.00011576262042389209, "loss": 0.3539153671264648, "step": 54650 }, { "epoch": 21.059730250481696, "eval_loss": 0.4145791530609131, "eval_runtime": 12.0812, "eval_samples_per_second": 1573.028, "eval_steps_per_second": 32.778, "step": 54650 }, { "epoch": 21.078998073217726, "grad_norm": 0.440048485994339, "learning_rate": 0.00011568554913294798, "loss": 0.3561517333984375, "step": 54700 }, { "epoch": 21.078998073217726, "eval_loss": 0.41362762451171875, "eval_runtime": 12.0388, "eval_samples_per_second": 1578.565, "eval_steps_per_second": 32.894, "step": 54700 }, { "epoch": 21.09826589595376, "grad_norm": 0.42101022601127625, "learning_rate": 0.00011560847784200386, "loss": 0.3551408004760742, "step": 54750 }, { "epoch": 21.09826589595376, "eval_loss": 0.4193161725997925, "eval_runtime": 12.0263, "eval_samples_per_second": 1580.205, "eval_steps_per_second": 32.928, "step": 54750 }, { "epoch": 21.11753371868979, "grad_norm": 0.3845166563987732, "learning_rate": 0.00011553140655105973, "loss": 0.35883487701416017, "step": 54800 }, { "epoch": 21.11753371868979, "eval_loss": 0.4212951958179474, "eval_runtime": 12.0371, "eval_samples_per_second": 1578.788, "eval_steps_per_second": 32.898, "step": 54800 }, { "epoch": 21.136801541425818, "grad_norm": 0.37967413663864136, "learning_rate": 0.00011545433526011562, "loss": 0.35779556274414065, "step": 54850 }, { "epoch": 21.136801541425818, "eval_loss": 0.4108249843120575, "eval_runtime": 12.0975, "eval_samples_per_second": 1570.909, "eval_steps_per_second": 32.734, "step": 54850 }, { "epoch": 21.15606936416185, "grad_norm": 0.38102662563323975, "learning_rate": 0.00011537726396917148, "loss": 0.35724754333496095, "step": 54900 }, { "epoch": 21.15606936416185, "eval_loss": 0.40588462352752686, "eval_runtime": 12.1452, "eval_samples_per_second": 1564.737, "eval_steps_per_second": 32.606, "step": 54900 }, { "epoch": 21.17533718689788, "grad_norm": 0.37619784474372864, "learning_rate": 0.00011530019267822736, "loss": 0.3637507247924805, "step": 54950 }, { "epoch": 21.17533718689788, "eval_loss": 0.4160265326499939, "eval_runtime": 12.1395, "eval_samples_per_second": 1565.472, "eval_steps_per_second": 32.621, "step": 54950 }, { "epoch": 21.19460500963391, "grad_norm": 0.39067137241363525, "learning_rate": 0.00011522312138728325, "loss": 0.3563017272949219, "step": 55000 }, { "epoch": 21.19460500963391, "eval_loss": 0.41210511326789856, "eval_runtime": 12.1948, "eval_samples_per_second": 1558.374, "eval_steps_per_second": 32.473, "step": 55000 }, { "epoch": 21.213872832369944, "grad_norm": 0.37263771891593933, "learning_rate": 0.00011514605009633912, "loss": 0.35384593963623046, "step": 55050 }, { "epoch": 21.213872832369944, "eval_loss": 0.41609421372413635, "eval_runtime": 12.1471, "eval_samples_per_second": 1564.483, "eval_steps_per_second": 32.6, "step": 55050 }, { "epoch": 21.233140655105974, "grad_norm": 0.3829081356525421, "learning_rate": 0.000115068978805395, "loss": 0.35456024169921874, "step": 55100 }, { "epoch": 21.233140655105974, "eval_loss": 0.40526431798934937, "eval_runtime": 12.1131, "eval_samples_per_second": 1568.884, "eval_steps_per_second": 32.692, "step": 55100 }, { "epoch": 21.252408477842003, "grad_norm": 0.3902374505996704, "learning_rate": 0.00011499190751445089, "loss": 0.35820541381835935, "step": 55150 }, { "epoch": 21.252408477842003, "eval_loss": 0.415315181016922, "eval_runtime": 12.1098, "eval_samples_per_second": 1569.306, "eval_steps_per_second": 32.701, "step": 55150 }, { "epoch": 21.271676300578033, "grad_norm": 0.3511368930339813, "learning_rate": 0.00011491483622350674, "loss": 0.35442337036132815, "step": 55200 }, { "epoch": 21.271676300578033, "eval_loss": 0.41814783215522766, "eval_runtime": 12.0384, "eval_samples_per_second": 1578.614, "eval_steps_per_second": 32.895, "step": 55200 }, { "epoch": 21.290944123314066, "grad_norm": 0.31765466928482056, "learning_rate": 0.00011483776493256262, "loss": 0.35568077087402344, "step": 55250 }, { "epoch": 21.290944123314066, "eval_loss": 0.41436851024627686, "eval_runtime": 12.0619, "eval_samples_per_second": 1575.533, "eval_steps_per_second": 32.831, "step": 55250 }, { "epoch": 21.310211946050096, "grad_norm": 0.37187883257865906, "learning_rate": 0.0001147606936416185, "loss": 0.3571092987060547, "step": 55300 }, { "epoch": 21.310211946050096, "eval_loss": 0.4027104079723358, "eval_runtime": 12.049, "eval_samples_per_second": 1577.229, "eval_steps_per_second": 32.866, "step": 55300 }, { "epoch": 21.329479768786126, "grad_norm": 0.37617027759552, "learning_rate": 0.00011468362235067439, "loss": 0.3604891586303711, "step": 55350 }, { "epoch": 21.329479768786126, "eval_loss": 0.41911813616752625, "eval_runtime": 12.2221, "eval_samples_per_second": 1554.893, "eval_steps_per_second": 32.4, "step": 55350 }, { "epoch": 21.34874759152216, "grad_norm": 0.39358028769493103, "learning_rate": 0.00011460655105973026, "loss": 0.35148063659667966, "step": 55400 }, { "epoch": 21.34874759152216, "eval_loss": 0.42192181944847107, "eval_runtime": 12.0468, "eval_samples_per_second": 1577.521, "eval_steps_per_second": 32.872, "step": 55400 }, { "epoch": 21.36801541425819, "grad_norm": 0.3331373333930969, "learning_rate": 0.00011452947976878614, "loss": 0.3625579833984375, "step": 55450 }, { "epoch": 21.36801541425819, "eval_loss": 0.4189731776714325, "eval_runtime": 12.1707, "eval_samples_per_second": 1561.455, "eval_steps_per_second": 32.537, "step": 55450 }, { "epoch": 21.387283236994218, "grad_norm": 0.3589906692504883, "learning_rate": 0.000114452408477842, "loss": 0.35813194274902344, "step": 55500 }, { "epoch": 21.387283236994218, "eval_loss": 0.4117266833782196, "eval_runtime": 12.0343, "eval_samples_per_second": 1579.153, "eval_steps_per_second": 32.906, "step": 55500 }, { "epoch": 21.40655105973025, "grad_norm": 0.3449852466583252, "learning_rate": 0.00011437533718689788, "loss": 0.3636948776245117, "step": 55550 }, { "epoch": 21.40655105973025, "eval_loss": 0.42043760418891907, "eval_runtime": 12.0518, "eval_samples_per_second": 1576.86, "eval_steps_per_second": 32.858, "step": 55550 }, { "epoch": 21.42581888246628, "grad_norm": 0.342980295419693, "learning_rate": 0.00011429826589595376, "loss": 0.3500786209106445, "step": 55600 }, { "epoch": 21.42581888246628, "eval_loss": 0.4173811674118042, "eval_runtime": 12.0571, "eval_samples_per_second": 1576.164, "eval_steps_per_second": 32.844, "step": 55600 }, { "epoch": 21.44508670520231, "grad_norm": 0.3718242049217224, "learning_rate": 0.00011422119460500964, "loss": 0.3487553024291992, "step": 55650 }, { "epoch": 21.44508670520231, "eval_loss": 0.4220547378063202, "eval_runtime": 12.1646, "eval_samples_per_second": 1562.244, "eval_steps_per_second": 32.554, "step": 55650 }, { "epoch": 21.464354527938344, "grad_norm": 0.36743447184562683, "learning_rate": 0.00011414412331406553, "loss": 0.3527417373657227, "step": 55700 }, { "epoch": 21.464354527938344, "eval_loss": 0.421878844499588, "eval_runtime": 12.0976, "eval_samples_per_second": 1570.893, "eval_steps_per_second": 32.734, "step": 55700 }, { "epoch": 21.483622350674374, "grad_norm": 0.36079153418540955, "learning_rate": 0.0001140670520231214, "loss": 0.35547962188720705, "step": 55750 }, { "epoch": 21.483622350674374, "eval_loss": 0.4217149317264557, "eval_runtime": 12.0504, "eval_samples_per_second": 1577.037, "eval_steps_per_second": 32.862, "step": 55750 }, { "epoch": 21.502890173410403, "grad_norm": 0.38862285017967224, "learning_rate": 0.00011398998073217725, "loss": 0.3568857574462891, "step": 55800 }, { "epoch": 21.502890173410403, "eval_loss": 0.4068288207054138, "eval_runtime": 12.0947, "eval_samples_per_second": 1571.264, "eval_steps_per_second": 32.742, "step": 55800 }, { "epoch": 21.522157996146436, "grad_norm": 0.36001691222190857, "learning_rate": 0.00011391290944123314, "loss": 0.35681900024414065, "step": 55850 }, { "epoch": 21.522157996146436, "eval_loss": 0.4152238667011261, "eval_runtime": 12.0929, "eval_samples_per_second": 1571.507, "eval_steps_per_second": 32.747, "step": 55850 }, { "epoch": 21.541425818882466, "grad_norm": 0.33467957377433777, "learning_rate": 0.00011383583815028902, "loss": 0.355180549621582, "step": 55900 }, { "epoch": 21.541425818882466, "eval_loss": 0.4176250994205475, "eval_runtime": 12.1311, "eval_samples_per_second": 1566.549, "eval_steps_per_second": 32.643, "step": 55900 }, { "epoch": 21.560693641618496, "grad_norm": 0.3940584361553192, "learning_rate": 0.0001137587668593449, "loss": 0.35159103393554686, "step": 55950 }, { "epoch": 21.560693641618496, "eval_loss": 0.4150150716304779, "eval_runtime": 12.059, "eval_samples_per_second": 1575.912, "eval_steps_per_second": 32.838, "step": 55950 }, { "epoch": 21.57996146435453, "grad_norm": 0.3203458786010742, "learning_rate": 0.00011368169556840078, "loss": 0.353521842956543, "step": 56000 }, { "epoch": 21.57996146435453, "eval_loss": 0.4191727340221405, "eval_runtime": 12.0706, "eval_samples_per_second": 1574.407, "eval_steps_per_second": 32.807, "step": 56000 }, { "epoch": 21.59922928709056, "grad_norm": 0.3858794867992401, "learning_rate": 0.00011360462427745667, "loss": 0.35966934204101564, "step": 56050 }, { "epoch": 21.59922928709056, "eval_loss": 0.42841023206710815, "eval_runtime": 12.2137, "eval_samples_per_second": 1555.963, "eval_steps_per_second": 32.423, "step": 56050 }, { "epoch": 21.61849710982659, "grad_norm": 0.36935701966285706, "learning_rate": 0.00011352755298651252, "loss": 0.35433197021484375, "step": 56100 }, { "epoch": 21.61849710982659, "eval_loss": 0.4283091425895691, "eval_runtime": 12.229, "eval_samples_per_second": 1554.008, "eval_steps_per_second": 32.382, "step": 56100 }, { "epoch": 21.63776493256262, "grad_norm": 0.40405070781707764, "learning_rate": 0.0001134504816955684, "loss": 0.35597938537597656, "step": 56150 }, { "epoch": 21.63776493256262, "eval_loss": 0.4190777838230133, "eval_runtime": 12.0419, "eval_samples_per_second": 1578.15, "eval_steps_per_second": 32.885, "step": 56150 }, { "epoch": 21.65703275529865, "grad_norm": 0.3257676362991333, "learning_rate": 0.00011337341040462428, "loss": 0.36185340881347655, "step": 56200 }, { "epoch": 21.65703275529865, "eval_loss": 0.4192923307418823, "eval_runtime": 12.2774, "eval_samples_per_second": 1547.889, "eval_steps_per_second": 32.254, "step": 56200 }, { "epoch": 21.67630057803468, "grad_norm": 0.35276421904563904, "learning_rate": 0.00011329633911368016, "loss": 0.3590824508666992, "step": 56250 }, { "epoch": 21.67630057803468, "eval_loss": 0.4174362123012543, "eval_runtime": 12.1461, "eval_samples_per_second": 1564.614, "eval_steps_per_second": 32.603, "step": 56250 }, { "epoch": 21.695568400770714, "grad_norm": 0.3440348505973816, "learning_rate": 0.00011321926782273604, "loss": 0.35351554870605467, "step": 56300 }, { "epoch": 21.695568400770714, "eval_loss": 0.4185763895511627, "eval_runtime": 12.1416, "eval_samples_per_second": 1565.202, "eval_steps_per_second": 32.615, "step": 56300 }, { "epoch": 21.714836223506744, "grad_norm": 0.3638891279697418, "learning_rate": 0.00011314219653179192, "loss": 0.35607933044433593, "step": 56350 }, { "epoch": 21.714836223506744, "eval_loss": 0.41225162148475647, "eval_runtime": 12.1483, "eval_samples_per_second": 1564.339, "eval_steps_per_second": 32.597, "step": 56350 }, { "epoch": 21.734104046242773, "grad_norm": 0.30850282311439514, "learning_rate": 0.00011306512524084778, "loss": 0.3489084243774414, "step": 56400 }, { "epoch": 21.734104046242773, "eval_loss": 0.41308078169822693, "eval_runtime": 12.1809, "eval_samples_per_second": 1560.15, "eval_steps_per_second": 32.51, "step": 56400 }, { "epoch": 21.753371868978807, "grad_norm": 0.318661630153656, "learning_rate": 0.00011298805394990366, "loss": 0.35631561279296875, "step": 56450 }, { "epoch": 21.753371868978807, "eval_loss": 0.4105689525604248, "eval_runtime": 12.1871, "eval_samples_per_second": 1559.35, "eval_steps_per_second": 32.493, "step": 56450 }, { "epoch": 21.772639691714836, "grad_norm": 0.3455582559108734, "learning_rate": 0.00011291098265895953, "loss": 0.3510941696166992, "step": 56500 }, { "epoch": 21.772639691714836, "eval_loss": 0.39998939633369446, "eval_runtime": 12.1602, "eval_samples_per_second": 1562.803, "eval_steps_per_second": 32.565, "step": 56500 }, { "epoch": 21.791907514450866, "grad_norm": 0.35619091987609863, "learning_rate": 0.00011283391136801542, "loss": 0.35710830688476564, "step": 56550 }, { "epoch": 21.791907514450866, "eval_loss": 0.4072059690952301, "eval_runtime": 12.2972, "eval_samples_per_second": 1545.391, "eval_steps_per_second": 32.202, "step": 56550 }, { "epoch": 21.8111753371869, "grad_norm": 0.33309003710746765, "learning_rate": 0.0001127568400770713, "loss": 0.35205902099609376, "step": 56600 }, { "epoch": 21.8111753371869, "eval_loss": 0.41570067405700684, "eval_runtime": 12.2863, "eval_samples_per_second": 1546.759, "eval_steps_per_second": 32.231, "step": 56600 }, { "epoch": 21.83044315992293, "grad_norm": 0.41228538751602173, "learning_rate": 0.00011267976878612718, "loss": 0.351953125, "step": 56650 }, { "epoch": 21.83044315992293, "eval_loss": 0.4084567129611969, "eval_runtime": 12.1828, "eval_samples_per_second": 1559.907, "eval_steps_per_second": 32.505, "step": 56650 }, { "epoch": 21.84971098265896, "grad_norm": 0.3611506223678589, "learning_rate": 0.00011260269749518303, "loss": 0.3584119415283203, "step": 56700 }, { "epoch": 21.84971098265896, "eval_loss": 0.41732168197631836, "eval_runtime": 12.223, "eval_samples_per_second": 1554.774, "eval_steps_per_second": 32.398, "step": 56700 }, { "epoch": 21.868978805394992, "grad_norm": 0.3314407169818878, "learning_rate": 0.00011252562620423892, "loss": 0.3531324005126953, "step": 56750 }, { "epoch": 21.868978805394992, "eval_loss": 0.4154232442378998, "eval_runtime": 12.163, "eval_samples_per_second": 1562.437, "eval_steps_per_second": 32.558, "step": 56750 }, { "epoch": 21.88824662813102, "grad_norm": 0.3601033091545105, "learning_rate": 0.0001124485549132948, "loss": 0.35515159606933594, "step": 56800 }, { "epoch": 21.88824662813102, "eval_loss": 0.41135695576667786, "eval_runtime": 12.1265, "eval_samples_per_second": 1567.15, "eval_steps_per_second": 32.656, "step": 56800 }, { "epoch": 21.90751445086705, "grad_norm": 0.3124025762081146, "learning_rate": 0.00011237148362235069, "loss": 0.3532611083984375, "step": 56850 }, { "epoch": 21.90751445086705, "eval_loss": 0.4155973792076111, "eval_runtime": 12.1342, "eval_samples_per_second": 1566.157, "eval_steps_per_second": 32.635, "step": 56850 }, { "epoch": 21.926782273603084, "grad_norm": 0.3524884879589081, "learning_rate": 0.00011229441233140656, "loss": 0.35065345764160155, "step": 56900 }, { "epoch": 21.926782273603084, "eval_loss": 0.4157809317111969, "eval_runtime": 12.1667, "eval_samples_per_second": 1561.972, "eval_steps_per_second": 32.548, "step": 56900 }, { "epoch": 21.946050096339114, "grad_norm": 0.35149919986724854, "learning_rate": 0.00011221734104046244, "loss": 0.3599644088745117, "step": 56950 }, { "epoch": 21.946050096339114, "eval_loss": 0.41941624879837036, "eval_runtime": 12.1302, "eval_samples_per_second": 1566.666, "eval_steps_per_second": 32.646, "step": 56950 }, { "epoch": 21.965317919075144, "grad_norm": 0.3715202212333679, "learning_rate": 0.0001121402697495183, "loss": 0.3529638671875, "step": 57000 }, { "epoch": 21.965317919075144, "eval_loss": 0.4216988980770111, "eval_runtime": 12.1014, "eval_samples_per_second": 1570.402, "eval_steps_per_second": 32.724, "step": 57000 }, { "epoch": 21.984585741811177, "grad_norm": 0.384132981300354, "learning_rate": 0.00011206319845857419, "loss": 0.35253463745117186, "step": 57050 }, { "epoch": 21.984585741811177, "eval_loss": 0.4179840087890625, "eval_runtime": 12.0448, "eval_samples_per_second": 1577.771, "eval_steps_per_second": 32.877, "step": 57050 }, { "epoch": 22.003853564547207, "grad_norm": 0.37805119156837463, "learning_rate": 0.00011198612716763006, "loss": 0.3537337875366211, "step": 57100 }, { "epoch": 22.003853564547207, "eval_loss": 0.41360199451446533, "eval_runtime": 12.0557, "eval_samples_per_second": 1576.346, "eval_steps_per_second": 32.847, "step": 57100 }, { "epoch": 22.023121387283236, "grad_norm": 0.3429993689060211, "learning_rate": 0.00011190905587668594, "loss": 0.3584504699707031, "step": 57150 }, { "epoch": 22.023121387283236, "eval_loss": 0.4044531285762787, "eval_runtime": 12.1396, "eval_samples_per_second": 1565.451, "eval_steps_per_second": 32.62, "step": 57150 }, { "epoch": 22.04238921001927, "grad_norm": 0.3374336063861847, "learning_rate": 0.00011183198458574183, "loss": 0.35786556243896483, "step": 57200 }, { "epoch": 22.04238921001927, "eval_loss": 0.41007694602012634, "eval_runtime": 12.2117, "eval_samples_per_second": 1556.209, "eval_steps_per_second": 32.428, "step": 57200 }, { "epoch": 22.0616570327553, "grad_norm": 0.38325121998786926, "learning_rate": 0.0001117549132947977, "loss": 0.3517098236083984, "step": 57250 }, { "epoch": 22.0616570327553, "eval_loss": 0.41289904713630676, "eval_runtime": 12.1186, "eval_samples_per_second": 1568.166, "eval_steps_per_second": 32.677, "step": 57250 }, { "epoch": 22.08092485549133, "grad_norm": 0.41371607780456543, "learning_rate": 0.00011167784200385356, "loss": 0.3532809066772461, "step": 57300 }, { "epoch": 22.08092485549133, "eval_loss": 0.41362932324409485, "eval_runtime": 12.1076, "eval_samples_per_second": 1569.597, "eval_steps_per_second": 32.707, "step": 57300 }, { "epoch": 22.10019267822736, "grad_norm": 0.4005432724952698, "learning_rate": 0.00011160077071290944, "loss": 0.3564036560058594, "step": 57350 }, { "epoch": 22.10019267822736, "eval_loss": 0.416367769241333, "eval_runtime": 12.1113, "eval_samples_per_second": 1569.114, "eval_steps_per_second": 32.697, "step": 57350 }, { "epoch": 22.119460500963392, "grad_norm": 0.36728349328041077, "learning_rate": 0.00011152369942196533, "loss": 0.35272491455078125, "step": 57400 }, { "epoch": 22.119460500963392, "eval_loss": 0.4165042042732239, "eval_runtime": 12.2076, "eval_samples_per_second": 1556.74, "eval_steps_per_second": 32.439, "step": 57400 }, { "epoch": 22.13872832369942, "grad_norm": 0.3308110237121582, "learning_rate": 0.0001114466281310212, "loss": 0.34692848205566407, "step": 57450 }, { "epoch": 22.13872832369942, "eval_loss": 0.4207684099674225, "eval_runtime": 12.227, "eval_samples_per_second": 1554.268, "eval_steps_per_second": 32.387, "step": 57450 }, { "epoch": 22.15799614643545, "grad_norm": 0.3472847044467926, "learning_rate": 0.00011136955684007708, "loss": 0.3573005676269531, "step": 57500 }, { "epoch": 22.15799614643545, "eval_loss": 0.4147140681743622, "eval_runtime": 12.1403, "eval_samples_per_second": 1565.363, "eval_steps_per_second": 32.619, "step": 57500 }, { "epoch": 22.177263969171484, "grad_norm": 0.3373139798641205, "learning_rate": 0.00011129248554913297, "loss": 0.35129058837890625, "step": 57550 }, { "epoch": 22.177263969171484, "eval_loss": 0.4143894612789154, "eval_runtime": 12.1688, "eval_samples_per_second": 1561.698, "eval_steps_per_second": 32.542, "step": 57550 }, { "epoch": 22.196531791907514, "grad_norm": 0.4636562168598175, "learning_rate": 0.00011121541425818883, "loss": 0.35109283447265627, "step": 57600 }, { "epoch": 22.196531791907514, "eval_loss": 0.41262710094451904, "eval_runtime": 12.2484, "eval_samples_per_second": 1551.548, "eval_steps_per_second": 32.331, "step": 57600 }, { "epoch": 22.215799614643544, "grad_norm": 0.33200421929359436, "learning_rate": 0.0001111383429672447, "loss": 0.35496768951416013, "step": 57650 }, { "epoch": 22.215799614643544, "eval_loss": 0.41630789637565613, "eval_runtime": 12.1278, "eval_samples_per_second": 1566.976, "eval_steps_per_second": 32.652, "step": 57650 }, { "epoch": 22.235067437379577, "grad_norm": 0.352877676486969, "learning_rate": 0.00011106127167630058, "loss": 0.3459299850463867, "step": 57700 }, { "epoch": 22.235067437379577, "eval_loss": 0.4143134653568268, "eval_runtime": 12.033, "eval_samples_per_second": 1579.322, "eval_steps_per_second": 32.909, "step": 57700 }, { "epoch": 22.254335260115607, "grad_norm": 0.358334481716156, "learning_rate": 0.00011098420038535647, "loss": 0.3567245101928711, "step": 57750 }, { "epoch": 22.254335260115607, "eval_loss": 0.4189992845058441, "eval_runtime": 12.0104, "eval_samples_per_second": 1582.292, "eval_steps_per_second": 32.971, "step": 57750 }, { "epoch": 22.273603082851636, "grad_norm": 0.3364226818084717, "learning_rate": 0.00011090712909441234, "loss": 0.35298091888427735, "step": 57800 }, { "epoch": 22.273603082851636, "eval_loss": 0.41492825746536255, "eval_runtime": 12.2581, "eval_samples_per_second": 1550.317, "eval_steps_per_second": 32.305, "step": 57800 }, { "epoch": 22.29287090558767, "grad_norm": 0.33699947595596313, "learning_rate": 0.00011083005780346822, "loss": 0.35319412231445313, "step": 57850 }, { "epoch": 22.29287090558767, "eval_loss": 0.4209417700767517, "eval_runtime": 12.4687, "eval_samples_per_second": 1524.142, "eval_steps_per_second": 31.76, "step": 57850 }, { "epoch": 22.3121387283237, "grad_norm": 0.3527465760707855, "learning_rate": 0.00011075298651252408, "loss": 0.34989601135253906, "step": 57900 }, { "epoch": 22.3121387283237, "eval_loss": 0.42414164543151855, "eval_runtime": 12.5223, "eval_samples_per_second": 1517.614, "eval_steps_per_second": 31.624, "step": 57900 }, { "epoch": 22.33140655105973, "grad_norm": 0.3275480568408966, "learning_rate": 0.00011067591522157997, "loss": 0.35313419342041014, "step": 57950 }, { "epoch": 22.33140655105973, "eval_loss": 0.41342833638191223, "eval_runtime": 12.2977, "eval_samples_per_second": 1545.328, "eval_steps_per_second": 32.201, "step": 57950 }, { "epoch": 22.350674373795762, "grad_norm": 0.32702189683914185, "learning_rate": 0.00011059884393063584, "loss": 0.34983497619628906, "step": 58000 }, { "epoch": 22.350674373795762, "eval_loss": 0.41460469365119934, "eval_runtime": 12.2631, "eval_samples_per_second": 1549.686, "eval_steps_per_second": 32.292, "step": 58000 }, { "epoch": 22.36994219653179, "grad_norm": 0.3292064964771271, "learning_rate": 0.00011052177263969172, "loss": 0.3603087997436523, "step": 58050 }, { "epoch": 22.36994219653179, "eval_loss": 0.4162609577178955, "eval_runtime": 12.3442, "eval_samples_per_second": 1539.502, "eval_steps_per_second": 32.08, "step": 58050 }, { "epoch": 22.38921001926782, "grad_norm": 0.39785143733024597, "learning_rate": 0.00011044470134874761, "loss": 0.35370674133300783, "step": 58100 }, { "epoch": 22.38921001926782, "eval_loss": 0.42017459869384766, "eval_runtime": 11.9885, "eval_samples_per_second": 1585.189, "eval_steps_per_second": 33.032, "step": 58100 }, { "epoch": 22.408477842003855, "grad_norm": 0.38202109932899475, "learning_rate": 0.00011036763005780348, "loss": 0.3530514144897461, "step": 58150 }, { "epoch": 22.408477842003855, "eval_loss": 0.4211256206035614, "eval_runtime": 12.1447, "eval_samples_per_second": 1564.8, "eval_steps_per_second": 32.607, "step": 58150 }, { "epoch": 22.427745664739884, "grad_norm": 0.35856661200523376, "learning_rate": 0.00011029055876685934, "loss": 0.35643054962158205, "step": 58200 }, { "epoch": 22.427745664739884, "eval_loss": 0.4115031659603119, "eval_runtime": 12.023, "eval_samples_per_second": 1580.635, "eval_steps_per_second": 32.937, "step": 58200 }, { "epoch": 22.447013487475914, "grad_norm": 0.339470773935318, "learning_rate": 0.00011021348747591522, "loss": 0.3489860534667969, "step": 58250 }, { "epoch": 22.447013487475914, "eval_loss": 0.4180634617805481, "eval_runtime": 12.1727, "eval_samples_per_second": 1561.2, "eval_steps_per_second": 32.532, "step": 58250 }, { "epoch": 22.466281310211947, "grad_norm": 0.34379079937934875, "learning_rate": 0.0001101364161849711, "loss": 0.3544771957397461, "step": 58300 }, { "epoch": 22.466281310211947, "eval_loss": 0.4173836410045624, "eval_runtime": 12.2749, "eval_samples_per_second": 1548.2, "eval_steps_per_second": 32.261, "step": 58300 }, { "epoch": 22.485549132947977, "grad_norm": 0.3687109649181366, "learning_rate": 0.00011005934489402698, "loss": 0.3450492477416992, "step": 58350 }, { "epoch": 22.485549132947977, "eval_loss": 0.4108184278011322, "eval_runtime": 12.3183, "eval_samples_per_second": 1542.739, "eval_steps_per_second": 32.147, "step": 58350 }, { "epoch": 22.504816955684007, "grad_norm": 0.36188799142837524, "learning_rate": 0.00010998227360308286, "loss": 0.35666522979736326, "step": 58400 }, { "epoch": 22.504816955684007, "eval_loss": 0.4142003655433655, "eval_runtime": 12.1158, "eval_samples_per_second": 1568.534, "eval_steps_per_second": 32.685, "step": 58400 }, { "epoch": 22.52408477842004, "grad_norm": 0.3630635738372803, "learning_rate": 0.00010990520231213875, "loss": 0.3538875961303711, "step": 58450 }, { "epoch": 22.52408477842004, "eval_loss": 0.4024921655654907, "eval_runtime": 12.1499, "eval_samples_per_second": 1564.132, "eval_steps_per_second": 32.593, "step": 58450 }, { "epoch": 22.54335260115607, "grad_norm": 0.35822710394859314, "learning_rate": 0.0001098281310211946, "loss": 0.3465740966796875, "step": 58500 }, { "epoch": 22.54335260115607, "eval_loss": 0.4117197096347809, "eval_runtime": 12.1747, "eval_samples_per_second": 1560.939, "eval_steps_per_second": 32.526, "step": 58500 }, { "epoch": 22.5626204238921, "grad_norm": 0.4173564910888672, "learning_rate": 0.00010975105973025048, "loss": 0.35478973388671875, "step": 58550 }, { "epoch": 22.5626204238921, "eval_loss": 0.40693268179893494, "eval_runtime": 12.3773, "eval_samples_per_second": 1535.395, "eval_steps_per_second": 31.994, "step": 58550 }, { "epoch": 22.581888246628132, "grad_norm": 0.33736884593963623, "learning_rate": 0.00010967398843930636, "loss": 0.34911277770996096, "step": 58600 }, { "epoch": 22.581888246628132, "eval_loss": 0.40313780307769775, "eval_runtime": 12.4788, "eval_samples_per_second": 1522.9, "eval_steps_per_second": 31.734, "step": 58600 }, { "epoch": 22.601156069364162, "grad_norm": 0.33598583936691284, "learning_rate": 0.00010959691714836225, "loss": 0.3538917922973633, "step": 58650 }, { "epoch": 22.601156069364162, "eval_loss": 0.40756577253341675, "eval_runtime": 12.5367, "eval_samples_per_second": 1515.866, "eval_steps_per_second": 31.587, "step": 58650 }, { "epoch": 22.62042389210019, "grad_norm": 0.3624790608882904, "learning_rate": 0.00010951984585741812, "loss": 0.3584502410888672, "step": 58700 }, { "epoch": 22.62042389210019, "eval_loss": 0.4110408425331116, "eval_runtime": 12.3607, "eval_samples_per_second": 1537.45, "eval_steps_per_second": 32.037, "step": 58700 }, { "epoch": 22.639691714836225, "grad_norm": 0.34844496846199036, "learning_rate": 0.000109442774566474, "loss": 0.35319419860839846, "step": 58750 }, { "epoch": 22.639691714836225, "eval_loss": 0.4123121500015259, "eval_runtime": 12.1813, "eval_samples_per_second": 1560.09, "eval_steps_per_second": 32.509, "step": 58750 }, { "epoch": 22.658959537572255, "grad_norm": 0.34510278701782227, "learning_rate": 0.00010936570327552986, "loss": 0.3501720809936523, "step": 58800 }, { "epoch": 22.658959537572255, "eval_loss": 0.4196062982082367, "eval_runtime": 12.1518, "eval_samples_per_second": 1563.889, "eval_steps_per_second": 32.588, "step": 58800 }, { "epoch": 22.678227360308284, "grad_norm": 0.3680237829685211, "learning_rate": 0.00010928863198458575, "loss": 0.3588803100585938, "step": 58850 }, { "epoch": 22.678227360308284, "eval_loss": 0.40663087368011475, "eval_runtime": 12.1366, "eval_samples_per_second": 1565.839, "eval_steps_per_second": 32.629, "step": 58850 }, { "epoch": 22.697495183044317, "grad_norm": 0.32476091384887695, "learning_rate": 0.00010921156069364163, "loss": 0.35131206512451174, "step": 58900 }, { "epoch": 22.697495183044317, "eval_loss": 0.41374683380126953, "eval_runtime": 12.1735, "eval_samples_per_second": 1561.091, "eval_steps_per_second": 32.53, "step": 58900 }, { "epoch": 22.716763005780347, "grad_norm": 0.3621932566165924, "learning_rate": 0.0001091344894026975, "loss": 0.3503282928466797, "step": 58950 }, { "epoch": 22.716763005780347, "eval_loss": 0.4118322432041168, "eval_runtime": 12.2172, "eval_samples_per_second": 1555.508, "eval_steps_per_second": 32.413, "step": 58950 }, { "epoch": 22.736030828516377, "grad_norm": 0.3776865303516388, "learning_rate": 0.00010905741811175339, "loss": 0.3616694259643555, "step": 59000 }, { "epoch": 22.736030828516377, "eval_loss": 0.40332329273223877, "eval_runtime": 12.0357, "eval_samples_per_second": 1578.963, "eval_steps_per_second": 32.902, "step": 59000 }, { "epoch": 22.75529865125241, "grad_norm": 0.34310415387153625, "learning_rate": 0.00010898034682080927, "loss": 0.35982139587402345, "step": 59050 }, { "epoch": 22.75529865125241, "eval_loss": 0.4018932282924652, "eval_runtime": 12.0559, "eval_samples_per_second": 1576.329, "eval_steps_per_second": 32.847, "step": 59050 }, { "epoch": 22.77456647398844, "grad_norm": 0.3627042770385742, "learning_rate": 0.00010890327552986513, "loss": 0.35106544494628905, "step": 59100 }, { "epoch": 22.77456647398844, "eval_loss": 0.40938273072242737, "eval_runtime": 12.0014, "eval_samples_per_second": 1583.482, "eval_steps_per_second": 32.996, "step": 59100 }, { "epoch": 22.79383429672447, "grad_norm": 0.3856176733970642, "learning_rate": 0.000108826204238921, "loss": 0.3491558074951172, "step": 59150 }, { "epoch": 22.79383429672447, "eval_loss": 0.4129432737827301, "eval_runtime": 12.0352, "eval_samples_per_second": 1579.032, "eval_steps_per_second": 32.903, "step": 59150 }, { "epoch": 22.813102119460503, "grad_norm": 0.3632095158100128, "learning_rate": 0.00010874913294797689, "loss": 0.35542789459228513, "step": 59200 }, { "epoch": 22.813102119460503, "eval_loss": 0.41676846146583557, "eval_runtime": 12.0336, "eval_samples_per_second": 1579.246, "eval_steps_per_second": 32.908, "step": 59200 }, { "epoch": 22.832369942196532, "grad_norm": 0.4166209101676941, "learning_rate": 0.00010867206165703277, "loss": 0.34742504119873047, "step": 59250 }, { "epoch": 22.832369942196532, "eval_loss": 0.41570910811424255, "eval_runtime": 12.0328, "eval_samples_per_second": 1579.353, "eval_steps_per_second": 32.91, "step": 59250 }, { "epoch": 22.851637764932562, "grad_norm": 0.33289313316345215, "learning_rate": 0.00010859499036608864, "loss": 0.35378345489501956, "step": 59300 }, { "epoch": 22.851637764932562, "eval_loss": 0.3998374938964844, "eval_runtime": 12.0391, "eval_samples_per_second": 1578.526, "eval_steps_per_second": 32.893, "step": 59300 }, { "epoch": 22.870905587668595, "grad_norm": 0.414850115776062, "learning_rate": 0.0001085179190751445, "loss": 0.35326858520507814, "step": 59350 }, { "epoch": 22.870905587668595, "eval_loss": 0.4039616584777832, "eval_runtime": 12.0487, "eval_samples_per_second": 1577.272, "eval_steps_per_second": 32.867, "step": 59350 }, { "epoch": 22.890173410404625, "grad_norm": 0.33717823028564453, "learning_rate": 0.00010844084778420038, "loss": 0.3497649383544922, "step": 59400 }, { "epoch": 22.890173410404625, "eval_loss": 0.4120280146598816, "eval_runtime": 12.0601, "eval_samples_per_second": 1575.78, "eval_steps_per_second": 32.836, "step": 59400 }, { "epoch": 22.909441233140655, "grad_norm": 0.34781190752983093, "learning_rate": 0.00010836377649325627, "loss": 0.3546435546875, "step": 59450 }, { "epoch": 22.909441233140655, "eval_loss": 0.40326055884361267, "eval_runtime": 12.0868, "eval_samples_per_second": 1572.298, "eval_steps_per_second": 32.763, "step": 59450 }, { "epoch": 22.928709055876688, "grad_norm": 0.3078193664550781, "learning_rate": 0.00010828670520231214, "loss": 0.35411739349365234, "step": 59500 }, { "epoch": 22.928709055876688, "eval_loss": 0.40810883045196533, "eval_runtime": 12.0431, "eval_samples_per_second": 1577.993, "eval_steps_per_second": 32.882, "step": 59500 }, { "epoch": 22.947976878612717, "grad_norm": 0.3363012671470642, "learning_rate": 0.00010820963391136803, "loss": 0.3534751129150391, "step": 59550 }, { "epoch": 22.947976878612717, "eval_loss": 0.4077805280685425, "eval_runtime": 12.1694, "eval_samples_per_second": 1561.625, "eval_steps_per_second": 32.541, "step": 59550 }, { "epoch": 22.967244701348747, "grad_norm": 0.3743981719017029, "learning_rate": 0.00010813256262042391, "loss": 0.3427344512939453, "step": 59600 }, { "epoch": 22.967244701348747, "eval_loss": 0.40910038352012634, "eval_runtime": 12.1949, "eval_samples_per_second": 1558.358, "eval_steps_per_second": 32.473, "step": 59600 }, { "epoch": 22.986512524084777, "grad_norm": 0.3133869469165802, "learning_rate": 0.00010805549132947977, "loss": 0.3539696502685547, "step": 59650 }, { "epoch": 22.986512524084777, "eval_loss": 0.4112917482852936, "eval_runtime": 12.1823, "eval_samples_per_second": 1559.967, "eval_steps_per_second": 32.506, "step": 59650 }, { "epoch": 23.00578034682081, "grad_norm": 0.3596579134464264, "learning_rate": 0.00010797842003853564, "loss": 0.35101234436035156, "step": 59700 }, { "epoch": 23.00578034682081, "eval_loss": 0.40755409002304077, "eval_runtime": 12.1822, "eval_samples_per_second": 1559.981, "eval_steps_per_second": 32.506, "step": 59700 }, { "epoch": 23.02504816955684, "grad_norm": 0.3130947947502136, "learning_rate": 0.00010790134874759152, "loss": 0.3509973907470703, "step": 59750 }, { "epoch": 23.02504816955684, "eval_loss": 0.40819400548934937, "eval_runtime": 12.1566, "eval_samples_per_second": 1563.265, "eval_steps_per_second": 32.575, "step": 59750 }, { "epoch": 23.04431599229287, "grad_norm": 0.3759409189224243, "learning_rate": 0.00010782427745664741, "loss": 0.3529574203491211, "step": 59800 }, { "epoch": 23.04431599229287, "eval_loss": 0.41147273778915405, "eval_runtime": 12.1214, "eval_samples_per_second": 1567.809, "eval_steps_per_second": 32.67, "step": 59800 }, { "epoch": 23.063583815028903, "grad_norm": 0.3874574899673462, "learning_rate": 0.00010774720616570328, "loss": 0.35173385620117187, "step": 59850 }, { "epoch": 23.063583815028903, "eval_loss": 0.41608160734176636, "eval_runtime": 12.1774, "eval_samples_per_second": 1560.593, "eval_steps_per_second": 32.519, "step": 59850 }, { "epoch": 23.082851637764932, "grad_norm": 0.3146451711654663, "learning_rate": 0.00010767013487475917, "loss": 0.3486193084716797, "step": 59900 }, { "epoch": 23.082851637764932, "eval_loss": 0.40576478838920593, "eval_runtime": 12.0667, "eval_samples_per_second": 1574.912, "eval_steps_per_second": 32.818, "step": 59900 }, { "epoch": 23.102119460500962, "grad_norm": 0.3195267617702484, "learning_rate": 0.00010759306358381502, "loss": 0.3548238754272461, "step": 59950 }, { "epoch": 23.102119460500962, "eval_loss": 0.40514472126960754, "eval_runtime": 12.1911, "eval_samples_per_second": 1558.846, "eval_steps_per_second": 32.483, "step": 59950 }, { "epoch": 23.121387283236995, "grad_norm": 0.34261518716812134, "learning_rate": 0.00010751599229287091, "loss": 0.3526170349121094, "step": 60000 }, { "epoch": 23.121387283236995, "eval_loss": 0.4033450186252594, "eval_runtime": 12.0724, "eval_samples_per_second": 1574.172, "eval_steps_per_second": 32.802, "step": 60000 }, { "epoch": 23.140655105973025, "grad_norm": 0.3900609016418457, "learning_rate": 0.00010743892100192678, "loss": 0.3474480819702148, "step": 60050 }, { "epoch": 23.140655105973025, "eval_loss": 0.4075223505496979, "eval_runtime": 12.0698, "eval_samples_per_second": 1574.503, "eval_steps_per_second": 32.809, "step": 60050 }, { "epoch": 23.159922928709054, "grad_norm": 0.3460886776447296, "learning_rate": 0.00010736184971098266, "loss": 0.34912399291992186, "step": 60100 }, { "epoch": 23.159922928709054, "eval_loss": 0.4071314036846161, "eval_runtime": 12.0475, "eval_samples_per_second": 1577.422, "eval_steps_per_second": 32.87, "step": 60100 }, { "epoch": 23.179190751445088, "grad_norm": 0.5005828738212585, "learning_rate": 0.00010728477842003855, "loss": 0.35415550231933596, "step": 60150 }, { "epoch": 23.179190751445088, "eval_loss": 0.4171423017978668, "eval_runtime": 12.0471, "eval_samples_per_second": 1577.476, "eval_steps_per_second": 32.871, "step": 60150 }, { "epoch": 23.198458574181117, "grad_norm": 0.3352817893028259, "learning_rate": 0.00010720770712909442, "loss": 0.35041046142578125, "step": 60200 }, { "epoch": 23.198458574181117, "eval_loss": 0.41777968406677246, "eval_runtime": 12.056, "eval_samples_per_second": 1576.311, "eval_steps_per_second": 32.847, "step": 60200 }, { "epoch": 23.217726396917147, "grad_norm": 0.40860673785209656, "learning_rate": 0.00010713063583815028, "loss": 0.3515160369873047, "step": 60250 }, { "epoch": 23.217726396917147, "eval_loss": 0.41961580514907837, "eval_runtime": 12.0436, "eval_samples_per_second": 1577.932, "eval_steps_per_second": 32.881, "step": 60250 }, { "epoch": 23.23699421965318, "grad_norm": 0.32789912819862366, "learning_rate": 0.00010705356454720616, "loss": 0.3513436126708984, "step": 60300 }, { "epoch": 23.23699421965318, "eval_loss": 0.41032296419143677, "eval_runtime": 12.2058, "eval_samples_per_second": 1556.959, "eval_steps_per_second": 32.443, "step": 60300 }, { "epoch": 23.25626204238921, "grad_norm": 0.3454116880893707, "learning_rate": 0.00010697649325626205, "loss": 0.35038665771484373, "step": 60350 }, { "epoch": 23.25626204238921, "eval_loss": 0.40637508034706116, "eval_runtime": 12.1799, "eval_samples_per_second": 1560.275, "eval_steps_per_second": 32.513, "step": 60350 }, { "epoch": 23.27552986512524, "grad_norm": 0.38221532106399536, "learning_rate": 0.00010689942196531792, "loss": 0.34488372802734374, "step": 60400 }, { "epoch": 23.27552986512524, "eval_loss": 0.41049468517303467, "eval_runtime": 12.2223, "eval_samples_per_second": 1554.862, "eval_steps_per_second": 32.4, "step": 60400 }, { "epoch": 23.294797687861273, "grad_norm": 0.37112849950790405, "learning_rate": 0.0001068223506743738, "loss": 0.3512368011474609, "step": 60450 }, { "epoch": 23.294797687861273, "eval_loss": 0.4108507037162781, "eval_runtime": 12.1853, "eval_samples_per_second": 1559.578, "eval_steps_per_second": 32.498, "step": 60450 }, { "epoch": 23.314065510597302, "grad_norm": 0.35059770941734314, "learning_rate": 0.00010674527938342969, "loss": 0.3473938369750977, "step": 60500 }, { "epoch": 23.314065510597302, "eval_loss": 0.41285067796707153, "eval_runtime": 12.092, "eval_samples_per_second": 1571.621, "eval_steps_per_second": 32.749, "step": 60500 }, { "epoch": 23.333333333333332, "grad_norm": 0.37356486916542053, "learning_rate": 0.00010666820809248555, "loss": 0.35284095764160156, "step": 60550 }, { "epoch": 23.333333333333332, "eval_loss": 0.4136626422405243, "eval_runtime": 12.1205, "eval_samples_per_second": 1567.918, "eval_steps_per_second": 32.672, "step": 60550 }, { "epoch": 23.352601156069365, "grad_norm": 0.35061773657798767, "learning_rate": 0.00010659113680154142, "loss": 0.34540477752685544, "step": 60600 }, { "epoch": 23.352601156069365, "eval_loss": 0.4205920398235321, "eval_runtime": 12.1133, "eval_samples_per_second": 1568.855, "eval_steps_per_second": 32.691, "step": 60600 }, { "epoch": 23.371868978805395, "grad_norm": 0.38342800736427307, "learning_rate": 0.0001065140655105973, "loss": 0.3514550018310547, "step": 60650 }, { "epoch": 23.371868978805395, "eval_loss": 0.4093393385410309, "eval_runtime": 12.0469, "eval_samples_per_second": 1577.499, "eval_steps_per_second": 32.871, "step": 60650 }, { "epoch": 23.391136801541425, "grad_norm": 0.44266846776008606, "learning_rate": 0.00010643699421965319, "loss": 0.3541123962402344, "step": 60700 }, { "epoch": 23.391136801541425, "eval_loss": 0.41214337944984436, "eval_runtime": 12.0445, "eval_samples_per_second": 1577.813, "eval_steps_per_second": 32.878, "step": 60700 }, { "epoch": 23.410404624277458, "grad_norm": 0.3174605369567871, "learning_rate": 0.00010635992292870906, "loss": 0.3469362258911133, "step": 60750 }, { "epoch": 23.410404624277458, "eval_loss": 0.41267675161361694, "eval_runtime": 12.0408, "eval_samples_per_second": 1578.303, "eval_steps_per_second": 32.888, "step": 60750 }, { "epoch": 23.429672447013488, "grad_norm": 0.3664960265159607, "learning_rate": 0.00010628285163776494, "loss": 0.35413753509521484, "step": 60800 }, { "epoch": 23.429672447013488, "eval_loss": 0.4006960093975067, "eval_runtime": 12.0914, "eval_samples_per_second": 1571.692, "eval_steps_per_second": 32.75, "step": 60800 }, { "epoch": 23.448940269749517, "grad_norm": 0.3789653480052948, "learning_rate": 0.0001062057803468208, "loss": 0.34792404174804686, "step": 60850 }, { "epoch": 23.448940269749517, "eval_loss": 0.40449658036231995, "eval_runtime": 12.0356, "eval_samples_per_second": 1578.984, "eval_steps_per_second": 32.902, "step": 60850 }, { "epoch": 23.46820809248555, "grad_norm": 0.36309120059013367, "learning_rate": 0.00010612870905587669, "loss": 0.3514362335205078, "step": 60900 }, { "epoch": 23.46820809248555, "eval_loss": 0.4065251648426056, "eval_runtime": 12.0759, "eval_samples_per_second": 1573.716, "eval_steps_per_second": 32.793, "step": 60900 }, { "epoch": 23.48747591522158, "grad_norm": 0.3683059811592102, "learning_rate": 0.00010605163776493257, "loss": 0.34922191619873044, "step": 60950 }, { "epoch": 23.48747591522158, "eval_loss": 0.403415322303772, "eval_runtime": 12.0398, "eval_samples_per_second": 1578.434, "eval_steps_per_second": 32.891, "step": 60950 }, { "epoch": 23.50674373795761, "grad_norm": 0.3912409842014313, "learning_rate": 0.00010597456647398844, "loss": 0.35316810607910154, "step": 61000 }, { "epoch": 23.50674373795761, "eval_loss": 0.4099844694137573, "eval_runtime": 12.1867, "eval_samples_per_second": 1559.407, "eval_steps_per_second": 32.494, "step": 61000 }, { "epoch": 23.526011560693643, "grad_norm": 0.382951945066452, "learning_rate": 0.00010589749518304433, "loss": 0.34941436767578127, "step": 61050 }, { "epoch": 23.526011560693643, "eval_loss": 0.4059906601905823, "eval_runtime": 12.0638, "eval_samples_per_second": 1575.292, "eval_steps_per_second": 32.825, "step": 61050 }, { "epoch": 23.545279383429673, "grad_norm": 0.37281471490859985, "learning_rate": 0.00010582042389210021, "loss": 0.3490950775146484, "step": 61100 }, { "epoch": 23.545279383429673, "eval_loss": 0.4094561040401459, "eval_runtime": 12.051, "eval_samples_per_second": 1576.963, "eval_steps_per_second": 32.86, "step": 61100 }, { "epoch": 23.564547206165702, "grad_norm": 0.3439439833164215, "learning_rate": 0.00010574335260115607, "loss": 0.3506605529785156, "step": 61150 }, { "epoch": 23.564547206165702, "eval_loss": 0.41544365882873535, "eval_runtime": 12.0561, "eval_samples_per_second": 1576.304, "eval_steps_per_second": 32.847, "step": 61150 }, { "epoch": 23.583815028901736, "grad_norm": 0.3615610897541046, "learning_rate": 0.00010566628131021194, "loss": 0.35205577850341796, "step": 61200 }, { "epoch": 23.583815028901736, "eval_loss": 0.4033142030239105, "eval_runtime": 12.1871, "eval_samples_per_second": 1559.36, "eval_steps_per_second": 32.494, "step": 61200 }, { "epoch": 23.603082851637765, "grad_norm": 0.33835288882255554, "learning_rate": 0.00010558921001926783, "loss": 0.35257545471191404, "step": 61250 }, { "epoch": 23.603082851637765, "eval_loss": 0.40885159373283386, "eval_runtime": 12.2139, "eval_samples_per_second": 1555.926, "eval_steps_per_second": 32.422, "step": 61250 }, { "epoch": 23.622350674373795, "grad_norm": 0.35860639810562134, "learning_rate": 0.00010551213872832371, "loss": 0.3554397201538086, "step": 61300 }, { "epoch": 23.622350674373795, "eval_loss": 0.4095515012741089, "eval_runtime": 12.2283, "eval_samples_per_second": 1554.102, "eval_steps_per_second": 32.384, "step": 61300 }, { "epoch": 23.641618497109828, "grad_norm": 0.3424835503101349, "learning_rate": 0.00010543506743737958, "loss": 0.3503252410888672, "step": 61350 }, { "epoch": 23.641618497109828, "eval_loss": 0.41033169627189636, "eval_runtime": 12.2579, "eval_samples_per_second": 1550.348, "eval_steps_per_second": 32.306, "step": 61350 }, { "epoch": 23.660886319845858, "grad_norm": 0.32614532113075256, "learning_rate": 0.00010535799614643547, "loss": 0.3484902191162109, "step": 61400 }, { "epoch": 23.660886319845858, "eval_loss": 0.4085164964199066, "eval_runtime": 12.2199, "eval_samples_per_second": 1555.166, "eval_steps_per_second": 32.406, "step": 61400 }, { "epoch": 23.680154142581888, "grad_norm": 0.418131023645401, "learning_rate": 0.00010528092485549133, "loss": 0.34852134704589843, "step": 61450 }, { "epoch": 23.680154142581888, "eval_loss": 0.4104093611240387, "eval_runtime": 12.062, "eval_samples_per_second": 1575.521, "eval_steps_per_second": 32.83, "step": 61450 }, { "epoch": 23.69942196531792, "grad_norm": 0.3792790472507477, "learning_rate": 0.00010520385356454721, "loss": 0.3518054962158203, "step": 61500 }, { "epoch": 23.69942196531792, "eval_loss": 0.40683138370513916, "eval_runtime": 12.1662, "eval_samples_per_second": 1562.03, "eval_steps_per_second": 32.549, "step": 61500 }, { "epoch": 23.71868978805395, "grad_norm": 0.3249325454235077, "learning_rate": 0.00010512678227360308, "loss": 0.3512077713012695, "step": 61550 }, { "epoch": 23.71868978805395, "eval_loss": 0.40962016582489014, "eval_runtime": 12.171, "eval_samples_per_second": 1561.419, "eval_steps_per_second": 32.536, "step": 61550 }, { "epoch": 23.73795761078998, "grad_norm": 0.33552709221839905, "learning_rate": 0.00010504971098265897, "loss": 0.35368526458740235, "step": 61600 }, { "epoch": 23.73795761078998, "eval_loss": 0.4089512228965759, "eval_runtime": 12.235, "eval_samples_per_second": 1553.25, "eval_steps_per_second": 32.366, "step": 61600 }, { "epoch": 23.75722543352601, "grad_norm": 0.35097551345825195, "learning_rate": 0.00010497263969171485, "loss": 0.3535353851318359, "step": 61650 }, { "epoch": 23.75722543352601, "eval_loss": 0.4159090220928192, "eval_runtime": 12.1011, "eval_samples_per_second": 1570.439, "eval_steps_per_second": 32.724, "step": 61650 }, { "epoch": 23.776493256262043, "grad_norm": 0.33854639530181885, "learning_rate": 0.00010489556840077072, "loss": 0.354573974609375, "step": 61700 }, { "epoch": 23.776493256262043, "eval_loss": 0.40517112612724304, "eval_runtime": 12.2164, "eval_samples_per_second": 1555.612, "eval_steps_per_second": 32.415, "step": 61700 }, { "epoch": 23.795761078998073, "grad_norm": 0.38939011096954346, "learning_rate": 0.00010481849710982658, "loss": 0.3550876998901367, "step": 61750 }, { "epoch": 23.795761078998073, "eval_loss": 0.4203275442123413, "eval_runtime": 12.0811, "eval_samples_per_second": 1573.03, "eval_steps_per_second": 32.778, "step": 61750 }, { "epoch": 23.815028901734102, "grad_norm": 0.3803982436656952, "learning_rate": 0.00010474142581888247, "loss": 0.34934303283691404, "step": 61800 }, { "epoch": 23.815028901734102, "eval_loss": 0.41927140951156616, "eval_runtime": 12.171, "eval_samples_per_second": 1561.414, "eval_steps_per_second": 32.536, "step": 61800 }, { "epoch": 23.834296724470136, "grad_norm": 0.3489030599594116, "learning_rate": 0.00010466435452793835, "loss": 0.35281585693359374, "step": 61850 }, { "epoch": 23.834296724470136, "eval_loss": 0.42335131764411926, "eval_runtime": 12.169, "eval_samples_per_second": 1561.668, "eval_steps_per_second": 32.542, "step": 61850 }, { "epoch": 23.853564547206165, "grad_norm": 0.33045902848243713, "learning_rate": 0.00010458728323699422, "loss": 0.3480122756958008, "step": 61900 }, { "epoch": 23.853564547206165, "eval_loss": 0.4106775224208832, "eval_runtime": 12.1159, "eval_samples_per_second": 1568.516, "eval_steps_per_second": 32.684, "step": 61900 }, { "epoch": 23.872832369942195, "grad_norm": 0.34234416484832764, "learning_rate": 0.00010451021194605011, "loss": 0.3510921859741211, "step": 61950 }, { "epoch": 23.872832369942195, "eval_loss": 0.41417914628982544, "eval_runtime": 12.2283, "eval_samples_per_second": 1554.096, "eval_steps_per_second": 32.384, "step": 61950 }, { "epoch": 23.892100192678228, "grad_norm": 0.3353492319583893, "learning_rate": 0.00010443314065510599, "loss": 0.3491094207763672, "step": 62000 }, { "epoch": 23.892100192678228, "eval_loss": 0.4144243001937866, "eval_runtime": 12.1621, "eval_samples_per_second": 1562.559, "eval_steps_per_second": 32.56, "step": 62000 }, { "epoch": 23.911368015414258, "grad_norm": 0.31985944509506226, "learning_rate": 0.00010435606936416185, "loss": 0.35328155517578125, "step": 62050 }, { "epoch": 23.911368015414258, "eval_loss": 0.41155320405960083, "eval_runtime": 12.0656, "eval_samples_per_second": 1575.055, "eval_steps_per_second": 32.821, "step": 62050 }, { "epoch": 23.930635838150287, "grad_norm": 0.3870062530040741, "learning_rate": 0.00010427899807321772, "loss": 0.35329078674316405, "step": 62100 }, { "epoch": 23.930635838150287, "eval_loss": 0.4072156250476837, "eval_runtime": 12.1652, "eval_samples_per_second": 1562.165, "eval_steps_per_second": 32.552, "step": 62100 }, { "epoch": 23.94990366088632, "grad_norm": 0.3739334046840668, "learning_rate": 0.0001042019267822736, "loss": 0.3487839126586914, "step": 62150 }, { "epoch": 23.94990366088632, "eval_loss": 0.41181522607803345, "eval_runtime": 12.1631, "eval_samples_per_second": 1562.428, "eval_steps_per_second": 32.557, "step": 62150 }, { "epoch": 23.96917148362235, "grad_norm": 0.37903892993927, "learning_rate": 0.00010412485549132949, "loss": 0.3489336395263672, "step": 62200 }, { "epoch": 23.96917148362235, "eval_loss": 0.41383618116378784, "eval_runtime": 12.0883, "eval_samples_per_second": 1572.105, "eval_steps_per_second": 32.759, "step": 62200 }, { "epoch": 23.98843930635838, "grad_norm": 0.5038950443267822, "learning_rate": 0.00010404778420038536, "loss": 0.35565624237060545, "step": 62250 }, { "epoch": 23.98843930635838, "eval_loss": 0.41118401288986206, "eval_runtime": 12.0664, "eval_samples_per_second": 1574.953, "eval_steps_per_second": 32.818, "step": 62250 }, { "epoch": 24.007707129094413, "grad_norm": 0.3460756838321686, "learning_rate": 0.00010397071290944125, "loss": 0.3487716293334961, "step": 62300 }, { "epoch": 24.007707129094413, "eval_loss": 0.4045054018497467, "eval_runtime": 12.1802, "eval_samples_per_second": 1560.236, "eval_steps_per_second": 32.512, "step": 62300 }, { "epoch": 24.026974951830443, "grad_norm": 0.36613109707832336, "learning_rate": 0.0001038936416184971, "loss": 0.34858959197998046, "step": 62350 }, { "epoch": 24.026974951830443, "eval_loss": 0.41622206568717957, "eval_runtime": 12.1457, "eval_samples_per_second": 1564.672, "eval_steps_per_second": 32.604, "step": 62350 }, { "epoch": 24.046242774566473, "grad_norm": 0.3434009552001953, "learning_rate": 0.00010381657032755299, "loss": 0.35155296325683594, "step": 62400 }, { "epoch": 24.046242774566473, "eval_loss": 0.4168601632118225, "eval_runtime": 12.1865, "eval_samples_per_second": 1559.426, "eval_steps_per_second": 32.495, "step": 62400 }, { "epoch": 24.065510597302506, "grad_norm": 0.39586398005485535, "learning_rate": 0.00010373949903660886, "loss": 0.34975151062011717, "step": 62450 }, { "epoch": 24.065510597302506, "eval_loss": 0.41099074482917786, "eval_runtime": 12.1995, "eval_samples_per_second": 1557.763, "eval_steps_per_second": 32.46, "step": 62450 }, { "epoch": 24.084778420038536, "grad_norm": 0.377571165561676, "learning_rate": 0.00010366242774566475, "loss": 0.35281570434570314, "step": 62500 }, { "epoch": 24.084778420038536, "eval_loss": 0.41579896211624146, "eval_runtime": 12.1785, "eval_samples_per_second": 1560.45, "eval_steps_per_second": 32.516, "step": 62500 }, { "epoch": 24.104046242774565, "grad_norm": 0.3773937225341797, "learning_rate": 0.00010358535645472063, "loss": 0.3548154449462891, "step": 62550 }, { "epoch": 24.104046242774565, "eval_loss": 0.4086999297142029, "eval_runtime": 12.1706, "eval_samples_per_second": 1561.468, "eval_steps_per_second": 32.537, "step": 62550 }, { "epoch": 24.1233140655106, "grad_norm": 0.34447112679481506, "learning_rate": 0.0001035082851637765, "loss": 0.3524297332763672, "step": 62600 }, { "epoch": 24.1233140655106, "eval_loss": 0.4054974317550659, "eval_runtime": 12.17, "eval_samples_per_second": 1561.541, "eval_steps_per_second": 32.539, "step": 62600 }, { "epoch": 24.142581888246628, "grad_norm": 0.3436383008956909, "learning_rate": 0.00010343121387283236, "loss": 0.34782440185546876, "step": 62650 }, { "epoch": 24.142581888246628, "eval_loss": 0.41087469458580017, "eval_runtime": 12.1741, "eval_samples_per_second": 1561.015, "eval_steps_per_second": 32.528, "step": 62650 }, { "epoch": 24.161849710982658, "grad_norm": 0.29823189973831177, "learning_rate": 0.00010335414258188825, "loss": 0.3434149932861328, "step": 62700 }, { "epoch": 24.161849710982658, "eval_loss": 0.40747517347335815, "eval_runtime": 12.2317, "eval_samples_per_second": 1553.673, "eval_steps_per_second": 32.375, "step": 62700 }, { "epoch": 24.18111753371869, "grad_norm": 0.3250676095485687, "learning_rate": 0.00010327707129094413, "loss": 0.35359066009521484, "step": 62750 }, { "epoch": 24.18111753371869, "eval_loss": 0.41347071528434753, "eval_runtime": 12.0729, "eval_samples_per_second": 1574.11, "eval_steps_per_second": 32.801, "step": 62750 }, { "epoch": 24.20038535645472, "grad_norm": 0.340562641620636, "learning_rate": 0.0001032, "loss": 0.3473960876464844, "step": 62800 }, { "epoch": 24.20038535645472, "eval_loss": 0.4059213697910309, "eval_runtime": 12.1495, "eval_samples_per_second": 1564.185, "eval_steps_per_second": 32.594, "step": 62800 }, { "epoch": 24.21965317919075, "grad_norm": 0.430858314037323, "learning_rate": 0.00010312292870905589, "loss": 0.3534341049194336, "step": 62850 }, { "epoch": 24.21965317919075, "eval_loss": 0.40964475274086, "eval_runtime": 12.0564, "eval_samples_per_second": 1576.256, "eval_steps_per_second": 32.846, "step": 62850 }, { "epoch": 24.238921001926784, "grad_norm": 0.3848319947719574, "learning_rate": 0.00010304585741811177, "loss": 0.3489474105834961, "step": 62900 }, { "epoch": 24.238921001926784, "eval_loss": 0.40893056988716125, "eval_runtime": 12.1382, "eval_samples_per_second": 1565.632, "eval_steps_per_second": 32.624, "step": 62900 }, { "epoch": 24.258188824662813, "grad_norm": 0.4098030626773834, "learning_rate": 0.00010296878612716763, "loss": 0.3482639312744141, "step": 62950 }, { "epoch": 24.258188824662813, "eval_loss": 0.4087488353252411, "eval_runtime": 12.0562, "eval_samples_per_second": 1576.289, "eval_steps_per_second": 32.846, "step": 62950 }, { "epoch": 24.277456647398843, "grad_norm": 0.32946595549583435, "learning_rate": 0.00010289171483622351, "loss": 0.3449751663208008, "step": 63000 }, { "epoch": 24.277456647398843, "eval_loss": 0.41364744305610657, "eval_runtime": 12.0926, "eval_samples_per_second": 1571.537, "eval_steps_per_second": 32.747, "step": 63000 }, { "epoch": 24.296724470134876, "grad_norm": 0.34898415207862854, "learning_rate": 0.00010281464354527939, "loss": 0.3479450225830078, "step": 63050 }, { "epoch": 24.296724470134876, "eval_loss": 0.410478800535202, "eval_runtime": 12.0902, "eval_samples_per_second": 1571.854, "eval_steps_per_second": 32.754, "step": 63050 }, { "epoch": 24.315992292870906, "grad_norm": 0.3424381911754608, "learning_rate": 0.00010273757225433527, "loss": 0.34894302368164065, "step": 63100 }, { "epoch": 24.315992292870906, "eval_loss": 0.41415801644325256, "eval_runtime": 12.0686, "eval_samples_per_second": 1574.662, "eval_steps_per_second": 32.812, "step": 63100 }, { "epoch": 24.335260115606935, "grad_norm": 0.4101783335208893, "learning_rate": 0.00010266050096339115, "loss": 0.351604118347168, "step": 63150 }, { "epoch": 24.335260115606935, "eval_loss": 0.41115692257881165, "eval_runtime": 12.0613, "eval_samples_per_second": 1575.617, "eval_steps_per_second": 32.832, "step": 63150 }, { "epoch": 24.35452793834297, "grad_norm": 0.30466365814208984, "learning_rate": 0.00010258342967244703, "loss": 0.3464344024658203, "step": 63200 }, { "epoch": 24.35452793834297, "eval_loss": 0.40555182099342346, "eval_runtime": 12.0623, "eval_samples_per_second": 1575.487, "eval_steps_per_second": 32.83, "step": 63200 }, { "epoch": 24.373795761079, "grad_norm": 0.34591084718704224, "learning_rate": 0.00010250635838150288, "loss": 0.351650390625, "step": 63250 }, { "epoch": 24.373795761079, "eval_loss": 0.41807904839515686, "eval_runtime": 12.0885, "eval_samples_per_second": 1572.072, "eval_steps_per_second": 32.758, "step": 63250 }, { "epoch": 24.393063583815028, "grad_norm": 0.3728994131088257, "learning_rate": 0.00010242928709055877, "loss": 0.3518779754638672, "step": 63300 }, { "epoch": 24.393063583815028, "eval_loss": 0.40829789638519287, "eval_runtime": 12.0594, "eval_samples_per_second": 1575.871, "eval_steps_per_second": 32.838, "step": 63300 }, { "epoch": 24.41233140655106, "grad_norm": 0.39991042017936707, "learning_rate": 0.00010235221579961465, "loss": 0.34751205444335936, "step": 63350 }, { "epoch": 24.41233140655106, "eval_loss": 0.41172951459884644, "eval_runtime": 12.0653, "eval_samples_per_second": 1575.098, "eval_steps_per_second": 32.821, "step": 63350 }, { "epoch": 24.43159922928709, "grad_norm": 0.3659707009792328, "learning_rate": 0.00010227514450867053, "loss": 0.3498591995239258, "step": 63400 }, { "epoch": 24.43159922928709, "eval_loss": 0.4152224063873291, "eval_runtime": 12.0699, "eval_samples_per_second": 1574.49, "eval_steps_per_second": 32.809, "step": 63400 }, { "epoch": 24.45086705202312, "grad_norm": 0.3274148106575012, "learning_rate": 0.00010219807321772641, "loss": 0.3470821762084961, "step": 63450 }, { "epoch": 24.45086705202312, "eval_loss": 0.4102877080440521, "eval_runtime": 12.1097, "eval_samples_per_second": 1569.315, "eval_steps_per_second": 32.701, "step": 63450 }, { "epoch": 24.470134874759154, "grad_norm": 0.29690924286842346, "learning_rate": 0.0001021210019267823, "loss": 0.3516875457763672, "step": 63500 }, { "epoch": 24.470134874759154, "eval_loss": 0.4088464379310608, "eval_runtime": 12.1141, "eval_samples_per_second": 1568.745, "eval_steps_per_second": 32.689, "step": 63500 }, { "epoch": 24.489402697495184, "grad_norm": 0.3652198612689972, "learning_rate": 0.00010204393063583815, "loss": 0.34995235443115236, "step": 63550 }, { "epoch": 24.489402697495184, "eval_loss": 0.4099659323692322, "eval_runtime": 12.1396, "eval_samples_per_second": 1565.452, "eval_steps_per_second": 32.62, "step": 63550 }, { "epoch": 24.508670520231213, "grad_norm": 0.33390653133392334, "learning_rate": 0.00010196685934489402, "loss": 0.35025604248046877, "step": 63600 }, { "epoch": 24.508670520231213, "eval_loss": 0.42324569821357727, "eval_runtime": 12.08, "eval_samples_per_second": 1573.184, "eval_steps_per_second": 32.782, "step": 63600 }, { "epoch": 24.527938342967246, "grad_norm": 0.3562656342983246, "learning_rate": 0.00010188978805394991, "loss": 0.34678890228271486, "step": 63650 }, { "epoch": 24.527938342967246, "eval_loss": 0.4082246422767639, "eval_runtime": 12.2482, "eval_samples_per_second": 1551.578, "eval_steps_per_second": 32.331, "step": 63650 }, { "epoch": 24.547206165703276, "grad_norm": 0.39507919549942017, "learning_rate": 0.0001018127167630058, "loss": 0.3533937072753906, "step": 63700 }, { "epoch": 24.547206165703276, "eval_loss": 0.42192187905311584, "eval_runtime": 12.159, "eval_samples_per_second": 1562.953, "eval_steps_per_second": 32.568, "step": 63700 }, { "epoch": 24.566473988439306, "grad_norm": 0.3346495032310486, "learning_rate": 0.00010173564547206167, "loss": 0.3499335479736328, "step": 63750 }, { "epoch": 24.566473988439306, "eval_loss": 0.4140526354312897, "eval_runtime": 12.2241, "eval_samples_per_second": 1554.636, "eval_steps_per_second": 32.395, "step": 63750 }, { "epoch": 24.58574181117534, "grad_norm": 0.36414921283721924, "learning_rate": 0.00010165857418111755, "loss": 0.34377273559570315, "step": 63800 }, { "epoch": 24.58574181117534, "eval_loss": 0.41068190336227417, "eval_runtime": 12.0935, "eval_samples_per_second": 1571.427, "eval_steps_per_second": 32.745, "step": 63800 }, { "epoch": 24.60500963391137, "grad_norm": 0.42442458868026733, "learning_rate": 0.00010158150289017341, "loss": 0.3490145492553711, "step": 63850 }, { "epoch": 24.60500963391137, "eval_loss": 0.4074975550174713, "eval_runtime": 12.0594, "eval_samples_per_second": 1575.865, "eval_steps_per_second": 32.837, "step": 63850 }, { "epoch": 24.6242774566474, "grad_norm": 0.31182676553726196, "learning_rate": 0.00010150443159922929, "loss": 0.34147933959960936, "step": 63900 }, { "epoch": 24.6242774566474, "eval_loss": 0.41059648990631104, "eval_runtime": 12.0815, "eval_samples_per_second": 1572.981, "eval_steps_per_second": 32.777, "step": 63900 }, { "epoch": 24.643545279383428, "grad_norm": 0.3590845763683319, "learning_rate": 0.00010142736030828516, "loss": 0.34855644226074217, "step": 63950 }, { "epoch": 24.643545279383428, "eval_loss": 0.41199395060539246, "eval_runtime": 12.1505, "eval_samples_per_second": 1564.056, "eval_steps_per_second": 32.591, "step": 63950 }, { "epoch": 24.66281310211946, "grad_norm": 0.3353289067745209, "learning_rate": 0.00010135028901734105, "loss": 0.35136421203613283, "step": 64000 }, { "epoch": 24.66281310211946, "eval_loss": 0.4149375855922699, "eval_runtime": 12.1466, "eval_samples_per_second": 1564.559, "eval_steps_per_second": 32.602, "step": 64000 }, { "epoch": 24.68208092485549, "grad_norm": 0.3711238205432892, "learning_rate": 0.00010127321772639693, "loss": 0.3459600830078125, "step": 64050 }, { "epoch": 24.68208092485549, "eval_loss": 0.41317418217658997, "eval_runtime": 12.07, "eval_samples_per_second": 1574.488, "eval_steps_per_second": 32.809, "step": 64050 }, { "epoch": 24.70134874759152, "grad_norm": 0.39010554552078247, "learning_rate": 0.00010119614643545279, "loss": 0.3473027801513672, "step": 64100 }, { "epoch": 24.70134874759152, "eval_loss": 0.39897286891937256, "eval_runtime": 12.075, "eval_samples_per_second": 1573.832, "eval_steps_per_second": 32.795, "step": 64100 }, { "epoch": 24.720616570327554, "grad_norm": 0.38296985626220703, "learning_rate": 0.00010111907514450866, "loss": 0.3462000274658203, "step": 64150 }, { "epoch": 24.720616570327554, "eval_loss": 0.4110531508922577, "eval_runtime": 12.0502, "eval_samples_per_second": 1577.065, "eval_steps_per_second": 32.862, "step": 64150 }, { "epoch": 24.739884393063583, "grad_norm": 0.36965078115463257, "learning_rate": 0.00010104200385356455, "loss": 0.34781890869140625, "step": 64200 }, { "epoch": 24.739884393063583, "eval_loss": 0.40421584248542786, "eval_runtime": 12.1103, "eval_samples_per_second": 1569.242, "eval_steps_per_second": 32.699, "step": 64200 }, { "epoch": 24.759152215799613, "grad_norm": 0.32775846123695374, "learning_rate": 0.00010096493256262043, "loss": 0.3455362319946289, "step": 64250 }, { "epoch": 24.759152215799613, "eval_loss": 0.4005540907382965, "eval_runtime": 12.0621, "eval_samples_per_second": 1575.516, "eval_steps_per_second": 32.83, "step": 64250 }, { "epoch": 24.778420038535646, "grad_norm": 0.32294222712516785, "learning_rate": 0.0001008878612716763, "loss": 0.3503965759277344, "step": 64300 }, { "epoch": 24.778420038535646, "eval_loss": 0.4121895432472229, "eval_runtime": 12.1412, "eval_samples_per_second": 1565.249, "eval_steps_per_second": 32.616, "step": 64300 }, { "epoch": 24.797687861271676, "grad_norm": 0.3684018850326538, "learning_rate": 0.00010081078998073219, "loss": 0.35065010070800784, "step": 64350 }, { "epoch": 24.797687861271676, "eval_loss": 0.3981652855873108, "eval_runtime": 12.0677, "eval_samples_per_second": 1574.783, "eval_steps_per_second": 32.815, "step": 64350 }, { "epoch": 24.816955684007706, "grad_norm": 0.35019463300704956, "learning_rate": 0.00010073371868978805, "loss": 0.34544246673583984, "step": 64400 }, { "epoch": 24.816955684007706, "eval_loss": 0.40511438250541687, "eval_runtime": 12.0551, "eval_samples_per_second": 1576.432, "eval_steps_per_second": 32.849, "step": 64400 }, { "epoch": 24.83622350674374, "grad_norm": 0.3620331287384033, "learning_rate": 0.00010065664739884393, "loss": 0.34531822204589846, "step": 64450 }, { "epoch": 24.83622350674374, "eval_loss": 0.40703538060188293, "eval_runtime": 12.0916, "eval_samples_per_second": 1571.663, "eval_steps_per_second": 32.75, "step": 64450 }, { "epoch": 24.85549132947977, "grad_norm": 0.3639388084411621, "learning_rate": 0.0001005795761078998, "loss": 0.3493729400634766, "step": 64500 }, { "epoch": 24.85549132947977, "eval_loss": 0.40925300121307373, "eval_runtime": 12.2229, "eval_samples_per_second": 1554.785, "eval_steps_per_second": 32.398, "step": 64500 }, { "epoch": 24.8747591522158, "grad_norm": 0.43527737259864807, "learning_rate": 0.00010050250481695569, "loss": 0.3504779052734375, "step": 64550 }, { "epoch": 24.8747591522158, "eval_loss": 0.39673754572868347, "eval_runtime": 12.1523, "eval_samples_per_second": 1563.822, "eval_steps_per_second": 32.586, "step": 64550 }, { "epoch": 24.89402697495183, "grad_norm": 0.2810731828212738, "learning_rate": 0.00010042543352601157, "loss": 0.34924423217773437, "step": 64600 }, { "epoch": 24.89402697495183, "eval_loss": 0.3999762237071991, "eval_runtime": 12.2435, "eval_samples_per_second": 1552.171, "eval_steps_per_second": 32.344, "step": 64600 }, { "epoch": 24.91329479768786, "grad_norm": 0.3468506932258606, "learning_rate": 0.00010034836223506744, "loss": 0.3512066650390625, "step": 64650 }, { "epoch": 24.91329479768786, "eval_loss": 0.40566393733024597, "eval_runtime": 12.2122, "eval_samples_per_second": 1556.146, "eval_steps_per_second": 32.427, "step": 64650 }, { "epoch": 24.93256262042389, "grad_norm": 0.3324216902256012, "learning_rate": 0.0001002712909441233, "loss": 0.34848953247070313, "step": 64700 }, { "epoch": 24.93256262042389, "eval_loss": 0.40167108178138733, "eval_runtime": 12.2186, "eval_samples_per_second": 1555.33, "eval_steps_per_second": 32.41, "step": 64700 }, { "epoch": 24.951830443159924, "grad_norm": 0.35427695512771606, "learning_rate": 0.00010019421965317919, "loss": 0.3491028594970703, "step": 64750 }, { "epoch": 24.951830443159924, "eval_loss": 0.409657746553421, "eval_runtime": 12.2453, "eval_samples_per_second": 1551.942, "eval_steps_per_second": 32.339, "step": 64750 }, { "epoch": 24.971098265895954, "grad_norm": 0.3715837895870209, "learning_rate": 0.00010011714836223507, "loss": 0.34314651489257814, "step": 64800 }, { "epoch": 24.971098265895954, "eval_loss": 0.39905592799186707, "eval_runtime": 12.2374, "eval_samples_per_second": 1552.941, "eval_steps_per_second": 32.36, "step": 64800 }, { "epoch": 24.990366088631983, "grad_norm": 0.3067304790019989, "learning_rate": 0.00010004007707129094, "loss": 0.34837371826171876, "step": 64850 }, { "epoch": 24.990366088631983, "eval_loss": 0.41085150837898254, "eval_runtime": 12.2333, "eval_samples_per_second": 1553.468, "eval_steps_per_second": 32.371, "step": 64850 }, { "epoch": 25.009633911368017, "grad_norm": 0.37883153557777405, "learning_rate": 9.996300578034683e-05, "loss": 0.34394447326660155, "step": 64900 }, { "epoch": 25.009633911368017, "eval_loss": 0.40679866075515747, "eval_runtime": 12.2315, "eval_samples_per_second": 1553.698, "eval_steps_per_second": 32.376, "step": 64900 }, { "epoch": 25.028901734104046, "grad_norm": 0.4482385516166687, "learning_rate": 9.98859344894027e-05, "loss": 0.35288314819335936, "step": 64950 }, { "epoch": 25.028901734104046, "eval_loss": 0.41076239943504333, "eval_runtime": 12.2188, "eval_samples_per_second": 1555.306, "eval_steps_per_second": 32.409, "step": 64950 }, { "epoch": 25.048169556840076, "grad_norm": 0.3832108974456787, "learning_rate": 9.980886319845858e-05, "loss": 0.3470149993896484, "step": 65000 }, { "epoch": 25.048169556840076, "eval_loss": 0.4051113426685333, "eval_runtime": 12.2552, "eval_samples_per_second": 1550.694, "eval_steps_per_second": 32.313, "step": 65000 }, { "epoch": 25.06743737957611, "grad_norm": 0.35983550548553467, "learning_rate": 9.973179190751446e-05, "loss": 0.3451959228515625, "step": 65050 }, { "epoch": 25.06743737957611, "eval_loss": 0.40096601843833923, "eval_runtime": 12.2492, "eval_samples_per_second": 1551.453, "eval_steps_per_second": 32.329, "step": 65050 }, { "epoch": 25.08670520231214, "grad_norm": 0.33026668429374695, "learning_rate": 9.965472061657033e-05, "loss": 0.3459891128540039, "step": 65100 }, { "epoch": 25.08670520231214, "eval_loss": 0.4032179117202759, "eval_runtime": 12.1281, "eval_samples_per_second": 1566.943, "eval_steps_per_second": 32.652, "step": 65100 }, { "epoch": 25.10597302504817, "grad_norm": 0.39518117904663086, "learning_rate": 9.957764932562621e-05, "loss": 0.34884990692138673, "step": 65150 }, { "epoch": 25.10597302504817, "eval_loss": 0.4026787281036377, "eval_runtime": 12.1533, "eval_samples_per_second": 1563.696, "eval_steps_per_second": 32.584, "step": 65150 }, { "epoch": 25.1252408477842, "grad_norm": 0.34151625633239746, "learning_rate": 9.95005780346821e-05, "loss": 0.3462157440185547, "step": 65200 }, { "epoch": 25.1252408477842, "eval_loss": 0.39342257380485535, "eval_runtime": 12.256, "eval_samples_per_second": 1550.586, "eval_steps_per_second": 32.311, "step": 65200 }, { "epoch": 25.14450867052023, "grad_norm": 0.3752110004425049, "learning_rate": 9.942350674373795e-05, "loss": 0.34774940490722656, "step": 65250 }, { "epoch": 25.14450867052023, "eval_loss": 0.40046730637550354, "eval_runtime": 12.0941, "eval_samples_per_second": 1571.351, "eval_steps_per_second": 32.743, "step": 65250 }, { "epoch": 25.16377649325626, "grad_norm": 0.3122682273387909, "learning_rate": 9.934643545279384e-05, "loss": 0.34490638732910156, "step": 65300 }, { "epoch": 25.16377649325626, "eval_loss": 0.40186527371406555, "eval_runtime": 12.1792, "eval_samples_per_second": 1560.366, "eval_steps_per_second": 32.514, "step": 65300 }, { "epoch": 25.183044315992294, "grad_norm": 0.3223252594470978, "learning_rate": 9.926936416184972e-05, "loss": 0.3460881805419922, "step": 65350 }, { "epoch": 25.183044315992294, "eval_loss": 0.40696483850479126, "eval_runtime": 12.2216, "eval_samples_per_second": 1554.949, "eval_steps_per_second": 32.402, "step": 65350 }, { "epoch": 25.202312138728324, "grad_norm": 0.3098030388355255, "learning_rate": 9.91922928709056e-05, "loss": 0.34744590759277344, "step": 65400 }, { "epoch": 25.202312138728324, "eval_loss": 0.3988608717918396, "eval_runtime": 12.2011, "eval_samples_per_second": 1557.568, "eval_steps_per_second": 32.456, "step": 65400 }, { "epoch": 25.221579961464354, "grad_norm": 0.3769843876361847, "learning_rate": 9.911522157996147e-05, "loss": 0.34434085845947265, "step": 65450 }, { "epoch": 25.221579961464354, "eval_loss": 0.4118606746196747, "eval_runtime": 12.2191, "eval_samples_per_second": 1555.264, "eval_steps_per_second": 32.408, "step": 65450 }, { "epoch": 25.240847784200387, "grad_norm": 0.3276320993900299, "learning_rate": 9.903815028901735e-05, "loss": 0.34209400177001953, "step": 65500 }, { "epoch": 25.240847784200387, "eval_loss": 0.406976580619812, "eval_runtime": 12.2508, "eval_samples_per_second": 1551.252, "eval_steps_per_second": 32.325, "step": 65500 }, { "epoch": 25.260115606936417, "grad_norm": 0.3770756423473358, "learning_rate": 9.896107899807322e-05, "loss": 0.34366424560546877, "step": 65550 }, { "epoch": 25.260115606936417, "eval_loss": 0.39966723322868347, "eval_runtime": 12.2842, "eval_samples_per_second": 1547.024, "eval_steps_per_second": 32.236, "step": 65550 }, { "epoch": 25.279383429672446, "grad_norm": 0.3339482843875885, "learning_rate": 9.88840077071291e-05, "loss": 0.34838504791259767, "step": 65600 }, { "epoch": 25.279383429672446, "eval_loss": 0.4051201343536377, "eval_runtime": 12.2335, "eval_samples_per_second": 1553.437, "eval_steps_per_second": 32.37, "step": 65600 }, { "epoch": 25.29865125240848, "grad_norm": 0.34588176012039185, "learning_rate": 9.880693641618498e-05, "loss": 0.34776092529296876, "step": 65650 }, { "epoch": 25.29865125240848, "eval_loss": 0.4075586795806885, "eval_runtime": 12.2372, "eval_samples_per_second": 1552.964, "eval_steps_per_second": 32.36, "step": 65650 }, { "epoch": 25.31791907514451, "grad_norm": 0.39745786786079407, "learning_rate": 9.872986512524085e-05, "loss": 0.35218276977539065, "step": 65700 }, { "epoch": 25.31791907514451, "eval_loss": 0.41129904985427856, "eval_runtime": 12.3209, "eval_samples_per_second": 1542.417, "eval_steps_per_second": 32.14, "step": 65700 }, { "epoch": 25.33718689788054, "grad_norm": 0.3867224454879761, "learning_rate": 9.865279383429674e-05, "loss": 0.3489298629760742, "step": 65750 }, { "epoch": 25.33718689788054, "eval_loss": 0.4082324206829071, "eval_runtime": 12.3883, "eval_samples_per_second": 1534.029, "eval_steps_per_second": 31.966, "step": 65750 }, { "epoch": 25.356454720616572, "grad_norm": 0.31176891922950745, "learning_rate": 9.857572254335261e-05, "loss": 0.34313812255859377, "step": 65800 }, { "epoch": 25.356454720616572, "eval_loss": 0.4092969000339508, "eval_runtime": 12.3171, "eval_samples_per_second": 1542.9, "eval_steps_per_second": 32.151, "step": 65800 }, { "epoch": 25.3757225433526, "grad_norm": 0.31245580315589905, "learning_rate": 9.849865125240848e-05, "loss": 0.33924552917480466, "step": 65850 }, { "epoch": 25.3757225433526, "eval_loss": 0.4011678695678711, "eval_runtime": 12.086, "eval_samples_per_second": 1572.396, "eval_steps_per_second": 32.765, "step": 65850 }, { "epoch": 25.39499036608863, "grad_norm": 0.37746283411979675, "learning_rate": 9.842157996146436e-05, "loss": 0.3456719207763672, "step": 65900 }, { "epoch": 25.39499036608863, "eval_loss": 0.40384966135025024, "eval_runtime": 12.0952, "eval_samples_per_second": 1571.199, "eval_steps_per_second": 32.74, "step": 65900 }, { "epoch": 25.41425818882466, "grad_norm": 0.37617674469947815, "learning_rate": 9.834450867052023e-05, "loss": 0.34294532775878905, "step": 65950 }, { "epoch": 25.41425818882466, "eval_loss": 0.41134217381477356, "eval_runtime": 12.196, "eval_samples_per_second": 1558.216, "eval_steps_per_second": 32.47, "step": 65950 }, { "epoch": 25.433526011560694, "grad_norm": 0.3287246823310852, "learning_rate": 9.82674373795761e-05, "loss": 0.34438758850097656, "step": 66000 }, { "epoch": 25.433526011560694, "eval_loss": 0.405550479888916, "eval_runtime": 12.1272, "eval_samples_per_second": 1567.053, "eval_steps_per_second": 32.654, "step": 66000 }, { "epoch": 25.452793834296724, "grad_norm": 0.32533401250839233, "learning_rate": 9.819036608863199e-05, "loss": 0.34892280578613283, "step": 66050 }, { "epoch": 25.452793834296724, "eval_loss": 0.4018882215023041, "eval_runtime": 12.1881, "eval_samples_per_second": 1559.225, "eval_steps_per_second": 32.491, "step": 66050 }, { "epoch": 25.472061657032754, "grad_norm": 0.37050366401672363, "learning_rate": 9.811329479768788e-05, "loss": 0.34835601806640626, "step": 66100 }, { "epoch": 25.472061657032754, "eval_loss": 0.4043917953968048, "eval_runtime": 12.0898, "eval_samples_per_second": 1571.903, "eval_steps_per_second": 32.755, "step": 66100 }, { "epoch": 25.491329479768787, "grad_norm": 0.3511432111263275, "learning_rate": 9.803622350674375e-05, "loss": 0.34345413208007813, "step": 66150 }, { "epoch": 25.491329479768787, "eval_loss": 0.40407460927963257, "eval_runtime": 12.1242, "eval_samples_per_second": 1567.44, "eval_steps_per_second": 32.662, "step": 66150 }, { "epoch": 25.510597302504816, "grad_norm": 0.3551037013530731, "learning_rate": 9.795915221579962e-05, "loss": 0.35014678955078127, "step": 66200 }, { "epoch": 25.510597302504816, "eval_loss": 0.40050557255744934, "eval_runtime": 12.0752, "eval_samples_per_second": 1573.798, "eval_steps_per_second": 32.794, "step": 66200 }, { "epoch": 25.529865125240846, "grad_norm": 0.3218974471092224, "learning_rate": 9.78820809248555e-05, "loss": 0.34970626831054685, "step": 66250 }, { "epoch": 25.529865125240846, "eval_loss": 0.40552693605422974, "eval_runtime": 12.1569, "eval_samples_per_second": 1563.23, "eval_steps_per_second": 32.574, "step": 66250 }, { "epoch": 25.54913294797688, "grad_norm": 0.3595046401023865, "learning_rate": 9.780500963391137e-05, "loss": 0.3438873291015625, "step": 66300 }, { "epoch": 25.54913294797688, "eval_loss": 0.40775054693222046, "eval_runtime": 12.1791, "eval_samples_per_second": 1560.383, "eval_steps_per_second": 32.515, "step": 66300 }, { "epoch": 25.56840077071291, "grad_norm": 0.3286336660385132, "learning_rate": 9.772793834296725e-05, "loss": 0.347752685546875, "step": 66350 }, { "epoch": 25.56840077071291, "eval_loss": 0.4050438404083252, "eval_runtime": 12.2182, "eval_samples_per_second": 1555.38, "eval_steps_per_second": 32.411, "step": 66350 }, { "epoch": 25.58766859344894, "grad_norm": 0.3117230236530304, "learning_rate": 9.765086705202312e-05, "loss": 0.3465156555175781, "step": 66400 }, { "epoch": 25.58766859344894, "eval_loss": 0.3949090838432312, "eval_runtime": 12.2689, "eval_samples_per_second": 1548.957, "eval_steps_per_second": 32.277, "step": 66400 }, { "epoch": 25.606936416184972, "grad_norm": 0.34121015667915344, "learning_rate": 9.7573795761079e-05, "loss": 0.34539901733398437, "step": 66450 }, { "epoch": 25.606936416184972, "eval_loss": 0.4027334451675415, "eval_runtime": 12.2365, "eval_samples_per_second": 1553.059, "eval_steps_per_second": 32.362, "step": 66450 }, { "epoch": 25.626204238921, "grad_norm": 0.39444777369499207, "learning_rate": 9.749672447013489e-05, "loss": 0.3482596969604492, "step": 66500 }, { "epoch": 25.626204238921, "eval_loss": 0.40894556045532227, "eval_runtime": 12.2733, "eval_samples_per_second": 1548.403, "eval_steps_per_second": 32.265, "step": 66500 }, { "epoch": 25.64547206165703, "grad_norm": 0.3334032893180847, "learning_rate": 9.741965317919074e-05, "loss": 0.34367431640625, "step": 66550 }, { "epoch": 25.64547206165703, "eval_loss": 0.401370108127594, "eval_runtime": 12.2481, "eval_samples_per_second": 1551.583, "eval_steps_per_second": 32.331, "step": 66550 }, { "epoch": 25.664739884393065, "grad_norm": 0.37000197172164917, "learning_rate": 9.734258188824663e-05, "loss": 0.33841175079345703, "step": 66600 }, { "epoch": 25.664739884393065, "eval_loss": 0.40681830048561096, "eval_runtime": 12.0996, "eval_samples_per_second": 1570.636, "eval_steps_per_second": 32.728, "step": 66600 }, { "epoch": 25.684007707129094, "grad_norm": 0.37456098198890686, "learning_rate": 9.726551059730251e-05, "loss": 0.3488788223266602, "step": 66650 }, { "epoch": 25.684007707129094, "eval_loss": 0.3951868712902069, "eval_runtime": 12.0736, "eval_samples_per_second": 1574.01, "eval_steps_per_second": 32.799, "step": 66650 }, { "epoch": 25.703275529865124, "grad_norm": 0.3367025554180145, "learning_rate": 9.718843930635839e-05, "loss": 0.34403549194335936, "step": 66700 }, { "epoch": 25.703275529865124, "eval_loss": 0.40895238518714905, "eval_runtime": 12.1912, "eval_samples_per_second": 1558.834, "eval_steps_per_second": 32.483, "step": 66700 }, { "epoch": 25.722543352601157, "grad_norm": 0.37587013840675354, "learning_rate": 9.711136801541426e-05, "loss": 0.3417684173583984, "step": 66750 }, { "epoch": 25.722543352601157, "eval_loss": 0.40247365832328796, "eval_runtime": 12.1608, "eval_samples_per_second": 1562.729, "eval_steps_per_second": 32.564, "step": 66750 }, { "epoch": 25.741811175337187, "grad_norm": 0.4033512473106384, "learning_rate": 9.703429672447014e-05, "loss": 0.34644485473632813, "step": 66800 }, { "epoch": 25.741811175337187, "eval_loss": 0.40917232632637024, "eval_runtime": 12.2056, "eval_samples_per_second": 1556.991, "eval_steps_per_second": 32.444, "step": 66800 }, { "epoch": 25.761078998073216, "grad_norm": 0.35006043314933777, "learning_rate": 9.695722543352601e-05, "loss": 0.3466936492919922, "step": 66850 }, { "epoch": 25.761078998073216, "eval_loss": 0.4087439775466919, "eval_runtime": 12.2051, "eval_samples_per_second": 1557.057, "eval_steps_per_second": 32.446, "step": 66850 }, { "epoch": 25.78034682080925, "grad_norm": 0.35374945402145386, "learning_rate": 9.688015414258188e-05, "loss": 0.3434608840942383, "step": 66900 }, { "epoch": 25.78034682080925, "eval_loss": 0.40686798095703125, "eval_runtime": 12.1843, "eval_samples_per_second": 1559.714, "eval_steps_per_second": 32.501, "step": 66900 }, { "epoch": 25.79961464354528, "grad_norm": 0.3190465569496155, "learning_rate": 9.680308285163777e-05, "loss": 0.3501566696166992, "step": 66950 }, { "epoch": 25.79961464354528, "eval_loss": 0.4002121388912201, "eval_runtime": 12.1967, "eval_samples_per_second": 1558.125, "eval_steps_per_second": 32.468, "step": 66950 }, { "epoch": 25.81888246628131, "grad_norm": 0.37279319763183594, "learning_rate": 9.672601156069364e-05, "loss": 0.3482442092895508, "step": 67000 }, { "epoch": 25.81888246628131, "eval_loss": 0.41752660274505615, "eval_runtime": 12.2693, "eval_samples_per_second": 1548.912, "eval_steps_per_second": 32.276, "step": 67000 }, { "epoch": 25.838150289017342, "grad_norm": 0.38556748628616333, "learning_rate": 9.664894026974953e-05, "loss": 0.3436566162109375, "step": 67050 }, { "epoch": 25.838150289017342, "eval_loss": 0.40154600143432617, "eval_runtime": 12.4004, "eval_samples_per_second": 1532.533, "eval_steps_per_second": 31.934, "step": 67050 }, { "epoch": 25.857418111753372, "grad_norm": 0.45078378915786743, "learning_rate": 9.65718689788054e-05, "loss": 0.3497908020019531, "step": 67100 }, { "epoch": 25.857418111753372, "eval_loss": 0.4075978398323059, "eval_runtime": 12.2393, "eval_samples_per_second": 1552.709, "eval_steps_per_second": 32.355, "step": 67100 }, { "epoch": 25.8766859344894, "grad_norm": 0.37019991874694824, "learning_rate": 9.649479768786127e-05, "loss": 0.3480282211303711, "step": 67150 }, { "epoch": 25.8766859344894, "eval_loss": 0.40467461943626404, "eval_runtime": 12.2329, "eval_samples_per_second": 1553.522, "eval_steps_per_second": 32.372, "step": 67150 }, { "epoch": 25.895953757225435, "grad_norm": 0.3412874639034271, "learning_rate": 9.641772639691715e-05, "loss": 0.34721138000488283, "step": 67200 }, { "epoch": 25.895953757225435, "eval_loss": 0.3982767164707184, "eval_runtime": 12.2502, "eval_samples_per_second": 1551.324, "eval_steps_per_second": 32.326, "step": 67200 }, { "epoch": 25.915221579961464, "grad_norm": 0.31660884618759155, "learning_rate": 9.634065510597304e-05, "loss": 0.34823028564453123, "step": 67250 }, { "epoch": 25.915221579961464, "eval_loss": 0.40196919441223145, "eval_runtime": 12.24, "eval_samples_per_second": 1552.61, "eval_steps_per_second": 32.353, "step": 67250 }, { "epoch": 25.934489402697494, "grad_norm": 0.41867905855178833, "learning_rate": 9.62635838150289e-05, "loss": 0.3443562316894531, "step": 67300 }, { "epoch": 25.934489402697494, "eval_loss": 0.4046531021595001, "eval_runtime": 12.2337, "eval_samples_per_second": 1553.415, "eval_steps_per_second": 32.37, "step": 67300 }, { "epoch": 25.953757225433527, "grad_norm": 0.3803860545158386, "learning_rate": 9.618651252408478e-05, "loss": 0.34026412963867186, "step": 67350 }, { "epoch": 25.953757225433527, "eval_loss": 0.40111345052719116, "eval_runtime": 12.1564, "eval_samples_per_second": 1563.294, "eval_steps_per_second": 32.575, "step": 67350 }, { "epoch": 25.973025048169557, "grad_norm": 0.3240675926208496, "learning_rate": 9.610944123314067e-05, "loss": 0.34514904022216797, "step": 67400 }, { "epoch": 25.973025048169557, "eval_loss": 0.41306251287460327, "eval_runtime": 12.1377, "eval_samples_per_second": 1565.697, "eval_steps_per_second": 32.626, "step": 67400 }, { "epoch": 25.992292870905587, "grad_norm": 0.3654930591583252, "learning_rate": 9.603236994219654e-05, "loss": 0.34577480316162107, "step": 67450 }, { "epoch": 25.992292870905587, "eval_loss": 0.41177788376808167, "eval_runtime": 12.0832, "eval_samples_per_second": 1572.761, "eval_steps_per_second": 32.773, "step": 67450 }, { "epoch": 26.01156069364162, "grad_norm": 0.3365257680416107, "learning_rate": 9.595529865125241e-05, "loss": 0.34888484954833987, "step": 67500 }, { "epoch": 26.01156069364162, "eval_loss": 0.41132134199142456, "eval_runtime": 12.185, "eval_samples_per_second": 1559.617, "eval_steps_per_second": 32.499, "step": 67500 }, { "epoch": 26.03082851637765, "grad_norm": 0.3200022876262665, "learning_rate": 9.58782273603083e-05, "loss": 0.34600536346435545, "step": 67550 }, { "epoch": 26.03082851637765, "eval_loss": 0.40956953167915344, "eval_runtime": 12.2529, "eval_samples_per_second": 1550.974, "eval_steps_per_second": 32.319, "step": 67550 }, { "epoch": 26.05009633911368, "grad_norm": 0.31594377756118774, "learning_rate": 9.580115606936416e-05, "loss": 0.3390455627441406, "step": 67600 }, { "epoch": 26.05009633911368, "eval_loss": 0.40676069259643555, "eval_runtime": 12.2485, "eval_samples_per_second": 1551.537, "eval_steps_per_second": 32.331, "step": 67600 }, { "epoch": 26.069364161849713, "grad_norm": 0.3830168843269348, "learning_rate": 9.572408477842004e-05, "loss": 0.34558757781982424, "step": 67650 }, { "epoch": 26.069364161849713, "eval_loss": 0.4033353626728058, "eval_runtime": 12.2888, "eval_samples_per_second": 1546.451, "eval_steps_per_second": 32.225, "step": 67650 }, { "epoch": 26.088631984585742, "grad_norm": 0.46510234475135803, "learning_rate": 9.564701348747592e-05, "loss": 0.3491487884521484, "step": 67700 }, { "epoch": 26.088631984585742, "eval_loss": 0.41251733899116516, "eval_runtime": 12.1917, "eval_samples_per_second": 1558.768, "eval_steps_per_second": 32.481, "step": 67700 }, { "epoch": 26.107899807321772, "grad_norm": 0.32603737711906433, "learning_rate": 9.556994219653179e-05, "loss": 0.3490312957763672, "step": 67750 }, { "epoch": 26.107899807321772, "eval_loss": 0.4019075632095337, "eval_runtime": 12.1279, "eval_samples_per_second": 1566.964, "eval_steps_per_second": 32.652, "step": 67750 }, { "epoch": 26.127167630057805, "grad_norm": 0.3401066064834595, "learning_rate": 9.549287090558768e-05, "loss": 0.34681896209716795, "step": 67800 }, { "epoch": 26.127167630057805, "eval_loss": 0.40786033868789673, "eval_runtime": 12.1075, "eval_samples_per_second": 1569.61, "eval_steps_per_second": 32.707, "step": 67800 }, { "epoch": 26.146435452793835, "grad_norm": 0.3513272702693939, "learning_rate": 9.541579961464355e-05, "loss": 0.340859375, "step": 67850 }, { "epoch": 26.146435452793835, "eval_loss": 0.40593987703323364, "eval_runtime": 12.1055, "eval_samples_per_second": 1569.865, "eval_steps_per_second": 32.712, "step": 67850 }, { "epoch": 26.165703275529864, "grad_norm": 0.3863741457462311, "learning_rate": 9.533872832369942e-05, "loss": 0.33884952545166014, "step": 67900 }, { "epoch": 26.165703275529864, "eval_loss": 0.4085873067378998, "eval_runtime": 12.124, "eval_samples_per_second": 1567.468, "eval_steps_per_second": 32.662, "step": 67900 }, { "epoch": 26.184971098265898, "grad_norm": 0.3499763607978821, "learning_rate": 9.52616570327553e-05, "loss": 0.34146469116210937, "step": 67950 }, { "epoch": 26.184971098265898, "eval_loss": 0.3959965109825134, "eval_runtime": 12.1023, "eval_samples_per_second": 1570.279, "eval_steps_per_second": 32.721, "step": 67950 }, { "epoch": 26.204238921001927, "grad_norm": 0.39003264904022217, "learning_rate": 9.518458574181119e-05, "loss": 0.3441759490966797, "step": 68000 }, { "epoch": 26.204238921001927, "eval_loss": 0.40597137808799744, "eval_runtime": 12.1861, "eval_samples_per_second": 1559.477, "eval_steps_per_second": 32.496, "step": 68000 }, { "epoch": 26.223506743737957, "grad_norm": 0.3629879653453827, "learning_rate": 9.510751445086705e-05, "loss": 0.34622596740722655, "step": 68050 }, { "epoch": 26.223506743737957, "eval_loss": 0.397959440946579, "eval_runtime": 12.0761, "eval_samples_per_second": 1573.683, "eval_steps_per_second": 32.792, "step": 68050 }, { "epoch": 26.24277456647399, "grad_norm": 0.3427478075027466, "learning_rate": 9.503044315992293e-05, "loss": 0.34863990783691406, "step": 68100 }, { "epoch": 26.24277456647399, "eval_loss": 0.4004729688167572, "eval_runtime": 12.0655, "eval_samples_per_second": 1575.066, "eval_steps_per_second": 32.821, "step": 68100 }, { "epoch": 26.26204238921002, "grad_norm": 0.3021613359451294, "learning_rate": 9.495337186897882e-05, "loss": 0.34830074310302733, "step": 68150 }, { "epoch": 26.26204238921002, "eval_loss": 0.4076211154460907, "eval_runtime": 12.0554, "eval_samples_per_second": 1576.384, "eval_steps_per_second": 32.848, "step": 68150 }, { "epoch": 26.28131021194605, "grad_norm": 0.365852415561676, "learning_rate": 9.487630057803469e-05, "loss": 0.3380039978027344, "step": 68200 }, { "epoch": 26.28131021194605, "eval_loss": 0.40652161836624146, "eval_runtime": 12.0378, "eval_samples_per_second": 1578.698, "eval_steps_per_second": 32.896, "step": 68200 }, { "epoch": 26.30057803468208, "grad_norm": 0.3053823411464691, "learning_rate": 9.479922928709056e-05, "loss": 0.3419927215576172, "step": 68250 }, { "epoch": 26.30057803468208, "eval_loss": 0.41093236207962036, "eval_runtime": 12.0946, "eval_samples_per_second": 1571.274, "eval_steps_per_second": 32.742, "step": 68250 }, { "epoch": 26.319845857418112, "grad_norm": 0.36691001057624817, "learning_rate": 9.472215799614644e-05, "loss": 0.34058448791503904, "step": 68300 }, { "epoch": 26.319845857418112, "eval_loss": 0.40737009048461914, "eval_runtime": 12.0888, "eval_samples_per_second": 1572.035, "eval_steps_per_second": 32.758, "step": 68300 }, { "epoch": 26.339113680154142, "grad_norm": 0.34305375814437866, "learning_rate": 9.464508670520232e-05, "loss": 0.33964698791503906, "step": 68350 }, { "epoch": 26.339113680154142, "eval_loss": 0.4001956582069397, "eval_runtime": 12.1186, "eval_samples_per_second": 1568.171, "eval_steps_per_second": 32.677, "step": 68350 }, { "epoch": 26.358381502890172, "grad_norm": 0.30825650691986084, "learning_rate": 9.456801541425819e-05, "loss": 0.3435700607299805, "step": 68400 }, { "epoch": 26.358381502890172, "eval_loss": 0.4077039957046509, "eval_runtime": 12.0796, "eval_samples_per_second": 1573.227, "eval_steps_per_second": 32.782, "step": 68400 }, { "epoch": 26.377649325626205, "grad_norm": 0.31938645243644714, "learning_rate": 9.449094412331407e-05, "loss": 0.3439485168457031, "step": 68450 }, { "epoch": 26.377649325626205, "eval_loss": 0.41379180550575256, "eval_runtime": 12.0916, "eval_samples_per_second": 1571.672, "eval_steps_per_second": 32.75, "step": 68450 }, { "epoch": 26.396917148362235, "grad_norm": 0.3482989966869354, "learning_rate": 9.441387283236994e-05, "loss": 0.3469572067260742, "step": 68500 }, { "epoch": 26.396917148362235, "eval_loss": 0.39444559812545776, "eval_runtime": 12.1019, "eval_samples_per_second": 1570.331, "eval_steps_per_second": 32.722, "step": 68500 }, { "epoch": 26.416184971098264, "grad_norm": 0.3449409604072571, "learning_rate": 9.433680154142583e-05, "loss": 0.3434992218017578, "step": 68550 }, { "epoch": 26.416184971098264, "eval_loss": 0.4079976975917816, "eval_runtime": 12.1006, "eval_samples_per_second": 1570.496, "eval_steps_per_second": 32.726, "step": 68550 }, { "epoch": 26.435452793834298, "grad_norm": 0.339961439371109, "learning_rate": 9.42597302504817e-05, "loss": 0.34455970764160154, "step": 68600 }, { "epoch": 26.435452793834298, "eval_loss": 0.40476903319358826, "eval_runtime": 12.0805, "eval_samples_per_second": 1573.109, "eval_steps_per_second": 32.78, "step": 68600 }, { "epoch": 26.454720616570327, "grad_norm": 0.3506257236003876, "learning_rate": 9.418265895953757e-05, "loss": 0.3417066955566406, "step": 68650 }, { "epoch": 26.454720616570327, "eval_loss": 0.39567285776138306, "eval_runtime": 12.0916, "eval_samples_per_second": 1571.675, "eval_steps_per_second": 32.75, "step": 68650 }, { "epoch": 26.473988439306357, "grad_norm": 0.34993255138397217, "learning_rate": 9.410558766859346e-05, "loss": 0.34405410766601563, "step": 68700 }, { "epoch": 26.473988439306357, "eval_loss": 0.4022035300731659, "eval_runtime": 12.1108, "eval_samples_per_second": 1569.176, "eval_steps_per_second": 32.698, "step": 68700 }, { "epoch": 26.49325626204239, "grad_norm": 0.3030779957771301, "learning_rate": 9.402851637764933e-05, "loss": 0.34207466125488284, "step": 68750 }, { "epoch": 26.49325626204239, "eval_loss": 0.39594972133636475, "eval_runtime": 12.0672, "eval_samples_per_second": 1574.85, "eval_steps_per_second": 32.816, "step": 68750 }, { "epoch": 26.51252408477842, "grad_norm": 0.3765057325363159, "learning_rate": 9.39514450867052e-05, "loss": 0.3455942916870117, "step": 68800 }, { "epoch": 26.51252408477842, "eval_loss": 0.4009142518043518, "eval_runtime": 12.0833, "eval_samples_per_second": 1572.753, "eval_steps_per_second": 32.773, "step": 68800 }, { "epoch": 26.53179190751445, "grad_norm": 0.36234042048454285, "learning_rate": 9.387437379576108e-05, "loss": 0.34054161071777345, "step": 68850 }, { "epoch": 26.53179190751445, "eval_loss": 0.4049872159957886, "eval_runtime": 12.1933, "eval_samples_per_second": 1558.563, "eval_steps_per_second": 32.477, "step": 68850 }, { "epoch": 26.551059730250483, "grad_norm": 0.37690702080726624, "learning_rate": 9.379730250481697e-05, "loss": 0.3495954513549805, "step": 68900 }, { "epoch": 26.551059730250483, "eval_loss": 0.39978334307670593, "eval_runtime": 12.1893, "eval_samples_per_second": 1559.074, "eval_steps_per_second": 32.488, "step": 68900 }, { "epoch": 26.570327552986512, "grad_norm": 0.37452253699302673, "learning_rate": 9.372023121387284e-05, "loss": 0.3411111068725586, "step": 68950 }, { "epoch": 26.570327552986512, "eval_loss": 0.3998531699180603, "eval_runtime": 12.1913, "eval_samples_per_second": 1558.822, "eval_steps_per_second": 32.482, "step": 68950 }, { "epoch": 26.589595375722542, "grad_norm": 0.309113472700119, "learning_rate": 9.364315992292871e-05, "loss": 0.3431053924560547, "step": 69000 }, { "epoch": 26.589595375722542, "eval_loss": 0.39281991124153137, "eval_runtime": 12.1629, "eval_samples_per_second": 1562.45, "eval_steps_per_second": 32.558, "step": 69000 }, { "epoch": 26.608863198458575, "grad_norm": 0.36073818802833557, "learning_rate": 9.35660886319846e-05, "loss": 0.34102455139160154, "step": 69050 }, { "epoch": 26.608863198458575, "eval_loss": 0.4005065858364105, "eval_runtime": 12.1771, "eval_samples_per_second": 1560.633, "eval_steps_per_second": 32.52, "step": 69050 }, { "epoch": 26.628131021194605, "grad_norm": 0.3529011011123657, "learning_rate": 9.348901734104047e-05, "loss": 0.34983848571777343, "step": 69100 }, { "epoch": 26.628131021194605, "eval_loss": 0.4047587513923645, "eval_runtime": 12.1723, "eval_samples_per_second": 1561.248, "eval_steps_per_second": 32.533, "step": 69100 }, { "epoch": 26.647398843930635, "grad_norm": 0.32962512969970703, "learning_rate": 9.341194605009634e-05, "loss": 0.3458911895751953, "step": 69150 }, { "epoch": 26.647398843930635, "eval_loss": 0.39084112644195557, "eval_runtime": 12.1687, "eval_samples_per_second": 1561.706, "eval_steps_per_second": 32.542, "step": 69150 }, { "epoch": 26.666666666666668, "grad_norm": 0.4091123640537262, "learning_rate": 9.333487475915222e-05, "loss": 0.3448305511474609, "step": 69200 }, { "epoch": 26.666666666666668, "eval_loss": 0.40475282073020935, "eval_runtime": 12.0856, "eval_samples_per_second": 1572.444, "eval_steps_per_second": 32.766, "step": 69200 }, { "epoch": 26.685934489402698, "grad_norm": 0.3582211434841156, "learning_rate": 9.32578034682081e-05, "loss": 0.3478606414794922, "step": 69250 }, { "epoch": 26.685934489402698, "eval_loss": 0.40264344215393066, "eval_runtime": 12.0796, "eval_samples_per_second": 1573.237, "eval_steps_per_second": 32.783, "step": 69250 }, { "epoch": 26.705202312138727, "grad_norm": 0.3808528780937195, "learning_rate": 9.318073217726398e-05, "loss": 0.3408049011230469, "step": 69300 }, { "epoch": 26.705202312138727, "eval_loss": 0.397393137216568, "eval_runtime": 12.072, "eval_samples_per_second": 1574.22, "eval_steps_per_second": 32.803, "step": 69300 }, { "epoch": 26.72447013487476, "grad_norm": 0.4342128336429596, "learning_rate": 9.310366088631985e-05, "loss": 0.3466617584228516, "step": 69350 }, { "epoch": 26.72447013487476, "eval_loss": 0.40396037697792053, "eval_runtime": 12.3132, "eval_samples_per_second": 1543.39, "eval_steps_per_second": 32.161, "step": 69350 }, { "epoch": 26.74373795761079, "grad_norm": 0.34411364793777466, "learning_rate": 9.302658959537572e-05, "loss": 0.3410129928588867, "step": 69400 }, { "epoch": 26.74373795761079, "eval_loss": 0.4010723829269409, "eval_runtime": 12.1889, "eval_samples_per_second": 1559.121, "eval_steps_per_second": 32.489, "step": 69400 }, { "epoch": 26.76300578034682, "grad_norm": 0.36226412653923035, "learning_rate": 9.294951830443161e-05, "loss": 0.34042694091796877, "step": 69450 }, { "epoch": 26.76300578034682, "eval_loss": 0.39972326159477234, "eval_runtime": 12.2382, "eval_samples_per_second": 1552.844, "eval_steps_per_second": 32.358, "step": 69450 }, { "epoch": 26.782273603082853, "grad_norm": 0.3168051242828369, "learning_rate": 9.287244701348748e-05, "loss": 0.3471105194091797, "step": 69500 }, { "epoch": 26.782273603082853, "eval_loss": 0.404671847820282, "eval_runtime": 12.1949, "eval_samples_per_second": 1558.355, "eval_steps_per_second": 32.473, "step": 69500 }, { "epoch": 26.801541425818883, "grad_norm": 0.34712958335876465, "learning_rate": 9.279537572254335e-05, "loss": 0.3430381393432617, "step": 69550 }, { "epoch": 26.801541425818883, "eval_loss": 0.3932829797267914, "eval_runtime": 12.1561, "eval_samples_per_second": 1563.333, "eval_steps_per_second": 32.576, "step": 69550 }, { "epoch": 26.820809248554912, "grad_norm": 0.3445735573768616, "learning_rate": 9.271830443159924e-05, "loss": 0.34376968383789064, "step": 69600 }, { "epoch": 26.820809248554912, "eval_loss": 0.4087623059749603, "eval_runtime": 12.1834, "eval_samples_per_second": 1559.829, "eval_steps_per_second": 32.503, "step": 69600 }, { "epoch": 26.840077071290946, "grad_norm": 0.3498722016811371, "learning_rate": 9.264123314065512e-05, "loss": 0.3426410675048828, "step": 69650 }, { "epoch": 26.840077071290946, "eval_loss": 0.4004414975643158, "eval_runtime": 12.1796, "eval_samples_per_second": 1560.318, "eval_steps_per_second": 32.513, "step": 69650 }, { "epoch": 26.859344894026975, "grad_norm": 0.3773338198661804, "learning_rate": 9.256416184971098e-05, "loss": 0.3438229751586914, "step": 69700 }, { "epoch": 26.859344894026975, "eval_loss": 0.4128226935863495, "eval_runtime": 12.1737, "eval_samples_per_second": 1561.065, "eval_steps_per_second": 32.529, "step": 69700 }, { "epoch": 26.878612716763005, "grad_norm": 0.328712522983551, "learning_rate": 9.248709055876686e-05, "loss": 0.34241539001464844, "step": 69750 }, { "epoch": 26.878612716763005, "eval_loss": 0.4050303101539612, "eval_runtime": 12.1913, "eval_samples_per_second": 1558.818, "eval_steps_per_second": 32.482, "step": 69750 }, { "epoch": 26.897880539499038, "grad_norm": 0.37562403082847595, "learning_rate": 9.241001926782275e-05, "loss": 0.3436603546142578, "step": 69800 }, { "epoch": 26.897880539499038, "eval_loss": 0.39924830198287964, "eval_runtime": 12.1928, "eval_samples_per_second": 1558.619, "eval_steps_per_second": 32.478, "step": 69800 }, { "epoch": 26.917148362235068, "grad_norm": 0.37423691153526306, "learning_rate": 9.233294797687862e-05, "loss": 0.34328948974609375, "step": 69850 }, { "epoch": 26.917148362235068, "eval_loss": 0.4031747877597809, "eval_runtime": 12.1373, "eval_samples_per_second": 1565.757, "eval_steps_per_second": 32.627, "step": 69850 }, { "epoch": 26.936416184971097, "grad_norm": 0.34089457988739014, "learning_rate": 9.225587668593449e-05, "loss": 0.34267967224121093, "step": 69900 }, { "epoch": 26.936416184971097, "eval_loss": 0.4087149500846863, "eval_runtime": 12.107, "eval_samples_per_second": 1569.671, "eval_steps_per_second": 32.708, "step": 69900 }, { "epoch": 26.95568400770713, "grad_norm": 0.36034461855888367, "learning_rate": 9.217880539499038e-05, "loss": 0.3385904693603516, "step": 69950 }, { "epoch": 26.95568400770713, "eval_loss": 0.4106902778148651, "eval_runtime": 12.1125, "eval_samples_per_second": 1568.962, "eval_steps_per_second": 32.694, "step": 69950 }, { "epoch": 26.97495183044316, "grad_norm": 0.4286401867866516, "learning_rate": 9.210173410404625e-05, "loss": 0.34384689331054685, "step": 70000 }, { "epoch": 26.97495183044316, "eval_loss": 0.4065011739730835, "eval_runtime": 12.1342, "eval_samples_per_second": 1566.151, "eval_steps_per_second": 32.635, "step": 70000 }, { "epoch": 26.99421965317919, "grad_norm": 0.3543224334716797, "learning_rate": 9.202466281310213e-05, "loss": 0.33668155670166017, "step": 70050 }, { "epoch": 26.99421965317919, "eval_loss": 0.4009856581687927, "eval_runtime": 12.1084, "eval_samples_per_second": 1569.493, "eval_steps_per_second": 32.705, "step": 70050 }, { "epoch": 27.013487475915223, "grad_norm": 0.36464059352874756, "learning_rate": 9.1947591522158e-05, "loss": 0.34188491821289063, "step": 70100 }, { "epoch": 27.013487475915223, "eval_loss": 0.41445639729499817, "eval_runtime": 12.1832, "eval_samples_per_second": 1559.847, "eval_steps_per_second": 32.504, "step": 70100 }, { "epoch": 27.032755298651253, "grad_norm": 0.35329607129096985, "learning_rate": 9.187052023121387e-05, "loss": 0.33964492797851564, "step": 70150 }, { "epoch": 27.032755298651253, "eval_loss": 0.4069187045097351, "eval_runtime": 12.1575, "eval_samples_per_second": 1563.156, "eval_steps_per_second": 32.573, "step": 70150 }, { "epoch": 27.052023121387283, "grad_norm": 0.34304171800613403, "learning_rate": 9.179344894026976e-05, "loss": 0.3469298553466797, "step": 70200 }, { "epoch": 27.052023121387283, "eval_loss": 0.40434688329696655, "eval_runtime": 12.1541, "eval_samples_per_second": 1563.586, "eval_steps_per_second": 32.582, "step": 70200 }, { "epoch": 27.071290944123316, "grad_norm": 0.37067097425460815, "learning_rate": 9.171637764932563e-05, "loss": 0.34144710540771483, "step": 70250 }, { "epoch": 27.071290944123316, "eval_loss": 0.3998097777366638, "eval_runtime": 12.1163, "eval_samples_per_second": 1568.463, "eval_steps_per_second": 32.683, "step": 70250 }, { "epoch": 27.090558766859345, "grad_norm": 0.35580769181251526, "learning_rate": 9.16393063583815e-05, "loss": 0.33546737670898436, "step": 70300 }, { "epoch": 27.090558766859345, "eval_loss": 0.39971816539764404, "eval_runtime": 12.2223, "eval_samples_per_second": 1554.859, "eval_steps_per_second": 32.4, "step": 70300 }, { "epoch": 27.109826589595375, "grad_norm": 0.3048737049102783, "learning_rate": 9.156223506743739e-05, "loss": 0.3457350158691406, "step": 70350 }, { "epoch": 27.109826589595375, "eval_loss": 0.4093460142612457, "eval_runtime": 12.0953, "eval_samples_per_second": 1571.191, "eval_steps_per_second": 32.74, "step": 70350 }, { "epoch": 27.129094412331405, "grad_norm": 0.4064912796020508, "learning_rate": 9.148516377649327e-05, "loss": 0.3433489608764648, "step": 70400 }, { "epoch": 27.129094412331405, "eval_loss": 0.39858436584472656, "eval_runtime": 12.0885, "eval_samples_per_second": 1572.073, "eval_steps_per_second": 32.758, "step": 70400 }, { "epoch": 27.148362235067438, "grad_norm": 0.30401647090911865, "learning_rate": 9.140809248554913e-05, "loss": 0.34147052764892577, "step": 70450 }, { "epoch": 27.148362235067438, "eval_loss": 0.3996553122997284, "eval_runtime": 12.135, "eval_samples_per_second": 1566.05, "eval_steps_per_second": 32.633, "step": 70450 }, { "epoch": 27.167630057803468, "grad_norm": 0.3724418580532074, "learning_rate": 9.133102119460501e-05, "loss": 0.34838668823242186, "step": 70500 }, { "epoch": 27.167630057803468, "eval_loss": 0.4072006940841675, "eval_runtime": 12.1704, "eval_samples_per_second": 1561.493, "eval_steps_per_second": 32.538, "step": 70500 }, { "epoch": 27.186897880539497, "grad_norm": 0.3469790518283844, "learning_rate": 9.12539499036609e-05, "loss": 0.34331737518310546, "step": 70550 }, { "epoch": 27.186897880539497, "eval_loss": 0.40911591053009033, "eval_runtime": 12.0841, "eval_samples_per_second": 1572.641, "eval_steps_per_second": 32.77, "step": 70550 }, { "epoch": 27.20616570327553, "grad_norm": 0.34040504693984985, "learning_rate": 9.117687861271677e-05, "loss": 0.3429974365234375, "step": 70600 }, { "epoch": 27.20616570327553, "eval_loss": 0.3930108845233917, "eval_runtime": 12.2873, "eval_samples_per_second": 1546.633, "eval_steps_per_second": 32.228, "step": 70600 }, { "epoch": 27.22543352601156, "grad_norm": 0.3801578879356384, "learning_rate": 9.109980732177264e-05, "loss": 0.3414713287353516, "step": 70650 }, { "epoch": 27.22543352601156, "eval_loss": 0.4028588533401489, "eval_runtime": 12.169, "eval_samples_per_second": 1561.667, "eval_steps_per_second": 32.542, "step": 70650 }, { "epoch": 27.24470134874759, "grad_norm": 0.3553521931171417, "learning_rate": 9.102273603082853e-05, "loss": 0.33831642150878904, "step": 70700 }, { "epoch": 27.24470134874759, "eval_loss": 0.4042399525642395, "eval_runtime": 12.1912, "eval_samples_per_second": 1558.833, "eval_steps_per_second": 32.483, "step": 70700 }, { "epoch": 27.263969171483623, "grad_norm": 0.33868980407714844, "learning_rate": 9.09456647398844e-05, "loss": 0.34295379638671875, "step": 70750 }, { "epoch": 27.263969171483623, "eval_loss": 0.4079524278640747, "eval_runtime": 12.2112, "eval_samples_per_second": 1556.273, "eval_steps_per_second": 32.429, "step": 70750 }, { "epoch": 27.283236994219653, "grad_norm": 0.3783123195171356, "learning_rate": 9.086859344894027e-05, "loss": 0.3371542358398438, "step": 70800 }, { "epoch": 27.283236994219653, "eval_loss": 0.3995727598667145, "eval_runtime": 12.3785, "eval_samples_per_second": 1535.239, "eval_steps_per_second": 31.991, "step": 70800 }, { "epoch": 27.302504816955683, "grad_norm": 0.32976222038269043, "learning_rate": 9.079152215799615e-05, "loss": 0.3448733139038086, "step": 70850 }, { "epoch": 27.302504816955683, "eval_loss": 0.39763426780700684, "eval_runtime": 12.2893, "eval_samples_per_second": 1546.386, "eval_steps_per_second": 32.223, "step": 70850 }, { "epoch": 27.321772639691716, "grad_norm": 0.41020768880844116, "learning_rate": 9.071445086705203e-05, "loss": 0.3445896530151367, "step": 70900 }, { "epoch": 27.321772639691716, "eval_loss": 0.3990596532821655, "eval_runtime": 12.3434, "eval_samples_per_second": 1539.603, "eval_steps_per_second": 32.082, "step": 70900 }, { "epoch": 27.341040462427745, "grad_norm": 0.30504441261291504, "learning_rate": 9.063737957610791e-05, "loss": 0.3421406555175781, "step": 70950 }, { "epoch": 27.341040462427745, "eval_loss": 0.40020135045051575, "eval_runtime": 12.4553, "eval_samples_per_second": 1525.772, "eval_steps_per_second": 31.794, "step": 70950 }, { "epoch": 27.360308285163775, "grad_norm": 0.3219850957393646, "learning_rate": 9.056030828516378e-05, "loss": 0.3436236190795898, "step": 71000 }, { "epoch": 27.360308285163775, "eval_loss": 0.40496236085891724, "eval_runtime": 12.1151, "eval_samples_per_second": 1568.625, "eval_steps_per_second": 32.687, "step": 71000 }, { "epoch": 27.37957610789981, "grad_norm": 0.3589180111885071, "learning_rate": 9.048323699421965e-05, "loss": 0.3432718658447266, "step": 71050 }, { "epoch": 27.37957610789981, "eval_loss": 0.40268275141716003, "eval_runtime": 12.0959, "eval_samples_per_second": 1571.115, "eval_steps_per_second": 32.738, "step": 71050 }, { "epoch": 27.398843930635838, "grad_norm": 0.3488368093967438, "learning_rate": 9.040616570327554e-05, "loss": 0.3447647476196289, "step": 71100 }, { "epoch": 27.398843930635838, "eval_loss": 0.3977199196815491, "eval_runtime": 12.1591, "eval_samples_per_second": 1562.951, "eval_steps_per_second": 32.568, "step": 71100 }, { "epoch": 27.418111753371868, "grad_norm": 0.38076522946357727, "learning_rate": 9.032909441233141e-05, "loss": 0.34290672302246095, "step": 71150 }, { "epoch": 27.418111753371868, "eval_loss": 0.4041559100151062, "eval_runtime": 12.2432, "eval_samples_per_second": 1552.207, "eval_steps_per_second": 32.344, "step": 71150 }, { "epoch": 27.4373795761079, "grad_norm": 0.3307676315307617, "learning_rate": 9.025202312138728e-05, "loss": 0.3373046112060547, "step": 71200 }, { "epoch": 27.4373795761079, "eval_loss": 0.407199501991272, "eval_runtime": 12.1593, "eval_samples_per_second": 1562.914, "eval_steps_per_second": 32.568, "step": 71200 }, { "epoch": 27.45664739884393, "grad_norm": 0.3324744403362274, "learning_rate": 9.017495183044317e-05, "loss": 0.3426343536376953, "step": 71250 }, { "epoch": 27.45664739884393, "eval_loss": 0.4089909791946411, "eval_runtime": 12.1037, "eval_samples_per_second": 1570.092, "eval_steps_per_second": 32.717, "step": 71250 }, { "epoch": 27.47591522157996, "grad_norm": 0.32975542545318604, "learning_rate": 9.009788053949904e-05, "loss": 0.3441029739379883, "step": 71300 }, { "epoch": 27.47591522157996, "eval_loss": 0.40519461035728455, "eval_runtime": 12.1221, "eval_samples_per_second": 1567.712, "eval_steps_per_second": 32.668, "step": 71300 }, { "epoch": 27.495183044315993, "grad_norm": 0.36460864543914795, "learning_rate": 9.002080924855492e-05, "loss": 0.3405307769775391, "step": 71350 }, { "epoch": 27.495183044315993, "eval_loss": 0.4047679007053375, "eval_runtime": 12.0936, "eval_samples_per_second": 1571.407, "eval_steps_per_second": 32.745, "step": 71350 }, { "epoch": 27.514450867052023, "grad_norm": 0.34993845224380493, "learning_rate": 8.99437379576108e-05, "loss": 0.34264163970947265, "step": 71400 }, { "epoch": 27.514450867052023, "eval_loss": 0.39810818433761597, "eval_runtime": 12.2275, "eval_samples_per_second": 1554.199, "eval_steps_per_second": 32.386, "step": 71400 }, { "epoch": 27.533718689788053, "grad_norm": 0.35023918747901917, "learning_rate": 8.986666666666666e-05, "loss": 0.34074546813964846, "step": 71450 }, { "epoch": 27.533718689788053, "eval_loss": 0.4008537232875824, "eval_runtime": 12.2515, "eval_samples_per_second": 1551.156, "eval_steps_per_second": 32.323, "step": 71450 }, { "epoch": 27.552986512524086, "grad_norm": 0.36380478739738464, "learning_rate": 8.978959537572255e-05, "loss": 0.3443232345581055, "step": 71500 }, { "epoch": 27.552986512524086, "eval_loss": 0.402727872133255, "eval_runtime": 12.1694, "eval_samples_per_second": 1561.621, "eval_steps_per_second": 32.541, "step": 71500 }, { "epoch": 27.572254335260116, "grad_norm": 0.3293098509311676, "learning_rate": 8.971252408477842e-05, "loss": 0.343697509765625, "step": 71550 }, { "epoch": 27.572254335260116, "eval_loss": 0.407426655292511, "eval_runtime": 12.0953, "eval_samples_per_second": 1571.191, "eval_steps_per_second": 32.74, "step": 71550 }, { "epoch": 27.591522157996145, "grad_norm": 0.3189796209335327, "learning_rate": 8.963545279383429e-05, "loss": 0.345107421875, "step": 71600 }, { "epoch": 27.591522157996145, "eval_loss": 0.41045936942100525, "eval_runtime": 12.1003, "eval_samples_per_second": 1570.542, "eval_steps_per_second": 32.727, "step": 71600 }, { "epoch": 27.61078998073218, "grad_norm": 0.30838245153427124, "learning_rate": 8.955838150289018e-05, "loss": 0.3394207000732422, "step": 71650 }, { "epoch": 27.61078998073218, "eval_loss": 0.4038008749485016, "eval_runtime": 12.1033, "eval_samples_per_second": 1570.155, "eval_steps_per_second": 32.718, "step": 71650 }, { "epoch": 27.63005780346821, "grad_norm": 0.3677329123020172, "learning_rate": 8.948131021194606e-05, "loss": 0.3424112319946289, "step": 71700 }, { "epoch": 27.63005780346821, "eval_loss": 0.3968685567378998, "eval_runtime": 12.1325, "eval_samples_per_second": 1566.367, "eval_steps_per_second": 32.64, "step": 71700 }, { "epoch": 27.649325626204238, "grad_norm": 0.32951968908309937, "learning_rate": 8.940423892100192e-05, "loss": 0.3398356628417969, "step": 71750 }, { "epoch": 27.649325626204238, "eval_loss": 0.3973637521266937, "eval_runtime": 12.104, "eval_samples_per_second": 1570.055, "eval_steps_per_second": 32.716, "step": 71750 }, { "epoch": 27.66859344894027, "grad_norm": 0.33189913630485535, "learning_rate": 8.93271676300578e-05, "loss": 0.34570510864257814, "step": 71800 }, { "epoch": 27.66859344894027, "eval_loss": 0.3977719247341156, "eval_runtime": 12.0815, "eval_samples_per_second": 1572.984, "eval_steps_per_second": 32.777, "step": 71800 }, { "epoch": 27.6878612716763, "grad_norm": 0.39112940430641174, "learning_rate": 8.925009633911369e-05, "loss": 0.3403484344482422, "step": 71850 }, { "epoch": 27.6878612716763, "eval_loss": 0.40048137307167053, "eval_runtime": 12.1094, "eval_samples_per_second": 1569.355, "eval_steps_per_second": 32.702, "step": 71850 }, { "epoch": 27.70712909441233, "grad_norm": 0.37574002146720886, "learning_rate": 8.917302504816956e-05, "loss": 0.33866683959960936, "step": 71900 }, { "epoch": 27.70712909441233, "eval_loss": 0.3942176103591919, "eval_runtime": 12.115, "eval_samples_per_second": 1568.633, "eval_steps_per_second": 32.687, "step": 71900 }, { "epoch": 27.726396917148364, "grad_norm": 0.2983663082122803, "learning_rate": 8.909595375722543e-05, "loss": 0.34194198608398435, "step": 71950 }, { "epoch": 27.726396917148364, "eval_loss": 0.40677589178085327, "eval_runtime": 12.1406, "eval_samples_per_second": 1565.321, "eval_steps_per_second": 32.618, "step": 71950 }, { "epoch": 27.745664739884393, "grad_norm": 0.3224749267101288, "learning_rate": 8.901888246628132e-05, "loss": 0.3430283355712891, "step": 72000 }, { "epoch": 27.745664739884393, "eval_loss": 0.40603792667388916, "eval_runtime": 12.0997, "eval_samples_per_second": 1570.624, "eval_steps_per_second": 32.728, "step": 72000 }, { "epoch": 27.764932562620423, "grad_norm": 0.3764272630214691, "learning_rate": 8.894181117533719e-05, "loss": 0.33977081298828127, "step": 72050 }, { "epoch": 27.764932562620423, "eval_loss": 0.3931167423725128, "eval_runtime": 12.1306, "eval_samples_per_second": 1566.613, "eval_steps_per_second": 32.645, "step": 72050 }, { "epoch": 27.784200385356456, "grad_norm": 0.33363887667655945, "learning_rate": 8.886473988439307e-05, "loss": 0.3449821853637695, "step": 72100 }, { "epoch": 27.784200385356456, "eval_loss": 0.40895944833755493, "eval_runtime": 12.1326, "eval_samples_per_second": 1566.363, "eval_steps_per_second": 32.639, "step": 72100 }, { "epoch": 27.803468208092486, "grad_norm": 0.39367878437042236, "learning_rate": 8.878766859344894e-05, "loss": 0.34095088958740233, "step": 72150 }, { "epoch": 27.803468208092486, "eval_loss": 0.40529584884643555, "eval_runtime": 12.1722, "eval_samples_per_second": 1561.265, "eval_steps_per_second": 32.533, "step": 72150 }, { "epoch": 27.822736030828516, "grad_norm": 0.32966548204421997, "learning_rate": 8.871059730250482e-05, "loss": 0.34028297424316406, "step": 72200 }, { "epoch": 27.822736030828516, "eval_loss": 0.4057076573371887, "eval_runtime": 12.1313, "eval_samples_per_second": 1566.52, "eval_steps_per_second": 32.643, "step": 72200 }, { "epoch": 27.84200385356455, "grad_norm": 0.4148493707180023, "learning_rate": 8.86335260115607e-05, "loss": 0.3405000305175781, "step": 72250 }, { "epoch": 27.84200385356455, "eval_loss": 0.402450829744339, "eval_runtime": 12.3106, "eval_samples_per_second": 1543.712, "eval_steps_per_second": 32.167, "step": 72250 }, { "epoch": 27.86127167630058, "grad_norm": 0.40058425068855286, "learning_rate": 8.855645472061657e-05, "loss": 0.3361852264404297, "step": 72300 }, { "epoch": 27.86127167630058, "eval_loss": 0.38880547881126404, "eval_runtime": 12.2282, "eval_samples_per_second": 1554.112, "eval_steps_per_second": 32.384, "step": 72300 }, { "epoch": 27.880539499036608, "grad_norm": 0.37408918142318726, "learning_rate": 8.847938342967244e-05, "loss": 0.34303882598876956, "step": 72350 }, { "epoch": 27.880539499036608, "eval_loss": 0.4043724834918976, "eval_runtime": 12.1315, "eval_samples_per_second": 1566.504, "eval_steps_per_second": 32.642, "step": 72350 }, { "epoch": 27.89980732177264, "grad_norm": 0.36157700419425964, "learning_rate": 8.840231213872833e-05, "loss": 0.34021018981933593, "step": 72400 }, { "epoch": 27.89980732177264, "eval_loss": 0.40479332208633423, "eval_runtime": 12.1167, "eval_samples_per_second": 1568.416, "eval_steps_per_second": 32.682, "step": 72400 }, { "epoch": 27.91907514450867, "grad_norm": 0.36208704113960266, "learning_rate": 8.832524084778421e-05, "loss": 0.3451300811767578, "step": 72450 }, { "epoch": 27.91907514450867, "eval_loss": 0.4035382568836212, "eval_runtime": 12.1911, "eval_samples_per_second": 1558.84, "eval_steps_per_second": 32.483, "step": 72450 }, { "epoch": 27.9383429672447, "grad_norm": 0.3315705955028534, "learning_rate": 8.824816955684007e-05, "loss": 0.34081550598144533, "step": 72500 }, { "epoch": 27.9383429672447, "eval_loss": 0.39865633845329285, "eval_runtime": 12.1193, "eval_samples_per_second": 1568.083, "eval_steps_per_second": 32.675, "step": 72500 }, { "epoch": 27.95761078998073, "grad_norm": 0.35635730624198914, "learning_rate": 8.817109826589596e-05, "loss": 0.34472591400146485, "step": 72550 }, { "epoch": 27.95761078998073, "eval_loss": 0.40570735931396484, "eval_runtime": 12.1143, "eval_samples_per_second": 1568.726, "eval_steps_per_second": 32.689, "step": 72550 }, { "epoch": 27.976878612716764, "grad_norm": 0.43175315856933594, "learning_rate": 8.809402697495184e-05, "loss": 0.33976757049560546, "step": 72600 }, { "epoch": 27.976878612716764, "eval_loss": 0.3982110321521759, "eval_runtime": 12.1345, "eval_samples_per_second": 1566.111, "eval_steps_per_second": 32.634, "step": 72600 }, { "epoch": 27.996146435452793, "grad_norm": 0.2947046756744385, "learning_rate": 8.801695568400771e-05, "loss": 0.34231842041015625, "step": 72650 }, { "epoch": 27.996146435452793, "eval_loss": 0.3987494111061096, "eval_runtime": 12.1386, "eval_samples_per_second": 1565.58, "eval_steps_per_second": 32.623, "step": 72650 }, { "epoch": 28.015414258188823, "grad_norm": 0.3378951847553253, "learning_rate": 8.793988439306358e-05, "loss": 0.3391097640991211, "step": 72700 }, { "epoch": 28.015414258188823, "eval_loss": 0.4031994044780731, "eval_runtime": 12.2113, "eval_samples_per_second": 1556.258, "eval_steps_per_second": 32.429, "step": 72700 }, { "epoch": 28.034682080924856, "grad_norm": 0.34101206064224243, "learning_rate": 8.786281310211947e-05, "loss": 0.3427278137207031, "step": 72750 }, { "epoch": 28.034682080924856, "eval_loss": 0.4084327220916748, "eval_runtime": 12.1909, "eval_samples_per_second": 1558.867, "eval_steps_per_second": 32.483, "step": 72750 }, { "epoch": 28.053949903660886, "grad_norm": 0.3347659707069397, "learning_rate": 8.778574181117534e-05, "loss": 0.33843963623046874, "step": 72800 }, { "epoch": 28.053949903660886, "eval_loss": 0.3955867886543274, "eval_runtime": 12.196, "eval_samples_per_second": 1558.218, "eval_steps_per_second": 32.47, "step": 72800 }, { "epoch": 28.073217726396916, "grad_norm": 0.38758981227874756, "learning_rate": 8.770867052023121e-05, "loss": 0.34188793182373045, "step": 72850 }, { "epoch": 28.073217726396916, "eval_loss": 0.4027468264102936, "eval_runtime": 12.2037, "eval_samples_per_second": 1557.228, "eval_steps_per_second": 32.449, "step": 72850 }, { "epoch": 28.09248554913295, "grad_norm": 0.33388659358024597, "learning_rate": 8.76315992292871e-05, "loss": 0.33945991516113283, "step": 72900 }, { "epoch": 28.09248554913295, "eval_loss": 0.41589048504829407, "eval_runtime": 12.2128, "eval_samples_per_second": 1556.073, "eval_steps_per_second": 32.425, "step": 72900 }, { "epoch": 28.11175337186898, "grad_norm": 0.32528674602508545, "learning_rate": 8.755452793834297e-05, "loss": 0.33810325622558596, "step": 72950 }, { "epoch": 28.11175337186898, "eval_loss": 0.414380818605423, "eval_runtime": 12.2178, "eval_samples_per_second": 1555.43, "eval_steps_per_second": 32.412, "step": 72950 }, { "epoch": 28.131021194605008, "grad_norm": 0.3338592052459717, "learning_rate": 8.747745664739885e-05, "loss": 0.34067100524902344, "step": 73000 }, { "epoch": 28.131021194605008, "eval_loss": 0.40973034501075745, "eval_runtime": 12.0893, "eval_samples_per_second": 1571.967, "eval_steps_per_second": 32.756, "step": 73000 }, { "epoch": 28.15028901734104, "grad_norm": 0.3514813184738159, "learning_rate": 8.740038535645472e-05, "loss": 0.33975814819335937, "step": 73050 }, { "epoch": 28.15028901734104, "eval_loss": 0.40760722756385803, "eval_runtime": 12.1149, "eval_samples_per_second": 1568.647, "eval_steps_per_second": 32.687, "step": 73050 }, { "epoch": 28.16955684007707, "grad_norm": 0.3665297329425812, "learning_rate": 8.73233140655106e-05, "loss": 0.34500713348388673, "step": 73100 }, { "epoch": 28.16955684007707, "eval_loss": 0.40724608302116394, "eval_runtime": 12.1193, "eval_samples_per_second": 1568.079, "eval_steps_per_second": 32.675, "step": 73100 }, { "epoch": 28.1888246628131, "grad_norm": 0.3459983766078949, "learning_rate": 8.724624277456648e-05, "loss": 0.3347904586791992, "step": 73150 }, { "epoch": 28.1888246628131, "eval_loss": 0.4201650321483612, "eval_runtime": 12.177, "eval_samples_per_second": 1560.651, "eval_steps_per_second": 32.52, "step": 73150 }, { "epoch": 28.208092485549134, "grad_norm": 0.37623170018196106, "learning_rate": 8.716917148362236e-05, "loss": 0.33902797698974607, "step": 73200 }, { "epoch": 28.208092485549134, "eval_loss": 0.41943246126174927, "eval_runtime": 12.0975, "eval_samples_per_second": 1570.907, "eval_steps_per_second": 32.734, "step": 73200 }, { "epoch": 28.227360308285164, "grad_norm": 0.39677706360816956, "learning_rate": 8.709210019267822e-05, "loss": 0.344732666015625, "step": 73250 }, { "epoch": 28.227360308285164, "eval_loss": 0.41330793499946594, "eval_runtime": 12.1813, "eval_samples_per_second": 1560.102, "eval_steps_per_second": 32.509, "step": 73250 }, { "epoch": 28.246628131021193, "grad_norm": 0.35347869992256165, "learning_rate": 8.701502890173411e-05, "loss": 0.3414713668823242, "step": 73300 }, { "epoch": 28.246628131021193, "eval_loss": 0.40561792254447937, "eval_runtime": 12.0984, "eval_samples_per_second": 1570.783, "eval_steps_per_second": 32.732, "step": 73300 }, { "epoch": 28.265895953757227, "grad_norm": 0.3541305363178253, "learning_rate": 8.693795761078999e-05, "loss": 0.3429085159301758, "step": 73350 }, { "epoch": 28.265895953757227, "eval_loss": 0.40455254912376404, "eval_runtime": 12.1752, "eval_samples_per_second": 1560.881, "eval_steps_per_second": 32.525, "step": 73350 }, { "epoch": 28.285163776493256, "grad_norm": 0.3626526892185211, "learning_rate": 8.686088631984586e-05, "loss": 0.339837532043457, "step": 73400 }, { "epoch": 28.285163776493256, "eval_loss": 0.40132611989974976, "eval_runtime": 12.1102, "eval_samples_per_second": 1569.255, "eval_steps_per_second": 32.7, "step": 73400 }, { "epoch": 28.304431599229286, "grad_norm": 0.3971756398677826, "learning_rate": 8.678381502890173e-05, "loss": 0.34123638153076175, "step": 73450 }, { "epoch": 28.304431599229286, "eval_loss": 0.4071372151374817, "eval_runtime": 12.1296, "eval_samples_per_second": 1566.743, "eval_steps_per_second": 32.647, "step": 73450 }, { "epoch": 28.32369942196532, "grad_norm": 0.3365832567214966, "learning_rate": 8.670674373795762e-05, "loss": 0.339866943359375, "step": 73500 }, { "epoch": 28.32369942196532, "eval_loss": 0.4053739905357361, "eval_runtime": 12.2427, "eval_samples_per_second": 1552.27, "eval_steps_per_second": 32.346, "step": 73500 }, { "epoch": 28.34296724470135, "grad_norm": 0.29607510566711426, "learning_rate": 8.662967244701349e-05, "loss": 0.3413586807250977, "step": 73550 }, { "epoch": 28.34296724470135, "eval_loss": 0.3981591463088989, "eval_runtime": 12.2699, "eval_samples_per_second": 1548.836, "eval_steps_per_second": 32.274, "step": 73550 }, { "epoch": 28.36223506743738, "grad_norm": 0.3336181342601776, "learning_rate": 8.655260115606936e-05, "loss": 0.3427694702148438, "step": 73600 }, { "epoch": 28.36223506743738, "eval_loss": 0.40682151913642883, "eval_runtime": 12.2601, "eval_samples_per_second": 1550.073, "eval_steps_per_second": 32.3, "step": 73600 }, { "epoch": 28.38150289017341, "grad_norm": 0.34709376096725464, "learning_rate": 8.647552986512525e-05, "loss": 0.3375228500366211, "step": 73650 }, { "epoch": 28.38150289017341, "eval_loss": 0.39792823791503906, "eval_runtime": 12.186, "eval_samples_per_second": 1559.491, "eval_steps_per_second": 32.496, "step": 73650 }, { "epoch": 28.40077071290944, "grad_norm": 0.3873053193092346, "learning_rate": 8.639845857418112e-05, "loss": 0.33988964080810546, "step": 73700 }, { "epoch": 28.40077071290944, "eval_loss": 0.41531920433044434, "eval_runtime": 12.2728, "eval_samples_per_second": 1548.463, "eval_steps_per_second": 32.266, "step": 73700 }, { "epoch": 28.42003853564547, "grad_norm": 0.3640563488006592, "learning_rate": 8.6321387283237e-05, "loss": 0.34104312896728517, "step": 73750 }, { "epoch": 28.42003853564547, "eval_loss": 0.4088253080844879, "eval_runtime": 12.2812, "eval_samples_per_second": 1547.41, "eval_steps_per_second": 32.245, "step": 73750 }, { "epoch": 28.439306358381504, "grad_norm": 0.30980661511421204, "learning_rate": 8.624431599229288e-05, "loss": 0.3326974105834961, "step": 73800 }, { "epoch": 28.439306358381504, "eval_loss": 0.4063643217086792, "eval_runtime": 12.2934, "eval_samples_per_second": 1545.871, "eval_steps_per_second": 32.212, "step": 73800 }, { "epoch": 28.458574181117534, "grad_norm": 0.3265276551246643, "learning_rate": 8.616724470134875e-05, "loss": 0.3412283325195313, "step": 73850 }, { "epoch": 28.458574181117534, "eval_loss": 0.4039968252182007, "eval_runtime": 12.1, "eval_samples_per_second": 1570.585, "eval_steps_per_second": 32.727, "step": 73850 }, { "epoch": 28.477842003853564, "grad_norm": 0.31497660279273987, "learning_rate": 8.609017341040463e-05, "loss": 0.33476226806640624, "step": 73900 }, { "epoch": 28.477842003853564, "eval_loss": 0.407135009765625, "eval_runtime": 12.1353, "eval_samples_per_second": 1566.008, "eval_steps_per_second": 32.632, "step": 73900 }, { "epoch": 28.497109826589597, "grad_norm": 0.34809163212776184, "learning_rate": 8.60131021194605e-05, "loss": 0.3363214111328125, "step": 73950 }, { "epoch": 28.497109826589597, "eval_loss": 0.40078043937683105, "eval_runtime": 12.1712, "eval_samples_per_second": 1561.385, "eval_steps_per_second": 32.536, "step": 73950 }, { "epoch": 28.516377649325626, "grad_norm": 0.3526594638824463, "learning_rate": 8.593603082851637e-05, "loss": 0.33877574920654296, "step": 74000 }, { "epoch": 28.516377649325626, "eval_loss": 0.3962794840335846, "eval_runtime": 12.1007, "eval_samples_per_second": 1570.493, "eval_steps_per_second": 32.725, "step": 74000 }, { "epoch": 28.535645472061656, "grad_norm": 0.3257293701171875, "learning_rate": 8.585895953757226e-05, "loss": 0.34083568572998046, "step": 74050 }, { "epoch": 28.535645472061656, "eval_loss": 0.40493643283843994, "eval_runtime": 12.194, "eval_samples_per_second": 1558.475, "eval_steps_per_second": 32.475, "step": 74050 }, { "epoch": 28.55491329479769, "grad_norm": 0.35710665583610535, "learning_rate": 8.578188824662814e-05, "loss": 0.3411405944824219, "step": 74100 }, { "epoch": 28.55491329479769, "eval_loss": 0.4035206437110901, "eval_runtime": 12.2323, "eval_samples_per_second": 1553.596, "eval_steps_per_second": 32.373, "step": 74100 }, { "epoch": 28.57418111753372, "grad_norm": 0.3962242007255554, "learning_rate": 8.570481695568402e-05, "loss": 0.3374639129638672, "step": 74150 }, { "epoch": 28.57418111753372, "eval_loss": 0.40450605750083923, "eval_runtime": 12.0998, "eval_samples_per_second": 1570.6, "eval_steps_per_second": 32.728, "step": 74150 }, { "epoch": 28.59344894026975, "grad_norm": 0.36148691177368164, "learning_rate": 8.562774566473989e-05, "loss": 0.3372167205810547, "step": 74200 }, { "epoch": 28.59344894026975, "eval_loss": 0.3990635573863983, "eval_runtime": 12.1603, "eval_samples_per_second": 1562.792, "eval_steps_per_second": 32.565, "step": 74200 }, { "epoch": 28.612716763005782, "grad_norm": 0.36583447456359863, "learning_rate": 8.555067437379577e-05, "loss": 0.3383594512939453, "step": 74250 }, { "epoch": 28.612716763005782, "eval_loss": 0.4084060490131378, "eval_runtime": 12.2339, "eval_samples_per_second": 1553.388, "eval_steps_per_second": 32.369, "step": 74250 }, { "epoch": 28.63198458574181, "grad_norm": 0.3991263508796692, "learning_rate": 8.547360308285164e-05, "loss": 0.3431509017944336, "step": 74300 }, { "epoch": 28.63198458574181, "eval_loss": 0.40451762080192566, "eval_runtime": 12.101, "eval_samples_per_second": 1570.446, "eval_steps_per_second": 32.725, "step": 74300 }, { "epoch": 28.65125240847784, "grad_norm": 0.3188799023628235, "learning_rate": 8.539653179190751e-05, "loss": 0.3357038879394531, "step": 74350 }, { "epoch": 28.65125240847784, "eval_loss": 0.4108193516731262, "eval_runtime": 12.1047, "eval_samples_per_second": 1569.965, "eval_steps_per_second": 32.714, "step": 74350 }, { "epoch": 28.670520231213874, "grad_norm": 0.31139424443244934, "learning_rate": 8.53194605009634e-05, "loss": 0.3399533462524414, "step": 74400 }, { "epoch": 28.670520231213874, "eval_loss": 0.40622013807296753, "eval_runtime": 12.1189, "eval_samples_per_second": 1568.135, "eval_steps_per_second": 32.676, "step": 74400 }, { "epoch": 28.689788053949904, "grad_norm": 0.34679368138313293, "learning_rate": 8.524238921001927e-05, "loss": 0.34214717864990235, "step": 74450 }, { "epoch": 28.689788053949904, "eval_loss": 0.40086260437965393, "eval_runtime": 12.267, "eval_samples_per_second": 1549.191, "eval_steps_per_second": 32.282, "step": 74450 }, { "epoch": 28.709055876685934, "grad_norm": 0.3473329246044159, "learning_rate": 8.516531791907516e-05, "loss": 0.34495399475097654, "step": 74500 }, { "epoch": 28.709055876685934, "eval_loss": 0.4092850387096405, "eval_runtime": 12.2814, "eval_samples_per_second": 1547.379, "eval_steps_per_second": 32.244, "step": 74500 }, { "epoch": 28.728323699421964, "grad_norm": 0.3568156659603119, "learning_rate": 8.508824662813103e-05, "loss": 0.34430732727050783, "step": 74550 }, { "epoch": 28.728323699421964, "eval_loss": 0.4015388488769531, "eval_runtime": 12.1204, "eval_samples_per_second": 1567.938, "eval_steps_per_second": 32.672, "step": 74550 }, { "epoch": 28.747591522157997, "grad_norm": 0.35718047618865967, "learning_rate": 8.50111753371869e-05, "loss": 0.33965484619140623, "step": 74600 }, { "epoch": 28.747591522157997, "eval_loss": 0.40743741393089294, "eval_runtime": 12.1769, "eval_samples_per_second": 1560.654, "eval_steps_per_second": 32.52, "step": 74600 }, { "epoch": 28.766859344894026, "grad_norm": 0.34835973381996155, "learning_rate": 8.493410404624278e-05, "loss": 0.3399014663696289, "step": 74650 }, { "epoch": 28.766859344894026, "eval_loss": 0.4040972888469696, "eval_runtime": 12.2319, "eval_samples_per_second": 1553.641, "eval_steps_per_second": 32.374, "step": 74650 }, { "epoch": 28.786127167630056, "grad_norm": 0.35528451204299927, "learning_rate": 8.485703275529865e-05, "loss": 0.3398328399658203, "step": 74700 }, { "epoch": 28.786127167630056, "eval_loss": 0.40803733468055725, "eval_runtime": 12.2507, "eval_samples_per_second": 1551.252, "eval_steps_per_second": 32.325, "step": 74700 }, { "epoch": 28.80539499036609, "grad_norm": 0.36740756034851074, "learning_rate": 8.477996146435453e-05, "loss": 0.3432450866699219, "step": 74750 }, { "epoch": 28.80539499036609, "eval_loss": 0.40036237239837646, "eval_runtime": 12.2118, "eval_samples_per_second": 1556.205, "eval_steps_per_second": 32.428, "step": 74750 }, { "epoch": 28.82466281310212, "grad_norm": 0.3492307662963867, "learning_rate": 8.470289017341041e-05, "loss": 0.34101158142089844, "step": 74800 }, { "epoch": 28.82466281310212, "eval_loss": 0.40532955527305603, "eval_runtime": 12.2178, "eval_samples_per_second": 1555.436, "eval_steps_per_second": 32.412, "step": 74800 }, { "epoch": 28.84393063583815, "grad_norm": 0.3296574056148529, "learning_rate": 8.46258188824663e-05, "loss": 0.3352030181884766, "step": 74850 }, { "epoch": 28.84393063583815, "eval_loss": 0.4040343761444092, "eval_runtime": 12.2184, "eval_samples_per_second": 1555.365, "eval_steps_per_second": 32.41, "step": 74850 }, { "epoch": 28.863198458574182, "grad_norm": 0.3253469169139862, "learning_rate": 8.454874759152215e-05, "loss": 0.341695556640625, "step": 74900 }, { "epoch": 28.863198458574182, "eval_loss": 0.3968691825866699, "eval_runtime": 12.2046, "eval_samples_per_second": 1557.12, "eval_steps_per_second": 32.447, "step": 74900 }, { "epoch": 28.88246628131021, "grad_norm": 0.33987531065940857, "learning_rate": 8.447167630057804e-05, "loss": 0.33235916137695315, "step": 74950 }, { "epoch": 28.88246628131021, "eval_loss": 0.40201306343078613, "eval_runtime": 12.2584, "eval_samples_per_second": 1550.29, "eval_steps_per_second": 32.304, "step": 74950 }, { "epoch": 28.90173410404624, "grad_norm": 0.2947417199611664, "learning_rate": 8.439460500963392e-05, "loss": 0.3370682907104492, "step": 75000 }, { "epoch": 28.90173410404624, "eval_loss": 0.4052457809448242, "eval_runtime": 12.268, "eval_samples_per_second": 1549.069, "eval_steps_per_second": 32.279, "step": 75000 }, { "epoch": 28.921001926782274, "grad_norm": 0.3588947355747223, "learning_rate": 8.43175337186898e-05, "loss": 0.33859275817871093, "step": 75050 }, { "epoch": 28.921001926782274, "eval_loss": 0.402864933013916, "eval_runtime": 12.2387, "eval_samples_per_second": 1552.778, "eval_steps_per_second": 32.356, "step": 75050 }, { "epoch": 28.940269749518304, "grad_norm": 0.3480064570903778, "learning_rate": 8.424046242774567e-05, "loss": 0.33539901733398436, "step": 75100 }, { "epoch": 28.940269749518304, "eval_loss": 0.40438100695610046, "eval_runtime": 12.2679, "eval_samples_per_second": 1549.087, "eval_steps_per_second": 32.279, "step": 75100 }, { "epoch": 28.959537572254334, "grad_norm": 0.3147982656955719, "learning_rate": 8.416339113680155e-05, "loss": 0.3388772201538086, "step": 75150 }, { "epoch": 28.959537572254334, "eval_loss": 0.40164658427238464, "eval_runtime": 12.3928, "eval_samples_per_second": 1533.473, "eval_steps_per_second": 31.954, "step": 75150 }, { "epoch": 28.978805394990367, "grad_norm": 0.33464354276657104, "learning_rate": 8.408631984585742e-05, "loss": 0.3379730987548828, "step": 75200 }, { "epoch": 28.978805394990367, "eval_loss": 0.4051615297794342, "eval_runtime": 12.2557, "eval_samples_per_second": 1550.63, "eval_steps_per_second": 32.312, "step": 75200 }, { "epoch": 28.998073217726397, "grad_norm": 0.36251458525657654, "learning_rate": 8.40092485549133e-05, "loss": 0.34375804901123047, "step": 75250 }, { "epoch": 28.998073217726397, "eval_loss": 0.3973506987094879, "eval_runtime": 12.0912, "eval_samples_per_second": 1571.718, "eval_steps_per_second": 32.751, "step": 75250 }, { "epoch": 29.017341040462426, "grad_norm": 0.41536593437194824, "learning_rate": 8.393217726396918e-05, "loss": 0.3345018768310547, "step": 75300 }, { "epoch": 29.017341040462426, "eval_loss": 0.4031812846660614, "eval_runtime": 12.0928, "eval_samples_per_second": 1571.519, "eval_steps_per_second": 32.747, "step": 75300 }, { "epoch": 29.03660886319846, "grad_norm": 0.3709567189216614, "learning_rate": 8.385510597302505e-05, "loss": 0.34657310485839843, "step": 75350 }, { "epoch": 29.03660886319846, "eval_loss": 0.40909937024116516, "eval_runtime": 12.1101, "eval_samples_per_second": 1569.272, "eval_steps_per_second": 32.7, "step": 75350 }, { "epoch": 29.05587668593449, "grad_norm": 0.3898402154445648, "learning_rate": 8.377803468208093e-05, "loss": 0.3375400924682617, "step": 75400 }, { "epoch": 29.05587668593449, "eval_loss": 0.4083695709705353, "eval_runtime": 12.1237, "eval_samples_per_second": 1567.511, "eval_steps_per_second": 32.663, "step": 75400 }, { "epoch": 29.07514450867052, "grad_norm": 0.3070099353790283, "learning_rate": 8.37009633911368e-05, "loss": 0.3369541549682617, "step": 75450 }, { "epoch": 29.07514450867052, "eval_loss": 0.40919581055641174, "eval_runtime": 12.1027, "eval_samples_per_second": 1570.222, "eval_steps_per_second": 32.72, "step": 75450 }, { "epoch": 29.094412331406552, "grad_norm": 0.32743772864341736, "learning_rate": 8.362389210019268e-05, "loss": 0.3396988677978516, "step": 75500 }, { "epoch": 29.094412331406552, "eval_loss": 0.40457454323768616, "eval_runtime": 12.1016, "eval_samples_per_second": 1570.368, "eval_steps_per_second": 32.723, "step": 75500 }, { "epoch": 29.113680154142582, "grad_norm": 0.3568827509880066, "learning_rate": 8.354682080924856e-05, "loss": 0.337619743347168, "step": 75550 }, { "epoch": 29.113680154142582, "eval_loss": 0.39772024750709534, "eval_runtime": 12.1322, "eval_samples_per_second": 1566.411, "eval_steps_per_second": 32.64, "step": 75550 }, { "epoch": 29.13294797687861, "grad_norm": 0.3511202931404114, "learning_rate": 8.346974951830445e-05, "loss": 0.3357983016967773, "step": 75600 }, { "epoch": 29.13294797687861, "eval_loss": 0.4034055769443512, "eval_runtime": 12.112, "eval_samples_per_second": 1569.024, "eval_steps_per_second": 32.695, "step": 75600 }, { "epoch": 29.152215799614645, "grad_norm": 0.3642117381095886, "learning_rate": 8.33926782273603e-05, "loss": 0.342579345703125, "step": 75650 }, { "epoch": 29.152215799614645, "eval_loss": 0.40739530324935913, "eval_runtime": 12.2622, "eval_samples_per_second": 1549.806, "eval_steps_per_second": 32.294, "step": 75650 }, { "epoch": 29.171483622350674, "grad_norm": 0.33608829975128174, "learning_rate": 8.331560693641619e-05, "loss": 0.34283668518066407, "step": 75700 }, { "epoch": 29.171483622350674, "eval_loss": 0.4016248881816864, "eval_runtime": 12.4485, "eval_samples_per_second": 1526.612, "eval_steps_per_second": 31.811, "step": 75700 }, { "epoch": 29.190751445086704, "grad_norm": 0.31334346532821655, "learning_rate": 8.323853564547207e-05, "loss": 0.3373906707763672, "step": 75750 }, { "epoch": 29.190751445086704, "eval_loss": 0.39489105343818665, "eval_runtime": 12.1214, "eval_samples_per_second": 1567.8, "eval_steps_per_second": 32.669, "step": 75750 }, { "epoch": 29.210019267822737, "grad_norm": 0.3143196403980255, "learning_rate": 8.316146435452795e-05, "loss": 0.3415828704833984, "step": 75800 }, { "epoch": 29.210019267822737, "eval_loss": 0.3993748724460602, "eval_runtime": 12.2794, "eval_samples_per_second": 1547.634, "eval_steps_per_second": 32.249, "step": 75800 }, { "epoch": 29.229287090558767, "grad_norm": 0.3821955919265747, "learning_rate": 8.308439306358382e-05, "loss": 0.3374503707885742, "step": 75850 }, { "epoch": 29.229287090558767, "eval_loss": 0.4018915295600891, "eval_runtime": 12.2258, "eval_samples_per_second": 1554.42, "eval_steps_per_second": 32.391, "step": 75850 }, { "epoch": 29.248554913294797, "grad_norm": 0.37379252910614014, "learning_rate": 8.30073217726397e-05, "loss": 0.338868408203125, "step": 75900 }, { "epoch": 29.248554913294797, "eval_loss": 0.39820611476898193, "eval_runtime": 12.2838, "eval_samples_per_second": 1547.078, "eval_steps_per_second": 32.238, "step": 75900 }, { "epoch": 29.26782273603083, "grad_norm": 0.38983428478240967, "learning_rate": 8.293025048169557e-05, "loss": 0.3412132263183594, "step": 75950 }, { "epoch": 29.26782273603083, "eval_loss": 0.40245193243026733, "eval_runtime": 12.1585, "eval_samples_per_second": 1563.022, "eval_steps_per_second": 32.57, "step": 75950 }, { "epoch": 29.28709055876686, "grad_norm": 0.36237165331840515, "learning_rate": 8.285317919075144e-05, "loss": 0.33799495697021487, "step": 76000 }, { "epoch": 29.28709055876686, "eval_loss": 0.4036724269390106, "eval_runtime": 12.2155, "eval_samples_per_second": 1555.727, "eval_steps_per_second": 32.418, "step": 76000 }, { "epoch": 29.30635838150289, "grad_norm": 0.3307145833969116, "learning_rate": 8.277610789980732e-05, "loss": 0.339770622253418, "step": 76050 }, { "epoch": 29.30635838150289, "eval_loss": 0.39507389068603516, "eval_runtime": 12.1919, "eval_samples_per_second": 1558.746, "eval_steps_per_second": 32.481, "step": 76050 }, { "epoch": 29.325626204238922, "grad_norm": 0.30640560388565063, "learning_rate": 8.26990366088632e-05, "loss": 0.34053466796875, "step": 76100 }, { "epoch": 29.325626204238922, "eval_loss": 0.3994506299495697, "eval_runtime": 12.2295, "eval_samples_per_second": 1553.947, "eval_steps_per_second": 32.381, "step": 76100 }, { "epoch": 29.344894026974952, "grad_norm": 0.32951030135154724, "learning_rate": 8.262196531791909e-05, "loss": 0.336102294921875, "step": 76150 }, { "epoch": 29.344894026974952, "eval_loss": 0.3984861373901367, "eval_runtime": 12.2286, "eval_samples_per_second": 1554.061, "eval_steps_per_second": 32.383, "step": 76150 }, { "epoch": 29.36416184971098, "grad_norm": 0.3466298580169678, "learning_rate": 8.254489402697496e-05, "loss": 0.3357891845703125, "step": 76200 }, { "epoch": 29.36416184971098, "eval_loss": 0.39193016290664673, "eval_runtime": 12.2185, "eval_samples_per_second": 1555.341, "eval_steps_per_second": 32.41, "step": 76200 }, { "epoch": 29.383429672447015, "grad_norm": 0.36966007947921753, "learning_rate": 8.246782273603083e-05, "loss": 0.34476242065429685, "step": 76250 }, { "epoch": 29.383429672447015, "eval_loss": 0.4013426899909973, "eval_runtime": 12.2119, "eval_samples_per_second": 1556.184, "eval_steps_per_second": 32.427, "step": 76250 }, { "epoch": 29.402697495183045, "grad_norm": 0.3589341640472412, "learning_rate": 8.239075144508671e-05, "loss": 0.34361915588378905, "step": 76300 }, { "epoch": 29.402697495183045, "eval_loss": 0.398499459028244, "eval_runtime": 12.2141, "eval_samples_per_second": 1555.908, "eval_steps_per_second": 32.422, "step": 76300 }, { "epoch": 29.421965317919074, "grad_norm": 0.34493348002433777, "learning_rate": 8.231368015414258e-05, "loss": 0.33458736419677737, "step": 76350 }, { "epoch": 29.421965317919074, "eval_loss": 0.39613667130470276, "eval_runtime": 12.1864, "eval_samples_per_second": 1559.446, "eval_steps_per_second": 32.495, "step": 76350 }, { "epoch": 29.441233140655108, "grad_norm": 0.3703666031360626, "learning_rate": 8.223660886319846e-05, "loss": 0.33679203033447264, "step": 76400 }, { "epoch": 29.441233140655108, "eval_loss": 0.3904256224632263, "eval_runtime": 12.2068, "eval_samples_per_second": 1556.832, "eval_steps_per_second": 32.441, "step": 76400 }, { "epoch": 29.460500963391137, "grad_norm": 0.3583824336528778, "learning_rate": 8.215953757225434e-05, "loss": 0.34144466400146484, "step": 76450 }, { "epoch": 29.460500963391137, "eval_loss": 0.4040822684764862, "eval_runtime": 12.1419, "eval_samples_per_second": 1565.157, "eval_steps_per_second": 32.614, "step": 76450 }, { "epoch": 29.479768786127167, "grad_norm": 0.32129788398742676, "learning_rate": 8.208246628131021e-05, "loss": 0.3366130828857422, "step": 76500 }, { "epoch": 29.479768786127167, "eval_loss": 0.397727906703949, "eval_runtime": 12.1228, "eval_samples_per_second": 1567.63, "eval_steps_per_second": 32.666, "step": 76500 }, { "epoch": 29.4990366088632, "grad_norm": 0.31789708137512207, "learning_rate": 8.20053949903661e-05, "loss": 0.3358024597167969, "step": 76550 }, { "epoch": 29.4990366088632, "eval_loss": 0.40461546182632446, "eval_runtime": 12.2692, "eval_samples_per_second": 1548.922, "eval_steps_per_second": 32.276, "step": 76550 }, { "epoch": 29.51830443159923, "grad_norm": 0.3582072854042053, "learning_rate": 8.192832369942197e-05, "loss": 0.34053627014160154, "step": 76600 }, { "epoch": 29.51830443159923, "eval_loss": 0.3956148624420166, "eval_runtime": 12.2737, "eval_samples_per_second": 1548.354, "eval_steps_per_second": 32.264, "step": 76600 }, { "epoch": 29.53757225433526, "grad_norm": 0.3688930571079254, "learning_rate": 8.185125240847784e-05, "loss": 0.3372396087646484, "step": 76650 }, { "epoch": 29.53757225433526, "eval_loss": 0.403920978307724, "eval_runtime": 12.2547, "eval_samples_per_second": 1550.747, "eval_steps_per_second": 32.314, "step": 76650 }, { "epoch": 29.556840077071293, "grad_norm": 0.32262521982192993, "learning_rate": 8.177418111753372e-05, "loss": 0.33971923828125, "step": 76700 }, { "epoch": 29.556840077071293, "eval_loss": 0.4012478291988373, "eval_runtime": 12.2901, "eval_samples_per_second": 1546.285, "eval_steps_per_second": 32.221, "step": 76700 }, { "epoch": 29.576107899807322, "grad_norm": 0.3442867696285248, "learning_rate": 8.16971098265896e-05, "loss": 0.33692138671875, "step": 76750 }, { "epoch": 29.576107899807322, "eval_loss": 0.40071719884872437, "eval_runtime": 12.1249, "eval_samples_per_second": 1567.348, "eval_steps_per_second": 32.66, "step": 76750 }, { "epoch": 29.595375722543352, "grad_norm": 0.3229323625564575, "learning_rate": 8.162003853564547e-05, "loss": 0.3420280456542969, "step": 76800 }, { "epoch": 29.595375722543352, "eval_loss": 0.3957239091396332, "eval_runtime": 12.1315, "eval_samples_per_second": 1566.499, "eval_steps_per_second": 32.642, "step": 76800 }, { "epoch": 29.614643545279385, "grad_norm": 0.3567691445350647, "learning_rate": 8.154296724470135e-05, "loss": 0.3430733108520508, "step": 76850 }, { "epoch": 29.614643545279385, "eval_loss": 0.4028620421886444, "eval_runtime": 12.1155, "eval_samples_per_second": 1568.565, "eval_steps_per_second": 32.685, "step": 76850 }, { "epoch": 29.633911368015415, "grad_norm": 0.3480564057826996, "learning_rate": 8.146589595375724e-05, "loss": 0.3327045440673828, "step": 76900 }, { "epoch": 29.633911368015415, "eval_loss": 0.39998650550842285, "eval_runtime": 12.1387, "eval_samples_per_second": 1565.574, "eval_steps_per_second": 32.623, "step": 76900 }, { "epoch": 29.653179190751445, "grad_norm": 0.37636005878448486, "learning_rate": 8.13888246628131e-05, "loss": 0.3388992691040039, "step": 76950 }, { "epoch": 29.653179190751445, "eval_loss": 0.40690168738365173, "eval_runtime": 12.1357, "eval_samples_per_second": 1565.962, "eval_steps_per_second": 32.631, "step": 76950 }, { "epoch": 29.672447013487474, "grad_norm": 0.3563169538974762, "learning_rate": 8.131175337186898e-05, "loss": 0.3361124801635742, "step": 77000 }, { "epoch": 29.672447013487474, "eval_loss": 0.39062345027923584, "eval_runtime": 12.1354, "eval_samples_per_second": 1565.996, "eval_steps_per_second": 32.632, "step": 77000 }, { "epoch": 29.691714836223507, "grad_norm": 0.3892887234687805, "learning_rate": 8.123468208092486e-05, "loss": 0.3335068511962891, "step": 77050 }, { "epoch": 29.691714836223507, "eval_loss": 0.3967650532722473, "eval_runtime": 12.1129, "eval_samples_per_second": 1568.91, "eval_steps_per_second": 32.693, "step": 77050 }, { "epoch": 29.710982658959537, "grad_norm": 0.3354916572570801, "learning_rate": 8.115761078998074e-05, "loss": 0.34214469909667966, "step": 77100 }, { "epoch": 29.710982658959537, "eval_loss": 0.3983604907989502, "eval_runtime": 12.1929, "eval_samples_per_second": 1558.606, "eval_steps_per_second": 32.478, "step": 77100 }, { "epoch": 29.730250481695567, "grad_norm": 0.36883726716041565, "learning_rate": 8.108053949903661e-05, "loss": 0.33639251708984375, "step": 77150 }, { "epoch": 29.730250481695567, "eval_loss": 0.3981660306453705, "eval_runtime": 12.3006, "eval_samples_per_second": 1544.96, "eval_steps_per_second": 32.193, "step": 77150 }, { "epoch": 29.7495183044316, "grad_norm": 0.3186633586883545, "learning_rate": 8.100346820809249e-05, "loss": 0.3405615997314453, "step": 77200 }, { "epoch": 29.7495183044316, "eval_loss": 0.4013603627681732, "eval_runtime": 12.2752, "eval_samples_per_second": 1548.163, "eval_steps_per_second": 32.26, "step": 77200 }, { "epoch": 29.76878612716763, "grad_norm": 0.33433860540390015, "learning_rate": 8.092639691714836e-05, "loss": 0.34316627502441405, "step": 77250 }, { "epoch": 29.76878612716763, "eval_loss": 0.3944407105445862, "eval_runtime": 12.2537, "eval_samples_per_second": 1550.883, "eval_steps_per_second": 32.317, "step": 77250 }, { "epoch": 29.78805394990366, "grad_norm": 0.38801100850105286, "learning_rate": 8.084932562620425e-05, "loss": 0.33468536376953123, "step": 77300 }, { "epoch": 29.78805394990366, "eval_loss": 0.39183300733566284, "eval_runtime": 12.2587, "eval_samples_per_second": 1550.243, "eval_steps_per_second": 32.304, "step": 77300 }, { "epoch": 29.807321772639693, "grad_norm": 0.3587905764579773, "learning_rate": 8.077225433526012e-05, "loss": 0.33674217224121095, "step": 77350 }, { "epoch": 29.807321772639693, "eval_loss": 0.40156689286231995, "eval_runtime": 12.3814, "eval_samples_per_second": 1534.878, "eval_steps_per_second": 31.983, "step": 77350 }, { "epoch": 29.826589595375722, "grad_norm": 0.32864800095558167, "learning_rate": 8.069518304431599e-05, "loss": 0.3385050964355469, "step": 77400 }, { "epoch": 29.826589595375722, "eval_loss": 0.39360493421554565, "eval_runtime": 12.263, "eval_samples_per_second": 1549.706, "eval_steps_per_second": 32.292, "step": 77400 }, { "epoch": 29.845857418111752, "grad_norm": 0.3370286822319031, "learning_rate": 8.061811175337188e-05, "loss": 0.33882041931152346, "step": 77450 }, { "epoch": 29.845857418111752, "eval_loss": 0.3934619724750519, "eval_runtime": 12.2574, "eval_samples_per_second": 1550.405, "eval_steps_per_second": 32.307, "step": 77450 }, { "epoch": 29.865125240847785, "grad_norm": 0.3959228992462158, "learning_rate": 8.054104046242775e-05, "loss": 0.33527217864990233, "step": 77500 }, { "epoch": 29.865125240847785, "eval_loss": 0.40505313873291016, "eval_runtime": 12.2752, "eval_samples_per_second": 1548.16, "eval_steps_per_second": 32.26, "step": 77500 }, { "epoch": 29.884393063583815, "grad_norm": 0.3105311691761017, "learning_rate": 8.046396917148362e-05, "loss": 0.3402052307128906, "step": 77550 }, { "epoch": 29.884393063583815, "eval_loss": 0.3973555564880371, "eval_runtime": 12.2683, "eval_samples_per_second": 1549.036, "eval_steps_per_second": 32.278, "step": 77550 }, { "epoch": 29.903660886319845, "grad_norm": 0.33539679646492004, "learning_rate": 8.03868978805395e-05, "loss": 0.3371730422973633, "step": 77600 }, { "epoch": 29.903660886319845, "eval_loss": 0.39878925681114197, "eval_runtime": 12.2559, "eval_samples_per_second": 1550.597, "eval_steps_per_second": 32.311, "step": 77600 }, { "epoch": 29.922928709055878, "grad_norm": 0.34858375787734985, "learning_rate": 8.030982658959539e-05, "loss": 0.3363747406005859, "step": 77650 }, { "epoch": 29.922928709055878, "eval_loss": 0.3987490236759186, "eval_runtime": 12.2619, "eval_samples_per_second": 1549.845, "eval_steps_per_second": 32.295, "step": 77650 }, { "epoch": 29.942196531791907, "grad_norm": 0.3784034848213196, "learning_rate": 8.023275529865125e-05, "loss": 0.33827449798583986, "step": 77700 }, { "epoch": 29.942196531791907, "eval_loss": 0.3972572386264801, "eval_runtime": 12.244, "eval_samples_per_second": 1552.112, "eval_steps_per_second": 32.342, "step": 77700 }, { "epoch": 29.961464354527937, "grad_norm": 0.3775968551635742, "learning_rate": 8.015568400770713e-05, "loss": 0.33659881591796875, "step": 77750 }, { "epoch": 29.961464354527937, "eval_loss": 0.3942483067512512, "eval_runtime": 12.2309, "eval_samples_per_second": 1553.771, "eval_steps_per_second": 32.377, "step": 77750 }, { "epoch": 29.98073217726397, "grad_norm": 0.32215413451194763, "learning_rate": 8.007861271676302e-05, "loss": 0.33448402404785155, "step": 77800 }, { "epoch": 29.98073217726397, "eval_loss": 0.39629849791526794, "eval_runtime": 12.1301, "eval_samples_per_second": 1566.682, "eval_steps_per_second": 32.646, "step": 77800 }, { "epoch": 30.0, "grad_norm": 0.32509124279022217, "learning_rate": 8.000154142581889e-05, "loss": 0.33391098022460936, "step": 77850 }, { "epoch": 30.0, "eval_loss": 0.40078866481781006, "eval_runtime": 12.1762, "eval_samples_per_second": 1560.745, "eval_steps_per_second": 32.522, "step": 77850 }, { "epoch": 30.01926782273603, "grad_norm": 0.3141268789768219, "learning_rate": 7.992447013487476e-05, "loss": 0.3358293151855469, "step": 77900 }, { "epoch": 30.01926782273603, "eval_loss": 0.3999428451061249, "eval_runtime": 12.2134, "eval_samples_per_second": 1556.001, "eval_steps_per_second": 32.424, "step": 77900 }, { "epoch": 30.038535645472063, "grad_norm": 0.31997787952423096, "learning_rate": 7.984739884393064e-05, "loss": 0.3365086364746094, "step": 77950 }, { "epoch": 30.038535645472063, "eval_loss": 0.41002345085144043, "eval_runtime": 12.1528, "eval_samples_per_second": 1563.75, "eval_steps_per_second": 32.585, "step": 77950 }, { "epoch": 30.057803468208093, "grad_norm": 0.36958038806915283, "learning_rate": 7.977032755298651e-05, "loss": 0.33802482604980466, "step": 78000 }, { "epoch": 30.057803468208093, "eval_loss": 0.39623013138771057, "eval_runtime": 12.1351, "eval_samples_per_second": 1566.032, "eval_steps_per_second": 32.633, "step": 78000 }, { "epoch": 30.077071290944122, "grad_norm": 0.33436909317970276, "learning_rate": 7.969325626204239e-05, "loss": 0.3400865173339844, "step": 78050 }, { "epoch": 30.077071290944122, "eval_loss": 0.3966422379016876, "eval_runtime": 12.1095, "eval_samples_per_second": 1569.349, "eval_steps_per_second": 32.702, "step": 78050 }, { "epoch": 30.096339113680155, "grad_norm": 0.3612748086452484, "learning_rate": 7.961618497109827e-05, "loss": 0.33430622100830076, "step": 78100 }, { "epoch": 30.096339113680155, "eval_loss": 0.39065003395080566, "eval_runtime": 12.1136, "eval_samples_per_second": 1568.809, "eval_steps_per_second": 32.69, "step": 78100 }, { "epoch": 30.115606936416185, "grad_norm": 0.35375291109085083, "learning_rate": 7.953911368015414e-05, "loss": 0.3334471130371094, "step": 78150 }, { "epoch": 30.115606936416185, "eval_loss": 0.39156579971313477, "eval_runtime": 12.1171, "eval_samples_per_second": 1568.362, "eval_steps_per_second": 32.681, "step": 78150 }, { "epoch": 30.134874759152215, "grad_norm": 0.3348279297351837, "learning_rate": 7.946204238921003e-05, "loss": 0.3424004364013672, "step": 78200 }, { "epoch": 30.134874759152215, "eval_loss": 0.3937307894229889, "eval_runtime": 12.1147, "eval_samples_per_second": 1568.676, "eval_steps_per_second": 32.688, "step": 78200 }, { "epoch": 30.154142581888248, "grad_norm": 0.34247830510139465, "learning_rate": 7.93849710982659e-05, "loss": 0.33740226745605467, "step": 78250 }, { "epoch": 30.154142581888248, "eval_loss": 0.3976731300354004, "eval_runtime": 12.2119, "eval_samples_per_second": 1556.181, "eval_steps_per_second": 32.427, "step": 78250 }, { "epoch": 30.173410404624278, "grad_norm": 0.37805888056755066, "learning_rate": 7.930789980732177e-05, "loss": 0.33345382690429687, "step": 78300 }, { "epoch": 30.173410404624278, "eval_loss": 0.3878285586833954, "eval_runtime": 12.1405, "eval_samples_per_second": 1565.344, "eval_steps_per_second": 32.618, "step": 78300 }, { "epoch": 30.192678227360307, "grad_norm": 0.4086238443851471, "learning_rate": 7.923082851637765e-05, "loss": 0.33750850677490235, "step": 78350 }, { "epoch": 30.192678227360307, "eval_loss": 0.4093362092971802, "eval_runtime": 12.1374, "eval_samples_per_second": 1565.739, "eval_steps_per_second": 32.626, "step": 78350 }, { "epoch": 30.21194605009634, "grad_norm": 0.36569342017173767, "learning_rate": 7.915375722543354e-05, "loss": 0.3310520172119141, "step": 78400 }, { "epoch": 30.21194605009634, "eval_loss": 0.39482930302619934, "eval_runtime": 12.1328, "eval_samples_per_second": 1566.332, "eval_steps_per_second": 32.639, "step": 78400 }, { "epoch": 30.23121387283237, "grad_norm": 0.3322986662387848, "learning_rate": 7.90766859344894e-05, "loss": 0.33495746612548827, "step": 78450 }, { "epoch": 30.23121387283237, "eval_loss": 0.3956666886806488, "eval_runtime": 12.1251, "eval_samples_per_second": 1567.321, "eval_steps_per_second": 32.659, "step": 78450 }, { "epoch": 30.2504816955684, "grad_norm": 0.3476741313934326, "learning_rate": 7.899961464354528e-05, "loss": 0.334678955078125, "step": 78500 }, { "epoch": 30.2504816955684, "eval_loss": 0.40333569049835205, "eval_runtime": 12.1793, "eval_samples_per_second": 1560.353, "eval_steps_per_second": 32.514, "step": 78500 }, { "epoch": 30.269749518304433, "grad_norm": 0.3494560122489929, "learning_rate": 7.892254335260117e-05, "loss": 0.3376890563964844, "step": 78550 }, { "epoch": 30.269749518304433, "eval_loss": 0.4037102460861206, "eval_runtime": 12.1972, "eval_samples_per_second": 1558.066, "eval_steps_per_second": 32.467, "step": 78550 }, { "epoch": 30.289017341040463, "grad_norm": 0.43575745820999146, "learning_rate": 7.884547206165704e-05, "loss": 0.33312847137451174, "step": 78600 }, { "epoch": 30.289017341040463, "eval_loss": 0.3945438265800476, "eval_runtime": 12.1342, "eval_samples_per_second": 1566.153, "eval_steps_per_second": 32.635, "step": 78600 }, { "epoch": 30.308285163776493, "grad_norm": 0.29940304160118103, "learning_rate": 7.876840077071291e-05, "loss": 0.3339963912963867, "step": 78650 }, { "epoch": 30.308285163776493, "eval_loss": 0.39802664518356323, "eval_runtime": 12.2767, "eval_samples_per_second": 1547.976, "eval_steps_per_second": 32.256, "step": 78650 }, { "epoch": 30.327552986512526, "grad_norm": 0.3045285642147064, "learning_rate": 7.86913294797688e-05, "loss": 0.33359962463378906, "step": 78700 }, { "epoch": 30.327552986512526, "eval_loss": 0.39610549807548523, "eval_runtime": 12.1592, "eval_samples_per_second": 1562.937, "eval_steps_per_second": 32.568, "step": 78700 }, { "epoch": 30.346820809248555, "grad_norm": 0.35362133383750916, "learning_rate": 7.861425818882467e-05, "loss": 0.34049957275390624, "step": 78750 }, { "epoch": 30.346820809248555, "eval_loss": 0.3979499042034149, "eval_runtime": 12.0264, "eval_samples_per_second": 1580.188, "eval_steps_per_second": 32.928, "step": 78750 }, { "epoch": 30.366088631984585, "grad_norm": 0.31190043687820435, "learning_rate": 7.853718689788054e-05, "loss": 0.33647064208984373, "step": 78800 }, { "epoch": 30.366088631984585, "eval_loss": 0.4047558009624481, "eval_runtime": 11.9956, "eval_samples_per_second": 1584.247, "eval_steps_per_second": 33.012, "step": 78800 }, { "epoch": 30.38535645472062, "grad_norm": 0.3496329188346863, "learning_rate": 7.846011560693642e-05, "loss": 0.33419708251953123, "step": 78850 }, { "epoch": 30.38535645472062, "eval_loss": 0.39307811856269836, "eval_runtime": 11.9946, "eval_samples_per_second": 1584.382, "eval_steps_per_second": 33.015, "step": 78850 }, { "epoch": 30.404624277456648, "grad_norm": 0.3291316032409668, "learning_rate": 7.83830443159923e-05, "loss": 0.3332817840576172, "step": 78900 }, { "epoch": 30.404624277456648, "eval_loss": 0.39601850509643555, "eval_runtime": 11.9829, "eval_samples_per_second": 1585.925, "eval_steps_per_second": 33.047, "step": 78900 }, { "epoch": 30.423892100192678, "grad_norm": 0.31855300068855286, "learning_rate": 7.830597302504818e-05, "loss": 0.3363607406616211, "step": 78950 }, { "epoch": 30.423892100192678, "eval_loss": 0.39654678106307983, "eval_runtime": 12.0064, "eval_samples_per_second": 1582.816, "eval_steps_per_second": 32.982, "step": 78950 }, { "epoch": 30.443159922928707, "grad_norm": 0.31431058049201965, "learning_rate": 7.822890173410405e-05, "loss": 0.3351900863647461, "step": 79000 }, { "epoch": 30.443159922928707, "eval_loss": 0.39453360438346863, "eval_runtime": 12.0049, "eval_samples_per_second": 1583.017, "eval_steps_per_second": 32.986, "step": 79000 }, { "epoch": 30.46242774566474, "grad_norm": 0.37528693675994873, "learning_rate": 7.815183044315992e-05, "loss": 0.3345530700683594, "step": 79050 }, { "epoch": 30.46242774566474, "eval_loss": 0.3950311243534088, "eval_runtime": 12.0116, "eval_samples_per_second": 1582.135, "eval_steps_per_second": 32.968, "step": 79050 }, { "epoch": 30.48169556840077, "grad_norm": 0.298917293548584, "learning_rate": 7.80747591522158e-05, "loss": 0.33680248260498047, "step": 79100 }, { "epoch": 30.48169556840077, "eval_loss": 0.3966728746891022, "eval_runtime": 12.0088, "eval_samples_per_second": 1582.508, "eval_steps_per_second": 32.976, "step": 79100 }, { "epoch": 30.5009633911368, "grad_norm": 0.3466651439666748, "learning_rate": 7.799768786127169e-05, "loss": 0.3407196807861328, "step": 79150 }, { "epoch": 30.5009633911368, "eval_loss": 0.39678579568862915, "eval_runtime": 11.993, "eval_samples_per_second": 1584.594, "eval_steps_per_second": 33.019, "step": 79150 }, { "epoch": 30.520231213872833, "grad_norm": 0.3450862467288971, "learning_rate": 7.792061657032755e-05, "loss": 0.3338542175292969, "step": 79200 }, { "epoch": 30.520231213872833, "eval_loss": 0.39303213357925415, "eval_runtime": 12.1509, "eval_samples_per_second": 1563.994, "eval_steps_per_second": 32.59, "step": 79200 }, { "epoch": 30.539499036608863, "grad_norm": 0.356312096118927, "learning_rate": 7.784354527938343e-05, "loss": 0.337673454284668, "step": 79250 }, { "epoch": 30.539499036608863, "eval_loss": 0.39701399207115173, "eval_runtime": 12.3678, "eval_samples_per_second": 1536.573, "eval_steps_per_second": 32.019, "step": 79250 }, { "epoch": 30.558766859344892, "grad_norm": 0.3645452857017517, "learning_rate": 7.776647398843932e-05, "loss": 0.3345222473144531, "step": 79300 }, { "epoch": 30.558766859344892, "eval_loss": 0.39528223872184753, "eval_runtime": 12.2057, "eval_samples_per_second": 1556.982, "eval_steps_per_second": 32.444, "step": 79300 }, { "epoch": 30.578034682080926, "grad_norm": 0.31102579832077026, "learning_rate": 7.768940269749519e-05, "loss": 0.33252300262451173, "step": 79350 }, { "epoch": 30.578034682080926, "eval_loss": 0.39212682843208313, "eval_runtime": 12.3573, "eval_samples_per_second": 1537.88, "eval_steps_per_second": 32.046, "step": 79350 }, { "epoch": 30.597302504816955, "grad_norm": 0.3202425241470337, "learning_rate": 7.761233140655106e-05, "loss": 0.3356368637084961, "step": 79400 }, { "epoch": 30.597302504816955, "eval_loss": 0.39177006483078003, "eval_runtime": 12.3342, "eval_samples_per_second": 1540.753, "eval_steps_per_second": 32.106, "step": 79400 }, { "epoch": 30.616570327552985, "grad_norm": 0.29129624366760254, "learning_rate": 7.753526011560695e-05, "loss": 0.3417876434326172, "step": 79450 }, { "epoch": 30.616570327552985, "eval_loss": 0.393098384141922, "eval_runtime": 12.2315, "eval_samples_per_second": 1553.696, "eval_steps_per_second": 32.375, "step": 79450 }, { "epoch": 30.63583815028902, "grad_norm": 0.35155218839645386, "learning_rate": 7.745818882466282e-05, "loss": 0.32990322113037107, "step": 79500 }, { "epoch": 30.63583815028902, "eval_loss": 0.38657593727111816, "eval_runtime": 12.251, "eval_samples_per_second": 1551.218, "eval_steps_per_second": 32.324, "step": 79500 }, { "epoch": 30.655105973025048, "grad_norm": 0.4029949903488159, "learning_rate": 7.738111753371869e-05, "loss": 0.33605422973632815, "step": 79550 }, { "epoch": 30.655105973025048, "eval_loss": 0.39461612701416016, "eval_runtime": 12.2536, "eval_samples_per_second": 1550.894, "eval_steps_per_second": 32.317, "step": 79550 }, { "epoch": 30.674373795761078, "grad_norm": 0.35093769431114197, "learning_rate": 7.730404624277457e-05, "loss": 0.3401282501220703, "step": 79600 }, { "epoch": 30.674373795761078, "eval_loss": 0.3965562880039215, "eval_runtime": 12.2563, "eval_samples_per_second": 1550.556, "eval_steps_per_second": 32.31, "step": 79600 }, { "epoch": 30.69364161849711, "grad_norm": 0.3528108298778534, "learning_rate": 7.722697495183045e-05, "loss": 0.33603816986083984, "step": 79650 }, { "epoch": 30.69364161849711, "eval_loss": 0.40417003631591797, "eval_runtime": 12.4231, "eval_samples_per_second": 1529.728, "eval_steps_per_second": 31.876, "step": 79650 }, { "epoch": 30.71290944123314, "grad_norm": 0.33754968643188477, "learning_rate": 7.714990366088633e-05, "loss": 0.33375083923339843, "step": 79700 }, { "epoch": 30.71290944123314, "eval_loss": 0.38841933012008667, "eval_runtime": 12.0602, "eval_samples_per_second": 1575.757, "eval_steps_per_second": 32.835, "step": 79700 }, { "epoch": 30.73217726396917, "grad_norm": 0.37452858686447144, "learning_rate": 7.70728323699422e-05, "loss": 0.33378837585449217, "step": 79750 }, { "epoch": 30.73217726396917, "eval_loss": 0.3938928544521332, "eval_runtime": 12.036, "eval_samples_per_second": 1578.925, "eval_steps_per_second": 32.901, "step": 79750 }, { "epoch": 30.751445086705203, "grad_norm": 0.3228905200958252, "learning_rate": 7.699576107899807e-05, "loss": 0.3368485641479492, "step": 79800 }, { "epoch": 30.751445086705203, "eval_loss": 0.3963688611984253, "eval_runtime": 12.6966, "eval_samples_per_second": 1496.782, "eval_steps_per_second": 31.19, "step": 79800 }, { "epoch": 30.770712909441233, "grad_norm": 0.3379043936729431, "learning_rate": 7.691868978805396e-05, "loss": 0.3377945709228516, "step": 79850 }, { "epoch": 30.770712909441233, "eval_loss": 0.3923089802265167, "eval_runtime": 12.1659, "eval_samples_per_second": 1562.066, "eval_steps_per_second": 32.55, "step": 79850 }, { "epoch": 30.789980732177263, "grad_norm": 0.31663915514945984, "learning_rate": 7.684161849710983e-05, "loss": 0.33276130676269533, "step": 79900 }, { "epoch": 30.789980732177263, "eval_loss": 0.40273723006248474, "eval_runtime": 12.212, "eval_samples_per_second": 1556.172, "eval_steps_per_second": 32.427, "step": 79900 }, { "epoch": 30.809248554913296, "grad_norm": 0.34666216373443604, "learning_rate": 7.67645472061657e-05, "loss": 0.3313359832763672, "step": 79950 }, { "epoch": 30.809248554913296, "eval_loss": 0.39420267939567566, "eval_runtime": 12.3025, "eval_samples_per_second": 1544.722, "eval_steps_per_second": 32.188, "step": 79950 }, { "epoch": 30.828516377649326, "grad_norm": 0.32805973291397095, "learning_rate": 7.668747591522159e-05, "loss": 0.3355695724487305, "step": 80000 }, { "epoch": 30.828516377649326, "eval_loss": 0.3985747694969177, "eval_runtime": 12.0698, "eval_samples_per_second": 1574.507, "eval_steps_per_second": 32.809, "step": 80000 }, { "epoch": 30.847784200385355, "grad_norm": 0.33398300409317017, "learning_rate": 7.661040462427747e-05, "loss": 0.3364617538452148, "step": 80050 }, { "epoch": 30.847784200385355, "eval_loss": 0.39853447675704956, "eval_runtime": 12.2491, "eval_samples_per_second": 1551.462, "eval_steps_per_second": 32.329, "step": 80050 }, { "epoch": 30.86705202312139, "grad_norm": 0.3591051399707794, "learning_rate": 7.653333333333333e-05, "loss": 0.33254112243652345, "step": 80100 }, { "epoch": 30.86705202312139, "eval_loss": 0.3914082944393158, "eval_runtime": 12.3611, "eval_samples_per_second": 1537.401, "eval_steps_per_second": 32.036, "step": 80100 }, { "epoch": 30.886319845857418, "grad_norm": 0.34924015402793884, "learning_rate": 7.645626204238921e-05, "loss": 0.33654228210449216, "step": 80150 }, { "epoch": 30.886319845857418, "eval_loss": 0.3880145251750946, "eval_runtime": 12.8786, "eval_samples_per_second": 1475.622, "eval_steps_per_second": 30.749, "step": 80150 }, { "epoch": 30.905587668593448, "grad_norm": 0.347432404756546, "learning_rate": 7.63791907514451e-05, "loss": 0.3363762664794922, "step": 80200 }, { "epoch": 30.905587668593448, "eval_loss": 0.3967089354991913, "eval_runtime": 12.0625, "eval_samples_per_second": 1575.455, "eval_steps_per_second": 32.829, "step": 80200 }, { "epoch": 30.92485549132948, "grad_norm": 0.359576016664505, "learning_rate": 7.630211946050097e-05, "loss": 0.3385428237915039, "step": 80250 }, { "epoch": 30.92485549132948, "eval_loss": 0.39151743054389954, "eval_runtime": 12.3004, "eval_samples_per_second": 1544.995, "eval_steps_per_second": 32.194, "step": 80250 }, { "epoch": 30.94412331406551, "grad_norm": 0.36698511242866516, "learning_rate": 7.622504816955684e-05, "loss": 0.33210254669189454, "step": 80300 }, { "epoch": 30.94412331406551, "eval_loss": 0.394904226064682, "eval_runtime": 12.4079, "eval_samples_per_second": 1531.61, "eval_steps_per_second": 31.915, "step": 80300 }, { "epoch": 30.96339113680154, "grad_norm": 0.2749645709991455, "learning_rate": 7.614797687861273e-05, "loss": 0.33860298156738283, "step": 80350 }, { "epoch": 30.96339113680154, "eval_loss": 0.39184460043907166, "eval_runtime": 12.2525, "eval_samples_per_second": 1551.034, "eval_steps_per_second": 32.32, "step": 80350 }, { "epoch": 30.982658959537574, "grad_norm": 0.43100059032440186, "learning_rate": 7.60709055876686e-05, "loss": 0.3343136978149414, "step": 80400 }, { "epoch": 30.982658959537574, "eval_loss": 0.3927250802516937, "eval_runtime": 12.1614, "eval_samples_per_second": 1562.652, "eval_steps_per_second": 32.562, "step": 80400 }, { "epoch": 31.001926782273603, "grad_norm": 0.35989582538604736, "learning_rate": 7.599383429672448e-05, "loss": 0.33394905090332033, "step": 80450 }, { "epoch": 31.001926782273603, "eval_loss": 0.3948439657688141, "eval_runtime": 12.4409, "eval_samples_per_second": 1527.542, "eval_steps_per_second": 31.83, "step": 80450 }, { "epoch": 31.021194605009633, "grad_norm": 0.3679141402244568, "learning_rate": 7.591676300578035e-05, "loss": 0.3343206024169922, "step": 80500 }, { "epoch": 31.021194605009633, "eval_loss": 0.3927876651287079, "eval_runtime": 12.227, "eval_samples_per_second": 1554.266, "eval_steps_per_second": 32.387, "step": 80500 }, { "epoch": 31.040462427745666, "grad_norm": 0.34307408332824707, "learning_rate": 7.583969171483622e-05, "loss": 0.3322888946533203, "step": 80550 }, { "epoch": 31.040462427745666, "eval_loss": 0.39470410346984863, "eval_runtime": 12.4606, "eval_samples_per_second": 1525.128, "eval_steps_per_second": 31.78, "step": 80550 }, { "epoch": 31.059730250481696, "grad_norm": 0.2723255157470703, "learning_rate": 7.576262042389211e-05, "loss": 0.3314684295654297, "step": 80600 }, { "epoch": 31.059730250481696, "eval_loss": 0.39921268820762634, "eval_runtime": 12.2612, "eval_samples_per_second": 1549.925, "eval_steps_per_second": 32.297, "step": 80600 }, { "epoch": 31.078998073217726, "grad_norm": 0.3640703558921814, "learning_rate": 7.568554913294798e-05, "loss": 0.33681087493896483, "step": 80650 }, { "epoch": 31.078998073217726, "eval_loss": 0.3941943943500519, "eval_runtime": 12.051, "eval_samples_per_second": 1576.96, "eval_steps_per_second": 32.86, "step": 80650 }, { "epoch": 31.09826589595376, "grad_norm": 0.30020853877067566, "learning_rate": 7.560847784200385e-05, "loss": 0.33399856567382813, "step": 80700 }, { "epoch": 31.09826589595376, "eval_loss": 0.3958126902580261, "eval_runtime": 12.5425, "eval_samples_per_second": 1515.167, "eval_steps_per_second": 31.573, "step": 80700 }, { "epoch": 31.11753371868979, "grad_norm": 0.3685876727104187, "learning_rate": 7.553140655105974e-05, "loss": 0.3349112319946289, "step": 80750 }, { "epoch": 31.11753371868979, "eval_loss": 0.3932817578315735, "eval_runtime": 12.3668, "eval_samples_per_second": 1536.689, "eval_steps_per_second": 32.021, "step": 80750 }, { "epoch": 31.136801541425818, "grad_norm": 0.3284311890602112, "learning_rate": 7.545433526011561e-05, "loss": 0.3380855941772461, "step": 80800 }, { "epoch": 31.136801541425818, "eval_loss": 0.39650848507881165, "eval_runtime": 12.2423, "eval_samples_per_second": 1552.32, "eval_steps_per_second": 32.347, "step": 80800 }, { "epoch": 31.15606936416185, "grad_norm": 0.33151042461395264, "learning_rate": 7.537726396917148e-05, "loss": 0.33093292236328126, "step": 80850 }, { "epoch": 31.15606936416185, "eval_loss": 0.3906618654727936, "eval_runtime": 12.1048, "eval_samples_per_second": 1569.952, "eval_steps_per_second": 32.714, "step": 80850 }, { "epoch": 31.17533718689788, "grad_norm": 0.32439085841178894, "learning_rate": 7.530019267822736e-05, "loss": 0.33047252655029297, "step": 80900 }, { "epoch": 31.17533718689788, "eval_loss": 0.389200896024704, "eval_runtime": 12.4233, "eval_samples_per_second": 1529.71, "eval_steps_per_second": 31.876, "step": 80900 }, { "epoch": 31.19460500963391, "grad_norm": 0.3703368604183197, "learning_rate": 7.522312138728324e-05, "loss": 0.33789913177490233, "step": 80950 }, { "epoch": 31.19460500963391, "eval_loss": 0.38897258043289185, "eval_runtime": 12.3379, "eval_samples_per_second": 1540.289, "eval_steps_per_second": 32.096, "step": 80950 }, { "epoch": 31.213872832369944, "grad_norm": 0.37909552454948425, "learning_rate": 7.514605009633912e-05, "loss": 0.33433372497558594, "step": 81000 }, { "epoch": 31.213872832369944, "eval_loss": 0.3873302936553955, "eval_runtime": 12.3642, "eval_samples_per_second": 1537.016, "eval_steps_per_second": 32.028, "step": 81000 }, { "epoch": 31.233140655105974, "grad_norm": 0.33231836557388306, "learning_rate": 7.506897880539499e-05, "loss": 0.33316516876220703, "step": 81050 }, { "epoch": 31.233140655105974, "eval_loss": 0.3933020532131195, "eval_runtime": 12.4368, "eval_samples_per_second": 1528.05, "eval_steps_per_second": 31.841, "step": 81050 }, { "epoch": 31.252408477842003, "grad_norm": 0.32074564695358276, "learning_rate": 7.499190751445086e-05, "loss": 0.3318569946289063, "step": 81100 }, { "epoch": 31.252408477842003, "eval_loss": 0.3947520852088928, "eval_runtime": 12.4089, "eval_samples_per_second": 1531.483, "eval_steps_per_second": 31.913, "step": 81100 }, { "epoch": 31.271676300578033, "grad_norm": 0.3266116976737976, "learning_rate": 7.491483622350675e-05, "loss": 0.3360607147216797, "step": 81150 }, { "epoch": 31.271676300578033, "eval_loss": 0.38842105865478516, "eval_runtime": 12.6075, "eval_samples_per_second": 1507.355, "eval_steps_per_second": 31.41, "step": 81150 }, { "epoch": 31.290944123314066, "grad_norm": 0.36575672030448914, "learning_rate": 7.483776493256263e-05, "loss": 0.3317249298095703, "step": 81200 }, { "epoch": 31.290944123314066, "eval_loss": 0.389698326587677, "eval_runtime": 12.4172, "eval_samples_per_second": 1530.456, "eval_steps_per_second": 31.891, "step": 81200 }, { "epoch": 31.310211946050096, "grad_norm": 0.35568854212760925, "learning_rate": 7.476069364161849e-05, "loss": 0.33496803283691406, "step": 81250 }, { "epoch": 31.310211946050096, "eval_loss": 0.386882483959198, "eval_runtime": 12.3444, "eval_samples_per_second": 1539.478, "eval_steps_per_second": 32.079, "step": 81250 }, { "epoch": 31.329479768786126, "grad_norm": 0.37098228931427, "learning_rate": 7.468362235067438e-05, "loss": 0.3372187042236328, "step": 81300 }, { "epoch": 31.329479768786126, "eval_loss": 0.38705065846443176, "eval_runtime": 12.1726, "eval_samples_per_second": 1561.216, "eval_steps_per_second": 32.532, "step": 81300 }, { "epoch": 31.34874759152216, "grad_norm": 0.34937584400177, "learning_rate": 7.460655105973026e-05, "loss": 0.33677894592285157, "step": 81350 }, { "epoch": 31.34874759152216, "eval_loss": 0.3930945098400116, "eval_runtime": 12.4197, "eval_samples_per_second": 1530.154, "eval_steps_per_second": 31.885, "step": 81350 }, { "epoch": 31.36801541425819, "grad_norm": 0.35550403594970703, "learning_rate": 7.452947976878613e-05, "loss": 0.33909748077392576, "step": 81400 }, { "epoch": 31.36801541425819, "eval_loss": 0.3928523063659668, "eval_runtime": 12.2008, "eval_samples_per_second": 1557.602, "eval_steps_per_second": 32.457, "step": 81400 }, { "epoch": 31.387283236994218, "grad_norm": 0.31252729892730713, "learning_rate": 7.4452408477842e-05, "loss": 0.3349456787109375, "step": 81450 }, { "epoch": 31.387283236994218, "eval_loss": 0.3922406733036041, "eval_runtime": 12.4408, "eval_samples_per_second": 1527.555, "eval_steps_per_second": 31.831, "step": 81450 }, { "epoch": 31.40655105973025, "grad_norm": 0.35789743065834045, "learning_rate": 7.437533718689789e-05, "loss": 0.328759651184082, "step": 81500 }, { "epoch": 31.40655105973025, "eval_loss": 0.3815910816192627, "eval_runtime": 12.2732, "eval_samples_per_second": 1548.411, "eval_steps_per_second": 32.265, "step": 81500 }, { "epoch": 31.42581888246628, "grad_norm": 0.3162961006164551, "learning_rate": 7.429826589595376e-05, "loss": 0.3326981353759766, "step": 81550 }, { "epoch": 31.42581888246628, "eval_loss": 0.39257243275642395, "eval_runtime": 12.3941, "eval_samples_per_second": 1533.313, "eval_steps_per_second": 31.951, "step": 81550 }, { "epoch": 31.44508670520231, "grad_norm": 0.3173085153102875, "learning_rate": 7.422119460500963e-05, "loss": 0.3304977035522461, "step": 81600 }, { "epoch": 31.44508670520231, "eval_loss": 0.3942854106426239, "eval_runtime": 12.4231, "eval_samples_per_second": 1529.73, "eval_steps_per_second": 31.876, "step": 81600 }, { "epoch": 31.464354527938344, "grad_norm": 0.3353341519832611, "learning_rate": 7.414412331406552e-05, "loss": 0.3350178146362305, "step": 81650 }, { "epoch": 31.464354527938344, "eval_loss": 0.39619171619415283, "eval_runtime": 12.3844, "eval_samples_per_second": 1534.512, "eval_steps_per_second": 31.976, "step": 81650 }, { "epoch": 31.483622350674374, "grad_norm": 0.3512631058692932, "learning_rate": 7.406705202312139e-05, "loss": 0.3329004669189453, "step": 81700 }, { "epoch": 31.483622350674374, "eval_loss": 0.39310675859451294, "eval_runtime": 12.318, "eval_samples_per_second": 1542.779, "eval_steps_per_second": 32.148, "step": 81700 }, { "epoch": 31.502890173410403, "grad_norm": 0.2977689504623413, "learning_rate": 7.398998073217727e-05, "loss": 0.3353124237060547, "step": 81750 }, { "epoch": 31.502890173410403, "eval_loss": 0.3933800756931305, "eval_runtime": 12.023, "eval_samples_per_second": 1580.631, "eval_steps_per_second": 32.937, "step": 81750 }, { "epoch": 31.522157996146436, "grad_norm": 0.35949206352233887, "learning_rate": 7.391290944123314e-05, "loss": 0.3383650207519531, "step": 81800 }, { "epoch": 31.522157996146436, "eval_loss": 0.3884298503398895, "eval_runtime": 12.2739, "eval_samples_per_second": 1548.328, "eval_steps_per_second": 32.264, "step": 81800 }, { "epoch": 31.541425818882466, "grad_norm": 0.3502868711948395, "learning_rate": 7.383583815028901e-05, "loss": 0.3327152633666992, "step": 81850 }, { "epoch": 31.541425818882466, "eval_loss": 0.38017353415489197, "eval_runtime": 12.1712, "eval_samples_per_second": 1561.396, "eval_steps_per_second": 32.536, "step": 81850 }, { "epoch": 31.560693641618496, "grad_norm": 0.2831386625766754, "learning_rate": 7.37587668593449e-05, "loss": 0.3360588836669922, "step": 81900 }, { "epoch": 31.560693641618496, "eval_loss": 0.3893136978149414, "eval_runtime": 12.2006, "eval_samples_per_second": 1557.625, "eval_steps_per_second": 32.457, "step": 81900 }, { "epoch": 31.57996146435453, "grad_norm": 0.32322245836257935, "learning_rate": 7.368169556840077e-05, "loss": 0.32995784759521485, "step": 81950 }, { "epoch": 31.57996146435453, "eval_loss": 0.3925672173500061, "eval_runtime": 12.1448, "eval_samples_per_second": 1564.789, "eval_steps_per_second": 32.607, "step": 81950 }, { "epoch": 31.59922928709056, "grad_norm": 0.38338539004325867, "learning_rate": 7.360462427745664e-05, "loss": 0.3337534713745117, "step": 82000 }, { "epoch": 31.59922928709056, "eval_loss": 0.3898797929286957, "eval_runtime": 12.6379, "eval_samples_per_second": 1503.728, "eval_steps_per_second": 31.334, "step": 82000 }, { "epoch": 31.61849710982659, "grad_norm": 0.3318467438220978, "learning_rate": 7.352755298651253e-05, "loss": 0.33032482147216796, "step": 82050 }, { "epoch": 31.61849710982659, "eval_loss": 0.3932211697101593, "eval_runtime": 12.2622, "eval_samples_per_second": 1549.799, "eval_steps_per_second": 32.294, "step": 82050 }, { "epoch": 31.63776493256262, "grad_norm": 0.33630242943763733, "learning_rate": 7.345048169556841e-05, "loss": 0.3302790832519531, "step": 82100 }, { "epoch": 31.63776493256262, "eval_loss": 0.3855394721031189, "eval_runtime": 12.2482, "eval_samples_per_second": 1551.573, "eval_steps_per_second": 32.331, "step": 82100 }, { "epoch": 31.65703275529865, "grad_norm": 0.3319029211997986, "learning_rate": 7.337341040462428e-05, "loss": 0.33577049255371094, "step": 82150 }, { "epoch": 31.65703275529865, "eval_loss": 0.39433595538139343, "eval_runtime": 12.3654, "eval_samples_per_second": 1536.866, "eval_steps_per_second": 32.025, "step": 82150 }, { "epoch": 31.67630057803468, "grad_norm": 0.41104209423065186, "learning_rate": 7.329633911368015e-05, "loss": 0.3302264022827148, "step": 82200 }, { "epoch": 31.67630057803468, "eval_loss": 0.38176703453063965, "eval_runtime": 11.9948, "eval_samples_per_second": 1584.355, "eval_steps_per_second": 33.014, "step": 82200 }, { "epoch": 31.695568400770714, "grad_norm": 0.336476594209671, "learning_rate": 7.321926782273604e-05, "loss": 0.3358791351318359, "step": 82250 }, { "epoch": 31.695568400770714, "eval_loss": 0.3902879059314728, "eval_runtime": 12.1926, "eval_samples_per_second": 1558.652, "eval_steps_per_second": 32.479, "step": 82250 }, { "epoch": 31.714836223506744, "grad_norm": 0.32354748249053955, "learning_rate": 7.314219653179191e-05, "loss": 0.32998649597167967, "step": 82300 }, { "epoch": 31.714836223506744, "eval_loss": 0.38361677527427673, "eval_runtime": 12.4109, "eval_samples_per_second": 1531.23, "eval_steps_per_second": 31.907, "step": 82300 }, { "epoch": 31.734104046242773, "grad_norm": 0.3881266117095947, "learning_rate": 7.306512524084778e-05, "loss": 0.3343693923950195, "step": 82350 }, { "epoch": 31.734104046242773, "eval_loss": 0.39176177978515625, "eval_runtime": 12.3846, "eval_samples_per_second": 1534.489, "eval_steps_per_second": 31.975, "step": 82350 }, { "epoch": 31.753371868978807, "grad_norm": 0.3368484079837799, "learning_rate": 7.298805394990367e-05, "loss": 0.33365814208984373, "step": 82400 }, { "epoch": 31.753371868978807, "eval_loss": 0.3917023241519928, "eval_runtime": 12.0172, "eval_samples_per_second": 1581.4, "eval_steps_per_second": 32.953, "step": 82400 }, { "epoch": 31.772639691714836, "grad_norm": 0.36743301153182983, "learning_rate": 7.291098265895954e-05, "loss": 0.32970603942871096, "step": 82450 }, { "epoch": 31.772639691714836, "eval_loss": 0.3887389898300171, "eval_runtime": 12.2032, "eval_samples_per_second": 1557.297, "eval_steps_per_second": 32.451, "step": 82450 }, { "epoch": 31.791907514450866, "grad_norm": 0.3367919623851776, "learning_rate": 7.283391136801542e-05, "loss": 0.3334079360961914, "step": 82500 }, { "epoch": 31.791907514450866, "eval_loss": 0.39604809880256653, "eval_runtime": 12.2559, "eval_samples_per_second": 1550.606, "eval_steps_per_second": 32.311, "step": 82500 }, { "epoch": 31.8111753371869, "grad_norm": 0.32461050152778625, "learning_rate": 7.27568400770713e-05, "loss": 0.33287490844726564, "step": 82550 }, { "epoch": 31.8111753371869, "eval_loss": 0.3971518576145172, "eval_runtime": 12.3772, "eval_samples_per_second": 1535.407, "eval_steps_per_second": 31.994, "step": 82550 }, { "epoch": 31.83044315992293, "grad_norm": 0.33348342776298523, "learning_rate": 7.267976878612717e-05, "loss": 0.3339728927612305, "step": 82600 }, { "epoch": 31.83044315992293, "eval_loss": 0.3909747898578644, "eval_runtime": 12.2083, "eval_samples_per_second": 1556.643, "eval_steps_per_second": 32.437, "step": 82600 }, { "epoch": 31.84971098265896, "grad_norm": 0.3236358165740967, "learning_rate": 7.260269749518305e-05, "loss": 0.3322230529785156, "step": 82650 }, { "epoch": 31.84971098265896, "eval_loss": 0.38940104842185974, "eval_runtime": 12.4331, "eval_samples_per_second": 1528.503, "eval_steps_per_second": 31.851, "step": 82650 }, { "epoch": 31.868978805394992, "grad_norm": 0.3306892216205597, "learning_rate": 7.252562620423892e-05, "loss": 0.33445030212402344, "step": 82700 }, { "epoch": 31.868978805394992, "eval_loss": 0.3877246677875519, "eval_runtime": 12.3172, "eval_samples_per_second": 1542.888, "eval_steps_per_second": 32.15, "step": 82700 }, { "epoch": 31.88824662813102, "grad_norm": 0.30529966950416565, "learning_rate": 7.24485549132948e-05, "loss": 0.3366144943237305, "step": 82750 }, { "epoch": 31.88824662813102, "eval_loss": 0.38707804679870605, "eval_runtime": 12.308, "eval_samples_per_second": 1544.032, "eval_steps_per_second": 32.174, "step": 82750 }, { "epoch": 31.90751445086705, "grad_norm": 0.3396291732788086, "learning_rate": 7.237148362235068e-05, "loss": 0.33801265716552736, "step": 82800 }, { "epoch": 31.90751445086705, "eval_loss": 0.38472774624824524, "eval_runtime": 12.3082, "eval_samples_per_second": 1544.006, "eval_steps_per_second": 32.174, "step": 82800 }, { "epoch": 31.926782273603084, "grad_norm": 0.29563799500465393, "learning_rate": 7.229441233140656e-05, "loss": 0.33369720458984375, "step": 82850 }, { "epoch": 31.926782273603084, "eval_loss": 0.39592254161834717, "eval_runtime": 12.2059, "eval_samples_per_second": 1556.951, "eval_steps_per_second": 32.443, "step": 82850 }, { "epoch": 31.946050096339114, "grad_norm": 0.3153001666069031, "learning_rate": 7.221734104046242e-05, "loss": 0.32941547393798826, "step": 82900 }, { "epoch": 31.946050096339114, "eval_loss": 0.3894803524017334, "eval_runtime": 12.3143, "eval_samples_per_second": 1543.243, "eval_steps_per_second": 32.158, "step": 82900 }, { "epoch": 31.965317919075144, "grad_norm": 0.39017364382743835, "learning_rate": 7.21402697495183e-05, "loss": 0.338901252746582, "step": 82950 }, { "epoch": 31.965317919075144, "eval_loss": 0.396602988243103, "eval_runtime": 12.3924, "eval_samples_per_second": 1533.515, "eval_steps_per_second": 31.955, "step": 82950 }, { "epoch": 31.984585741811177, "grad_norm": 0.3777558207511902, "learning_rate": 7.206319845857419e-05, "loss": 0.3341526412963867, "step": 83000 }, { "epoch": 31.984585741811177, "eval_loss": 0.3932454288005829, "eval_runtime": 12.2981, "eval_samples_per_second": 1545.282, "eval_steps_per_second": 32.2, "step": 83000 }, { "epoch": 32.00385356454721, "grad_norm": 0.34222671389579773, "learning_rate": 7.198612716763006e-05, "loss": 0.3342710876464844, "step": 83050 }, { "epoch": 32.00385356454721, "eval_loss": 0.39037859439849854, "eval_runtime": 12.1229, "eval_samples_per_second": 1567.612, "eval_steps_per_second": 32.665, "step": 83050 }, { "epoch": 32.02312138728324, "grad_norm": 0.3451511561870575, "learning_rate": 7.190905587668593e-05, "loss": 0.32967056274414064, "step": 83100 }, { "epoch": 32.02312138728324, "eval_loss": 0.38652628660202026, "eval_runtime": 12.216, "eval_samples_per_second": 1555.667, "eval_steps_per_second": 32.417, "step": 83100 }, { "epoch": 32.042389210019266, "grad_norm": 0.3590167164802551, "learning_rate": 7.183198458574182e-05, "loss": 0.3287253570556641, "step": 83150 }, { "epoch": 32.042389210019266, "eval_loss": 0.3912278115749359, "eval_runtime": 12.4365, "eval_samples_per_second": 1528.08, "eval_steps_per_second": 31.842, "step": 83150 }, { "epoch": 32.0616570327553, "grad_norm": 0.3654906153678894, "learning_rate": 7.175491329479769e-05, "loss": 0.3359911346435547, "step": 83200 }, { "epoch": 32.0616570327553, "eval_loss": 0.3827441930770874, "eval_runtime": 12.2477, "eval_samples_per_second": 1551.636, "eval_steps_per_second": 32.333, "step": 83200 }, { "epoch": 32.08092485549133, "grad_norm": 0.3655374348163605, "learning_rate": 7.167784200385357e-05, "loss": 0.33842662811279295, "step": 83250 }, { "epoch": 32.08092485549133, "eval_loss": 0.38642650842666626, "eval_runtime": 11.993, "eval_samples_per_second": 1584.594, "eval_steps_per_second": 33.019, "step": 83250 }, { "epoch": 32.10019267822736, "grad_norm": 0.3382033407688141, "learning_rate": 7.160077071290945e-05, "loss": 0.33293800354003905, "step": 83300 }, { "epoch": 32.10019267822736, "eval_loss": 0.38166847825050354, "eval_runtime": 12.3923, "eval_samples_per_second": 1533.538, "eval_steps_per_second": 31.955, "step": 83300 }, { "epoch": 32.11946050096339, "grad_norm": 0.40729060769081116, "learning_rate": 7.152369942196532e-05, "loss": 0.33123050689697264, "step": 83350 }, { "epoch": 32.11946050096339, "eval_loss": 0.3931393325328827, "eval_runtime": 12.3117, "eval_samples_per_second": 1543.575, "eval_steps_per_second": 32.165, "step": 83350 }, { "epoch": 32.138728323699425, "grad_norm": 0.28325581550598145, "learning_rate": 7.14466281310212e-05, "loss": 0.3317831420898438, "step": 83400 }, { "epoch": 32.138728323699425, "eval_loss": 0.38973355293273926, "eval_runtime": 12.28, "eval_samples_per_second": 1547.552, "eval_steps_per_second": 32.247, "step": 83400 }, { "epoch": 32.15799614643545, "grad_norm": 0.314932644367218, "learning_rate": 7.136955684007707e-05, "loss": 0.33860435485839846, "step": 83450 }, { "epoch": 32.15799614643545, "eval_loss": 0.38891252875328064, "eval_runtime": 12.0299, "eval_samples_per_second": 1579.732, "eval_steps_per_second": 32.918, "step": 83450 }, { "epoch": 32.177263969171484, "grad_norm": 0.33232662081718445, "learning_rate": 7.129248554913294e-05, "loss": 0.3358937835693359, "step": 83500 }, { "epoch": 32.177263969171484, "eval_loss": 0.3957799971103668, "eval_runtime": 12.2982, "eval_samples_per_second": 1545.263, "eval_steps_per_second": 32.2, "step": 83500 }, { "epoch": 32.19653179190752, "grad_norm": 0.33444711565971375, "learning_rate": 7.121541425818883e-05, "loss": 0.3356376266479492, "step": 83550 }, { "epoch": 32.19653179190752, "eval_loss": 0.38456934690475464, "eval_runtime": 12.2816, "eval_samples_per_second": 1547.361, "eval_steps_per_second": 32.243, "step": 83550 }, { "epoch": 32.215799614643544, "grad_norm": 0.3414185047149658, "learning_rate": 7.113834296724471e-05, "loss": 0.3312584686279297, "step": 83600 }, { "epoch": 32.215799614643544, "eval_loss": 0.3826345205307007, "eval_runtime": 12.3573, "eval_samples_per_second": 1537.873, "eval_steps_per_second": 32.046, "step": 83600 }, { "epoch": 32.23506743737958, "grad_norm": 0.31103435158729553, "learning_rate": 7.106127167630057e-05, "loss": 0.3382235336303711, "step": 83650 }, { "epoch": 32.23506743737958, "eval_loss": 0.38529568910598755, "eval_runtime": 12.2437, "eval_samples_per_second": 1552.142, "eval_steps_per_second": 32.343, "step": 83650 }, { "epoch": 32.25433526011561, "grad_norm": 0.32266634702682495, "learning_rate": 7.098420038535646e-05, "loss": 0.3278768920898438, "step": 83700 }, { "epoch": 32.25433526011561, "eval_loss": 0.38972991704940796, "eval_runtime": 12.3016, "eval_samples_per_second": 1544.837, "eval_steps_per_second": 32.191, "step": 83700 }, { "epoch": 32.273603082851636, "grad_norm": 0.38494542241096497, "learning_rate": 7.090712909441234e-05, "loss": 0.3333472442626953, "step": 83750 }, { "epoch": 32.273603082851636, "eval_loss": 0.390533983707428, "eval_runtime": 12.225, "eval_samples_per_second": 1554.523, "eval_steps_per_second": 32.393, "step": 83750 }, { "epoch": 32.29287090558767, "grad_norm": 0.35600781440734863, "learning_rate": 7.083005780346821e-05, "loss": 0.33366439819335936, "step": 83800 }, { "epoch": 32.29287090558767, "eval_loss": 0.3926495313644409, "eval_runtime": 12.1732, "eval_samples_per_second": 1561.133, "eval_steps_per_second": 32.53, "step": 83800 }, { "epoch": 32.3121387283237, "grad_norm": 0.3323899507522583, "learning_rate": 7.075298651252408e-05, "loss": 0.3325334167480469, "step": 83850 }, { "epoch": 32.3121387283237, "eval_loss": 0.38367557525634766, "eval_runtime": 12.3644, "eval_samples_per_second": 1536.99, "eval_steps_per_second": 32.027, "step": 83850 }, { "epoch": 32.33140655105973, "grad_norm": 0.41862767934799194, "learning_rate": 7.067591522157997e-05, "loss": 0.3350790786743164, "step": 83900 }, { "epoch": 32.33140655105973, "eval_loss": 0.3934996426105499, "eval_runtime": 12.1574, "eval_samples_per_second": 1563.159, "eval_steps_per_second": 32.573, "step": 83900 }, { "epoch": 32.35067437379576, "grad_norm": 0.3459570109844208, "learning_rate": 7.059884393063584e-05, "loss": 0.3316618347167969, "step": 83950 }, { "epoch": 32.35067437379576, "eval_loss": 0.3952227532863617, "eval_runtime": 12.2223, "eval_samples_per_second": 1554.858, "eval_steps_per_second": 32.4, "step": 83950 }, { "epoch": 32.369942196531795, "grad_norm": 0.36485931277275085, "learning_rate": 7.052177263969171e-05, "loss": 0.3253084945678711, "step": 84000 }, { "epoch": 32.369942196531795, "eval_loss": 0.3924708664417267, "eval_runtime": 12.1801, "eval_samples_per_second": 1560.256, "eval_steps_per_second": 32.512, "step": 84000 }, { "epoch": 32.38921001926782, "grad_norm": 0.3909143805503845, "learning_rate": 7.04447013487476e-05, "loss": 0.3260066986083984, "step": 84050 }, { "epoch": 32.38921001926782, "eval_loss": 0.391934335231781, "eval_runtime": 12.1699, "eval_samples_per_second": 1561.56, "eval_steps_per_second": 32.539, "step": 84050 }, { "epoch": 32.408477842003855, "grad_norm": 0.36118561029434204, "learning_rate": 7.036763005780347e-05, "loss": 0.3329648208618164, "step": 84100 }, { "epoch": 32.408477842003855, "eval_loss": 0.38808566331863403, "eval_runtime": 12.1478, "eval_samples_per_second": 1564.398, "eval_steps_per_second": 32.598, "step": 84100 }, { "epoch": 32.42774566473989, "grad_norm": 0.31038767099380493, "learning_rate": 7.029055876685935e-05, "loss": 0.3310514450073242, "step": 84150 }, { "epoch": 32.42774566473989, "eval_loss": 0.39050042629241943, "eval_runtime": 12.355, "eval_samples_per_second": 1538.16, "eval_steps_per_second": 32.052, "step": 84150 }, { "epoch": 32.447013487475914, "grad_norm": 0.34646889567375183, "learning_rate": 7.021348747591522e-05, "loss": 0.3330271911621094, "step": 84200 }, { "epoch": 32.447013487475914, "eval_loss": 0.39797794818878174, "eval_runtime": 12.2423, "eval_samples_per_second": 1552.321, "eval_steps_per_second": 32.347, "step": 84200 }, { "epoch": 32.46628131021195, "grad_norm": 0.37190306186676025, "learning_rate": 7.01364161849711e-05, "loss": 0.33200157165527344, "step": 84250 }, { "epoch": 32.46628131021195, "eval_loss": 0.38757002353668213, "eval_runtime": 12.4253, "eval_samples_per_second": 1529.463, "eval_steps_per_second": 31.871, "step": 84250 }, { "epoch": 32.48554913294798, "grad_norm": 0.304220050573349, "learning_rate": 7.005934489402698e-05, "loss": 0.3366059112548828, "step": 84300 }, { "epoch": 32.48554913294798, "eval_loss": 0.3881857097148895, "eval_runtime": 12.3544, "eval_samples_per_second": 1538.242, "eval_steps_per_second": 32.053, "step": 84300 }, { "epoch": 32.50481695568401, "grad_norm": 0.3284686207771301, "learning_rate": 6.998227360308287e-05, "loss": 0.3324573516845703, "step": 84350 }, { "epoch": 32.50481695568401, "eval_loss": 0.388376921415329, "eval_runtime": 12.218, "eval_samples_per_second": 1555.408, "eval_steps_per_second": 32.411, "step": 84350 }, { "epoch": 32.52408477842004, "grad_norm": 0.37155449390411377, "learning_rate": 6.990520231213872e-05, "loss": 0.3281977462768555, "step": 84400 }, { "epoch": 32.52408477842004, "eval_loss": 0.39317426085472107, "eval_runtime": 12.1532, "eval_samples_per_second": 1563.701, "eval_steps_per_second": 32.584, "step": 84400 }, { "epoch": 32.543352601156066, "grad_norm": 0.3963061273097992, "learning_rate": 6.982813102119461e-05, "loss": 0.3349224853515625, "step": 84450 }, { "epoch": 32.543352601156066, "eval_loss": 0.3931733965873718, "eval_runtime": 13.1523, "eval_samples_per_second": 1444.915, "eval_steps_per_second": 30.109, "step": 84450 }, { "epoch": 32.5626204238921, "grad_norm": 0.37769371271133423, "learning_rate": 6.97510597302505e-05, "loss": 0.3320587158203125, "step": 84500 }, { "epoch": 32.5626204238921, "eval_loss": 0.3889002501964569, "eval_runtime": 12.3161, "eval_samples_per_second": 1543.027, "eval_steps_per_second": 32.153, "step": 84500 }, { "epoch": 32.58188824662813, "grad_norm": 0.3914284408092499, "learning_rate": 6.967398843930637e-05, "loss": 0.33131515502929687, "step": 84550 }, { "epoch": 32.58188824662813, "eval_loss": 0.39082425832748413, "eval_runtime": 12.3059, "eval_samples_per_second": 1544.304, "eval_steps_per_second": 32.18, "step": 84550 }, { "epoch": 32.60115606936416, "grad_norm": 0.3642851412296295, "learning_rate": 6.959691714836224e-05, "loss": 0.33434425354003905, "step": 84600 }, { "epoch": 32.60115606936416, "eval_loss": 0.38643303513526917, "eval_runtime": 13.49, "eval_samples_per_second": 1408.747, "eval_steps_per_second": 29.355, "step": 84600 }, { "epoch": 32.62042389210019, "grad_norm": 0.3068494498729706, "learning_rate": 6.951984585741812e-05, "loss": 0.33064876556396483, "step": 84650 }, { "epoch": 32.62042389210019, "eval_loss": 0.39715924859046936, "eval_runtime": 12.1843, "eval_samples_per_second": 1559.712, "eval_steps_per_second": 32.501, "step": 84650 }, { "epoch": 32.639691714836225, "grad_norm": 0.381879061460495, "learning_rate": 6.944277456647399e-05, "loss": 0.33619491577148436, "step": 84700 }, { "epoch": 32.639691714836225, "eval_loss": 0.39403975009918213, "eval_runtime": 12.3026, "eval_samples_per_second": 1544.714, "eval_steps_per_second": 32.188, "step": 84700 }, { "epoch": 32.65895953757225, "grad_norm": 0.34039437770843506, "learning_rate": 6.936570327552986e-05, "loss": 0.33369949340820315, "step": 84750 }, { "epoch": 32.65895953757225, "eval_loss": 0.39233583211898804, "eval_runtime": 12.3555, "eval_samples_per_second": 1538.096, "eval_steps_per_second": 32.05, "step": 84750 }, { "epoch": 32.678227360308284, "grad_norm": 0.3397429883480072, "learning_rate": 6.928863198458575e-05, "loss": 0.3291718292236328, "step": 84800 }, { "epoch": 32.678227360308284, "eval_loss": 0.3910946846008301, "eval_runtime": 12.0832, "eval_samples_per_second": 1572.766, "eval_steps_per_second": 32.773, "step": 84800 }, { "epoch": 32.69749518304432, "grad_norm": 0.3459029495716095, "learning_rate": 6.921156069364162e-05, "loss": 0.3333112716674805, "step": 84850 }, { "epoch": 32.69749518304432, "eval_loss": 0.393562912940979, "eval_runtime": 12.2873, "eval_samples_per_second": 1546.634, "eval_steps_per_second": 32.228, "step": 84850 }, { "epoch": 32.716763005780344, "grad_norm": 0.35008564591407776, "learning_rate": 6.91344894026975e-05, "loss": 0.33231822967529295, "step": 84900 }, { "epoch": 32.716763005780344, "eval_loss": 0.38651174306869507, "eval_runtime": 12.2022, "eval_samples_per_second": 1557.418, "eval_steps_per_second": 32.453, "step": 84900 }, { "epoch": 32.73603082851638, "grad_norm": 0.31790414452552795, "learning_rate": 6.905741811175338e-05, "loss": 0.3316878128051758, "step": 84950 }, { "epoch": 32.73603082851638, "eval_loss": 0.39632076025009155, "eval_runtime": 14.1859, "eval_samples_per_second": 1339.643, "eval_steps_per_second": 27.915, "step": 84950 }, { "epoch": 32.75529865125241, "grad_norm": 0.33214712142944336, "learning_rate": 6.898034682080925e-05, "loss": 0.3273626327514648, "step": 85000 }, { "epoch": 32.75529865125241, "eval_loss": 0.3909478783607483, "eval_runtime": 12.0145, "eval_samples_per_second": 1581.751, "eval_steps_per_second": 32.96, "step": 85000 }, { "epoch": 32.774566473988436, "grad_norm": 0.36767810583114624, "learning_rate": 6.890327552986513e-05, "loss": 0.334046630859375, "step": 85050 }, { "epoch": 32.774566473988436, "eval_loss": 0.3866599500179291, "eval_runtime": 12.1593, "eval_samples_per_second": 1562.925, "eval_steps_per_second": 32.568, "step": 85050 }, { "epoch": 32.79383429672447, "grad_norm": 0.34708231687545776, "learning_rate": 6.8826204238921e-05, "loss": 0.3299650573730469, "step": 85100 }, { "epoch": 32.79383429672447, "eval_loss": 0.39845120906829834, "eval_runtime": 12.2993, "eval_samples_per_second": 1545.131, "eval_steps_per_second": 32.197, "step": 85100 }, { "epoch": 32.8131021194605, "grad_norm": 0.3029509484767914, "learning_rate": 6.874913294797688e-05, "loss": 0.32749366760253906, "step": 85150 }, { "epoch": 32.8131021194605, "eval_loss": 0.39530616998672485, "eval_runtime": 12.3184, "eval_samples_per_second": 1542.727, "eval_steps_per_second": 32.147, "step": 85150 }, { "epoch": 32.83236994219653, "grad_norm": 0.3296744227409363, "learning_rate": 6.867206165703276e-05, "loss": 0.33171417236328127, "step": 85200 }, { "epoch": 32.83236994219653, "eval_loss": 0.3916977643966675, "eval_runtime": 12.4043, "eval_samples_per_second": 1532.053, "eval_steps_per_second": 31.924, "step": 85200 }, { "epoch": 32.85163776493256, "grad_norm": 0.3107566237449646, "learning_rate": 6.859499036608865e-05, "loss": 0.33191410064697263, "step": 85250 }, { "epoch": 32.85163776493256, "eval_loss": 0.3909749984741211, "eval_runtime": 13.7405, "eval_samples_per_second": 1383.066, "eval_steps_per_second": 28.82, "step": 85250 }, { "epoch": 32.870905587668595, "grad_norm": 0.28598639369010925, "learning_rate": 6.851791907514452e-05, "loss": 0.32973854064941405, "step": 85300 }, { "epoch": 32.870905587668595, "eval_loss": 0.3997822105884552, "eval_runtime": 13.9847, "eval_samples_per_second": 1358.913, "eval_steps_per_second": 28.317, "step": 85300 }, { "epoch": 32.89017341040462, "grad_norm": 0.337296724319458, "learning_rate": 6.844084778420039e-05, "loss": 0.3333906173706055, "step": 85350 }, { "epoch": 32.89017341040462, "eval_loss": 0.3951239585876465, "eval_runtime": 12.1874, "eval_samples_per_second": 1559.314, "eval_steps_per_second": 32.493, "step": 85350 }, { "epoch": 32.909441233140655, "grad_norm": 0.2877908945083618, "learning_rate": 6.836377649325627e-05, "loss": 0.3267830276489258, "step": 85400 }, { "epoch": 32.909441233140655, "eval_loss": 0.39604440331459045, "eval_runtime": 12.2294, "eval_samples_per_second": 1553.957, "eval_steps_per_second": 32.381, "step": 85400 }, { "epoch": 32.92870905587669, "grad_norm": 0.3552136719226837, "learning_rate": 6.828670520231214e-05, "loss": 0.33112693786621095, "step": 85450 }, { "epoch": 32.92870905587669, "eval_loss": 0.39403417706489563, "eval_runtime": 12.2786, "eval_samples_per_second": 1547.736, "eval_steps_per_second": 32.251, "step": 85450 }, { "epoch": 32.947976878612714, "grad_norm": 0.36246249079704285, "learning_rate": 6.820963391136802e-05, "loss": 0.3339415740966797, "step": 85500 }, { "epoch": 32.947976878612714, "eval_loss": 0.39723631739616394, "eval_runtime": 12.3489, "eval_samples_per_second": 1538.927, "eval_steps_per_second": 32.068, "step": 85500 }, { "epoch": 32.96724470134875, "grad_norm": 0.3567337989807129, "learning_rate": 6.813256262042389e-05, "loss": 0.33196029663085935, "step": 85550 }, { "epoch": 32.96724470134875, "eval_loss": 0.4093892574310303, "eval_runtime": 12.38, "eval_samples_per_second": 1535.051, "eval_steps_per_second": 31.987, "step": 85550 }, { "epoch": 32.98651252408478, "grad_norm": 0.3128979802131653, "learning_rate": 6.805549132947977e-05, "loss": 0.33449176788330076, "step": 85600 }, { "epoch": 32.98651252408478, "eval_loss": 0.39319631457328796, "eval_runtime": 12.4794, "eval_samples_per_second": 1522.826, "eval_steps_per_second": 31.732, "step": 85600 }, { "epoch": 33.005780346820806, "grad_norm": 0.3540187180042267, "learning_rate": 6.797842003853566e-05, "loss": 0.33008087158203125, "step": 85650 }, { "epoch": 33.005780346820806, "eval_loss": 0.3856356739997864, "eval_runtime": 12.6444, "eval_samples_per_second": 1502.955, "eval_steps_per_second": 31.318, "step": 85650 }, { "epoch": 33.02504816955684, "grad_norm": 0.303708553314209, "learning_rate": 6.790134874759151e-05, "loss": 0.32581363677978514, "step": 85700 }, { "epoch": 33.02504816955684, "eval_loss": 0.40099695324897766, "eval_runtime": 12.1678, "eval_samples_per_second": 1561.827, "eval_steps_per_second": 32.545, "step": 85700 }, { "epoch": 33.04431599229287, "grad_norm": 0.3260761499404907, "learning_rate": 6.78242774566474e-05, "loss": 0.33573657989501954, "step": 85750 }, { "epoch": 33.04431599229287, "eval_loss": 0.38736164569854736, "eval_runtime": 12.4119, "eval_samples_per_second": 1531.109, "eval_steps_per_second": 31.905, "step": 85750 }, { "epoch": 33.0635838150289, "grad_norm": 0.32847580313682556, "learning_rate": 6.774720616570328e-05, "loss": 0.3356610870361328, "step": 85800 }, { "epoch": 33.0635838150289, "eval_loss": 0.3917013108730316, "eval_runtime": 12.2945, "eval_samples_per_second": 1545.735, "eval_steps_per_second": 32.21, "step": 85800 }, { "epoch": 33.08285163776493, "grad_norm": 0.31720566749572754, "learning_rate": 6.767013487475916e-05, "loss": 0.3271720504760742, "step": 85850 }, { "epoch": 33.08285163776493, "eval_loss": 0.392518013715744, "eval_runtime": 12.0016, "eval_samples_per_second": 1583.462, "eval_steps_per_second": 32.996, "step": 85850 }, { "epoch": 33.102119460500965, "grad_norm": 0.32704609632492065, "learning_rate": 6.759306358381503e-05, "loss": 0.32698806762695315, "step": 85900 }, { "epoch": 33.102119460500965, "eval_loss": 0.38945522904396057, "eval_runtime": 13.4658, "eval_samples_per_second": 1411.274, "eval_steps_per_second": 29.408, "step": 85900 }, { "epoch": 33.12138728323699, "grad_norm": 0.3069796562194824, "learning_rate": 6.751599229287091e-05, "loss": 0.3326752853393555, "step": 85950 }, { "epoch": 33.12138728323699, "eval_loss": 0.3895275294780731, "eval_runtime": 12.3882, "eval_samples_per_second": 1534.045, "eval_steps_per_second": 31.966, "step": 85950 }, { "epoch": 33.140655105973025, "grad_norm": 0.3290899991989136, "learning_rate": 6.743892100192678e-05, "loss": 0.33159957885742186, "step": 86000 }, { "epoch": 33.140655105973025, "eval_loss": 0.38596269488334656, "eval_runtime": 12.3711, "eval_samples_per_second": 1536.165, "eval_steps_per_second": 32.01, "step": 86000 }, { "epoch": 33.15992292870906, "grad_norm": 0.3325170874595642, "learning_rate": 6.736184971098265e-05, "loss": 0.3332086181640625, "step": 86050 }, { "epoch": 33.15992292870906, "eval_loss": 0.39872369170188904, "eval_runtime": 12.0999, "eval_samples_per_second": 1570.59, "eval_steps_per_second": 32.728, "step": 86050 }, { "epoch": 33.179190751445084, "grad_norm": 0.34142637252807617, "learning_rate": 6.728477842003854e-05, "loss": 0.3308460235595703, "step": 86100 }, { "epoch": 33.179190751445084, "eval_loss": 0.3966997265815735, "eval_runtime": 12.1763, "eval_samples_per_second": 1560.739, "eval_steps_per_second": 32.522, "step": 86100 }, { "epoch": 33.19845857418112, "grad_norm": 0.35238638520240784, "learning_rate": 6.720770712909441e-05, "loss": 0.33647621154785157, "step": 86150 }, { "epoch": 33.19845857418112, "eval_loss": 0.400581419467926, "eval_runtime": 12.1813, "eval_samples_per_second": 1560.094, "eval_steps_per_second": 32.509, "step": 86150 }, { "epoch": 33.21772639691715, "grad_norm": 0.32032808661460876, "learning_rate": 6.71306358381503e-05, "loss": 0.3298699188232422, "step": 86200 }, { "epoch": 33.21772639691715, "eval_loss": 0.3889765441417694, "eval_runtime": 13.5721, "eval_samples_per_second": 1400.227, "eval_steps_per_second": 29.178, "step": 86200 }, { "epoch": 33.23699421965318, "grad_norm": 0.3108430802822113, "learning_rate": 6.705356454720617e-05, "loss": 0.3299360656738281, "step": 86250 }, { "epoch": 33.23699421965318, "eval_loss": 0.393259197473526, "eval_runtime": 12.376, "eval_samples_per_second": 1535.554, "eval_steps_per_second": 31.997, "step": 86250 }, { "epoch": 33.25626204238921, "grad_norm": 0.3221990466117859, "learning_rate": 6.697649325626204e-05, "loss": 0.3326185989379883, "step": 86300 }, { "epoch": 33.25626204238921, "eval_loss": 0.39155706763267517, "eval_runtime": 12.3797, "eval_samples_per_second": 1535.098, "eval_steps_per_second": 31.988, "step": 86300 }, { "epoch": 33.27552986512524, "grad_norm": 0.3412969708442688, "learning_rate": 6.689942196531792e-05, "loss": 0.3331646728515625, "step": 86350 }, { "epoch": 33.27552986512524, "eval_loss": 0.3843606412410736, "eval_runtime": 12.2207, "eval_samples_per_second": 1555.064, "eval_steps_per_second": 32.404, "step": 86350 }, { "epoch": 33.29479768786127, "grad_norm": 0.34119340777397156, "learning_rate": 6.682235067437381e-05, "loss": 0.32963714599609373, "step": 86400 }, { "epoch": 33.29479768786127, "eval_loss": 0.3866514265537262, "eval_runtime": 12.2926, "eval_samples_per_second": 1545.964, "eval_steps_per_second": 32.214, "step": 86400 }, { "epoch": 33.3140655105973, "grad_norm": 0.3142387568950653, "learning_rate": 6.674527938342967e-05, "loss": 0.33276077270507814, "step": 86450 }, { "epoch": 33.3140655105973, "eval_loss": 0.3836846649646759, "eval_runtime": 12.3452, "eval_samples_per_second": 1539.378, "eval_steps_per_second": 32.077, "step": 86450 }, { "epoch": 33.333333333333336, "grad_norm": 0.3491765558719635, "learning_rate": 6.666820809248555e-05, "loss": 0.32747535705566405, "step": 86500 }, { "epoch": 33.333333333333336, "eval_loss": 0.3920976519584656, "eval_runtime": 12.0234, "eval_samples_per_second": 1580.583, "eval_steps_per_second": 32.936, "step": 86500 }, { "epoch": 33.35260115606936, "grad_norm": 0.31445926427841187, "learning_rate": 6.659113680154144e-05, "loss": 0.3327904510498047, "step": 86550 }, { "epoch": 33.35260115606936, "eval_loss": 0.38855254650115967, "eval_runtime": 12.4246, "eval_samples_per_second": 1529.546, "eval_steps_per_second": 31.872, "step": 86550 }, { "epoch": 33.371868978805395, "grad_norm": 0.36880192160606384, "learning_rate": 6.65140655105973e-05, "loss": 0.32740192413330077, "step": 86600 }, { "epoch": 33.371868978805395, "eval_loss": 0.39098548889160156, "eval_runtime": 12.4549, "eval_samples_per_second": 1525.822, "eval_steps_per_second": 31.795, "step": 86600 }, { "epoch": 33.39113680154143, "grad_norm": 0.3408445715904236, "learning_rate": 6.643699421965318e-05, "loss": 0.3313533020019531, "step": 86650 }, { "epoch": 33.39113680154143, "eval_loss": 0.39367181062698364, "eval_runtime": 12.328, "eval_samples_per_second": 1541.526, "eval_steps_per_second": 32.122, "step": 86650 }, { "epoch": 33.410404624277454, "grad_norm": 0.30251121520996094, "learning_rate": 6.635992292870906e-05, "loss": 0.33045440673828125, "step": 86700 }, { "epoch": 33.410404624277454, "eval_loss": 0.38703659176826477, "eval_runtime": 12.2856, "eval_samples_per_second": 1546.851, "eval_steps_per_second": 32.233, "step": 86700 }, { "epoch": 33.42967244701349, "grad_norm": 0.3319636881351471, "learning_rate": 6.628285163776493e-05, "loss": 0.3304859924316406, "step": 86750 }, { "epoch": 33.42967244701349, "eval_loss": 0.39377906918525696, "eval_runtime": 12.2043, "eval_samples_per_second": 1557.153, "eval_steps_per_second": 32.448, "step": 86750 }, { "epoch": 33.44894026974952, "grad_norm": 0.3178090751171112, "learning_rate": 6.62057803468208e-05, "loss": 0.3369068908691406, "step": 86800 }, { "epoch": 33.44894026974952, "eval_loss": 0.3841116428375244, "eval_runtime": 13.8649, "eval_samples_per_second": 1370.652, "eval_steps_per_second": 28.561, "step": 86800 }, { "epoch": 33.46820809248555, "grad_norm": 0.32858818769454956, "learning_rate": 6.612870905587669e-05, "loss": 0.3344866180419922, "step": 86850 }, { "epoch": 33.46820809248555, "eval_loss": 0.39127326011657715, "eval_runtime": 12.295, "eval_samples_per_second": 1545.666, "eval_steps_per_second": 32.208, "step": 86850 }, { "epoch": 33.48747591522158, "grad_norm": 0.34014686942100525, "learning_rate": 6.605163776493256e-05, "loss": 0.3364862823486328, "step": 86900 }, { "epoch": 33.48747591522158, "eval_loss": 0.39069321751594543, "eval_runtime": 12.2776, "eval_samples_per_second": 1547.864, "eval_steps_per_second": 32.254, "step": 86900 }, { "epoch": 33.50674373795761, "grad_norm": 0.3168788254261017, "learning_rate": 6.597456647398845e-05, "loss": 0.32663711547851565, "step": 86950 }, { "epoch": 33.50674373795761, "eval_loss": 0.3917730450630188, "eval_runtime": 12.1075, "eval_samples_per_second": 1569.611, "eval_steps_per_second": 32.707, "step": 86950 }, { "epoch": 33.52601156069364, "grad_norm": 0.3389648199081421, "learning_rate": 6.589749518304432e-05, "loss": 0.3308210754394531, "step": 87000 }, { "epoch": 33.52601156069364, "eval_loss": 0.39015376567840576, "eval_runtime": 12.334, "eval_samples_per_second": 1540.781, "eval_steps_per_second": 32.106, "step": 87000 }, { "epoch": 33.54527938342967, "grad_norm": 0.3641950190067291, "learning_rate": 6.582042389210019e-05, "loss": 0.3271903991699219, "step": 87050 }, { "epoch": 33.54527938342967, "eval_loss": 0.3925703763961792, "eval_runtime": 12.4308, "eval_samples_per_second": 1528.779, "eval_steps_per_second": 31.856, "step": 87050 }, { "epoch": 33.564547206165706, "grad_norm": 0.3484618067741394, "learning_rate": 6.574335260115607e-05, "loss": 0.32776817321777346, "step": 87100 }, { "epoch": 33.564547206165706, "eval_loss": 0.39782944321632385, "eval_runtime": 12.2514, "eval_samples_per_second": 1551.168, "eval_steps_per_second": 32.323, "step": 87100 }, { "epoch": 33.58381502890173, "grad_norm": 0.2944464385509491, "learning_rate": 6.566628131021195e-05, "loss": 0.32631984710693357, "step": 87150 }, { "epoch": 33.58381502890173, "eval_loss": 0.3974054753780365, "eval_runtime": 12.3855, "eval_samples_per_second": 1534.379, "eval_steps_per_second": 31.973, "step": 87150 }, { "epoch": 33.603082851637765, "grad_norm": 0.36574989557266235, "learning_rate": 6.558921001926782e-05, "loss": 0.3277693176269531, "step": 87200 }, { "epoch": 33.603082851637765, "eval_loss": 0.3945235013961792, "eval_runtime": 12.4071, "eval_samples_per_second": 1531.707, "eval_steps_per_second": 31.917, "step": 87200 }, { "epoch": 33.6223506743738, "grad_norm": 0.35329440236091614, "learning_rate": 6.55121387283237e-05, "loss": 0.33089752197265626, "step": 87250 }, { "epoch": 33.6223506743738, "eval_loss": 0.3936978876590729, "eval_runtime": 12.1865, "eval_samples_per_second": 1559.436, "eval_steps_per_second": 32.495, "step": 87250 }, { "epoch": 33.641618497109825, "grad_norm": 0.32796546816825867, "learning_rate": 6.543506743737959e-05, "loss": 0.33262588500976564, "step": 87300 }, { "epoch": 33.641618497109825, "eval_loss": 0.3919011354446411, "eval_runtime": 12.2642, "eval_samples_per_second": 1549.549, "eval_steps_per_second": 32.289, "step": 87300 }, { "epoch": 33.66088631984586, "grad_norm": 0.31516075134277344, "learning_rate": 6.535799614643546e-05, "loss": 0.33268089294433595, "step": 87350 }, { "epoch": 33.66088631984586, "eval_loss": 0.3898976743221283, "eval_runtime": 12.3364, "eval_samples_per_second": 1540.478, "eval_steps_per_second": 32.1, "step": 87350 }, { "epoch": 33.68015414258189, "grad_norm": 0.30553916096687317, "learning_rate": 6.528092485549133e-05, "loss": 0.33274749755859373, "step": 87400 }, { "epoch": 33.68015414258189, "eval_loss": 0.3832867443561554, "eval_runtime": 12.1089, "eval_samples_per_second": 1569.43, "eval_steps_per_second": 32.703, "step": 87400 }, { "epoch": 33.69942196531792, "grad_norm": 0.32491442561149597, "learning_rate": 6.520385356454721e-05, "loss": 0.33439323425292966, "step": 87450 }, { "epoch": 33.69942196531792, "eval_loss": 0.38546866178512573, "eval_runtime": 12.3405, "eval_samples_per_second": 1539.976, "eval_steps_per_second": 32.09, "step": 87450 }, { "epoch": 33.71868978805395, "grad_norm": 0.3326893150806427, "learning_rate": 6.512678227360309e-05, "loss": 0.33395168304443357, "step": 87500 }, { "epoch": 33.71868978805395, "eval_loss": 0.3959989845752716, "eval_runtime": 12.3679, "eval_samples_per_second": 1536.555, "eval_steps_per_second": 32.018, "step": 87500 }, { "epoch": 33.737957610789984, "grad_norm": 0.35818278789520264, "learning_rate": 6.504971098265896e-05, "loss": 0.32382308959960937, "step": 87550 }, { "epoch": 33.737957610789984, "eval_loss": 0.39477333426475525, "eval_runtime": 12.7507, "eval_samples_per_second": 1490.427, "eval_steps_per_second": 31.057, "step": 87550 }, { "epoch": 33.75722543352601, "grad_norm": 0.35273653268814087, "learning_rate": 6.497263969171484e-05, "loss": 0.3314169692993164, "step": 87600 }, { "epoch": 33.75722543352601, "eval_loss": 0.3872348368167877, "eval_runtime": 12.3973, "eval_samples_per_second": 1532.917, "eval_steps_per_second": 31.942, "step": 87600 }, { "epoch": 33.77649325626204, "grad_norm": 0.33836743235588074, "learning_rate": 6.489556840077071e-05, "loss": 0.3278672790527344, "step": 87650 }, { "epoch": 33.77649325626204, "eval_loss": 0.38380682468414307, "eval_runtime": 12.0121, "eval_samples_per_second": 1582.07, "eval_steps_per_second": 32.967, "step": 87650 }, { "epoch": 33.795761078998076, "grad_norm": 0.33700770139694214, "learning_rate": 6.48184971098266e-05, "loss": 0.32899429321289064, "step": 87700 }, { "epoch": 33.795761078998076, "eval_loss": 0.3861653208732605, "eval_runtime": 12.3945, "eval_samples_per_second": 1533.259, "eval_steps_per_second": 31.95, "step": 87700 }, { "epoch": 33.8150289017341, "grad_norm": 0.34987226128578186, "learning_rate": 6.474142581888247e-05, "loss": 0.32789825439453124, "step": 87750 }, { "epoch": 33.8150289017341, "eval_loss": 0.38709399104118347, "eval_runtime": 12.3728, "eval_samples_per_second": 1535.954, "eval_steps_per_second": 32.006, "step": 87750 }, { "epoch": 33.834296724470136, "grad_norm": 0.3626163899898529, "learning_rate": 6.466435452793834e-05, "loss": 0.33068260192871096, "step": 87800 }, { "epoch": 33.834296724470136, "eval_loss": 0.39165014028549194, "eval_runtime": 12.47, "eval_samples_per_second": 1523.983, "eval_steps_per_second": 31.756, "step": 87800 }, { "epoch": 33.85356454720617, "grad_norm": 0.3504170775413513, "learning_rate": 6.458728323699423e-05, "loss": 0.3277112579345703, "step": 87850 }, { "epoch": 33.85356454720617, "eval_loss": 0.38315898180007935, "eval_runtime": 12.406, "eval_samples_per_second": 1531.844, "eval_steps_per_second": 31.92, "step": 87850 }, { "epoch": 33.872832369942195, "grad_norm": 0.3369210958480835, "learning_rate": 6.45102119460501e-05, "loss": 0.3302669143676758, "step": 87900 }, { "epoch": 33.872832369942195, "eval_loss": 0.3886324167251587, "eval_runtime": 12.2542, "eval_samples_per_second": 1550.811, "eval_steps_per_second": 32.315, "step": 87900 }, { "epoch": 33.89210019267823, "grad_norm": 0.3538813889026642, "learning_rate": 6.443314065510597e-05, "loss": 0.33378288269042966, "step": 87950 }, { "epoch": 33.89210019267823, "eval_loss": 0.39754247665405273, "eval_runtime": 12.645, "eval_samples_per_second": 1502.888, "eval_steps_per_second": 31.317, "step": 87950 }, { "epoch": 33.91136801541426, "grad_norm": 0.3137151598930359, "learning_rate": 6.435606936416185e-05, "loss": 0.329036979675293, "step": 88000 }, { "epoch": 33.91136801541426, "eval_loss": 0.3940879702568054, "eval_runtime": 12.3925, "eval_samples_per_second": 1533.503, "eval_steps_per_second": 31.955, "step": 88000 }, { "epoch": 33.93063583815029, "grad_norm": 0.34337642788887024, "learning_rate": 6.427899807321774e-05, "loss": 0.32974063873291015, "step": 88050 }, { "epoch": 33.93063583815029, "eval_loss": 0.3878549635410309, "eval_runtime": 12.3289, "eval_samples_per_second": 1541.416, "eval_steps_per_second": 32.12, "step": 88050 }, { "epoch": 33.94990366088632, "grad_norm": 0.3428712785243988, "learning_rate": 6.42019267822736e-05, "loss": 0.33199310302734375, "step": 88100 }, { "epoch": 33.94990366088632, "eval_loss": 0.39220479130744934, "eval_runtime": 12.049, "eval_samples_per_second": 1577.227, "eval_steps_per_second": 32.866, "step": 88100 }, { "epoch": 33.969171483622354, "grad_norm": 0.4098481237888336, "learning_rate": 6.412485549132948e-05, "loss": 0.3368207931518555, "step": 88150 }, { "epoch": 33.969171483622354, "eval_loss": 0.3902398943901062, "eval_runtime": 12.3365, "eval_samples_per_second": 1540.467, "eval_steps_per_second": 32.1, "step": 88150 }, { "epoch": 33.98843930635838, "grad_norm": 0.3917214572429657, "learning_rate": 6.404778420038537e-05, "loss": 0.3336841583251953, "step": 88200 }, { "epoch": 33.98843930635838, "eval_loss": 0.39737021923065186, "eval_runtime": 12.4212, "eval_samples_per_second": 1529.962, "eval_steps_per_second": 31.881, "step": 88200 }, { "epoch": 34.00770712909441, "grad_norm": 0.3119466304779053, "learning_rate": 6.397071290944124e-05, "loss": 0.3296173095703125, "step": 88250 }, { "epoch": 34.00770712909441, "eval_loss": 0.390932559967041, "eval_runtime": 12.2009, "eval_samples_per_second": 1557.596, "eval_steps_per_second": 32.457, "step": 88250 }, { "epoch": 34.02697495183045, "grad_norm": 0.3032195568084717, "learning_rate": 6.389364161849711e-05, "loss": 0.328331298828125, "step": 88300 }, { "epoch": 34.02697495183045, "eval_loss": 0.39449542760849, "eval_runtime": 12.1776, "eval_samples_per_second": 1560.565, "eval_steps_per_second": 32.519, "step": 88300 }, { "epoch": 34.04624277456647, "grad_norm": 0.31678199768066406, "learning_rate": 6.3816570327553e-05, "loss": 0.32859676361083984, "step": 88350 }, { "epoch": 34.04624277456647, "eval_loss": 0.3918789029121399, "eval_runtime": 12.4261, "eval_samples_per_second": 1529.361, "eval_steps_per_second": 31.868, "step": 88350 }, { "epoch": 34.065510597302506, "grad_norm": 0.34428349137306213, "learning_rate": 6.373949903660886e-05, "loss": 0.3309896850585938, "step": 88400 }, { "epoch": 34.065510597302506, "eval_loss": 0.3892180323600769, "eval_runtime": 12.3861, "eval_samples_per_second": 1534.296, "eval_steps_per_second": 31.971, "step": 88400 }, { "epoch": 34.08477842003854, "grad_norm": 0.4305480718612671, "learning_rate": 6.366242774566475e-05, "loss": 0.32992916107177733, "step": 88450 }, { "epoch": 34.08477842003854, "eval_loss": 0.3859463334083557, "eval_runtime": 12.3155, "eval_samples_per_second": 1543.102, "eval_steps_per_second": 32.155, "step": 88450 }, { "epoch": 34.104046242774565, "grad_norm": 0.36362192034721375, "learning_rate": 6.358535645472062e-05, "loss": 0.32947486877441406, "step": 88500 }, { "epoch": 34.104046242774565, "eval_loss": 0.38720834255218506, "eval_runtime": 12.3364, "eval_samples_per_second": 1540.487, "eval_steps_per_second": 32.1, "step": 88500 }, { "epoch": 34.1233140655106, "grad_norm": 0.33099594712257385, "learning_rate": 6.350828516377649e-05, "loss": 0.3295342254638672, "step": 88550 }, { "epoch": 34.1233140655106, "eval_loss": 0.380791038274765, "eval_runtime": 12.3388, "eval_samples_per_second": 1540.179, "eval_steps_per_second": 32.094, "step": 88550 }, { "epoch": 34.14258188824663, "grad_norm": 0.34775716066360474, "learning_rate": 6.343121387283238e-05, "loss": 0.3280264663696289, "step": 88600 }, { "epoch": 34.14258188824663, "eval_loss": 0.3914911150932312, "eval_runtime": 14.566, "eval_samples_per_second": 1304.679, "eval_steps_per_second": 27.187, "step": 88600 }, { "epoch": 34.16184971098266, "grad_norm": 0.36225128173828125, "learning_rate": 6.335414258188825e-05, "loss": 0.3306592559814453, "step": 88650 }, { "epoch": 34.16184971098266, "eval_loss": 0.38729536533355713, "eval_runtime": 12.3404, "eval_samples_per_second": 1539.984, "eval_steps_per_second": 32.09, "step": 88650 }, { "epoch": 34.18111753371869, "grad_norm": 0.3609655797481537, "learning_rate": 6.327707129094412e-05, "loss": 0.33160335540771485, "step": 88700 }, { "epoch": 34.18111753371869, "eval_loss": 0.38952240347862244, "eval_runtime": 12.2378, "eval_samples_per_second": 1552.892, "eval_steps_per_second": 32.359, "step": 88700 }, { "epoch": 34.20038535645472, "grad_norm": 0.3530179560184479, "learning_rate": 6.32e-05, "loss": 0.33011531829833984, "step": 88750 }, { "epoch": 34.20038535645472, "eval_loss": 0.39346814155578613, "eval_runtime": 12.0338, "eval_samples_per_second": 1579.217, "eval_steps_per_second": 32.907, "step": 88750 }, { "epoch": 34.21965317919075, "grad_norm": 0.36664533615112305, "learning_rate": 6.312292870905589e-05, "loss": 0.3256890487670898, "step": 88800 }, { "epoch": 34.21965317919075, "eval_loss": 0.3852498531341553, "eval_runtime": 12.2036, "eval_samples_per_second": 1557.24, "eval_steps_per_second": 32.449, "step": 88800 }, { "epoch": 34.238921001926784, "grad_norm": 0.36303970217704773, "learning_rate": 6.304585741811175e-05, "loss": 0.3290599060058594, "step": 88850 }, { "epoch": 34.238921001926784, "eval_loss": 0.38644957542419434, "eval_runtime": 12.479, "eval_samples_per_second": 1522.879, "eval_steps_per_second": 31.733, "step": 88850 }, { "epoch": 34.25818882466281, "grad_norm": 0.3842925429344177, "learning_rate": 6.296878612716763e-05, "loss": 0.3264118957519531, "step": 88900 }, { "epoch": 34.25818882466281, "eval_loss": 0.3924679160118103, "eval_runtime": 12.5943, "eval_samples_per_second": 1508.937, "eval_steps_per_second": 31.443, "step": 88900 }, { "epoch": 34.27745664739884, "grad_norm": 0.28656959533691406, "learning_rate": 6.289171483622352e-05, "loss": 0.32335342407226564, "step": 88950 }, { "epoch": 34.27745664739884, "eval_loss": 0.3824901580810547, "eval_runtime": 12.1968, "eval_samples_per_second": 1558.114, "eval_steps_per_second": 32.468, "step": 88950 }, { "epoch": 34.296724470134876, "grad_norm": 0.314212828874588, "learning_rate": 6.281464354527939e-05, "loss": 0.33042320251464846, "step": 89000 }, { "epoch": 34.296724470134876, "eval_loss": 0.3876990079879761, "eval_runtime": 12.2594, "eval_samples_per_second": 1550.154, "eval_steps_per_second": 32.302, "step": 89000 }, { "epoch": 34.3159922928709, "grad_norm": 0.3581949770450592, "learning_rate": 6.273757225433526e-05, "loss": 0.33068840026855467, "step": 89050 }, { "epoch": 34.3159922928709, "eval_loss": 0.38904911279678345, "eval_runtime": 12.2139, "eval_samples_per_second": 1555.937, "eval_steps_per_second": 32.422, "step": 89050 }, { "epoch": 34.335260115606935, "grad_norm": 0.34296730160713196, "learning_rate": 6.266050096339114e-05, "loss": 0.32809356689453123, "step": 89100 }, { "epoch": 34.335260115606935, "eval_loss": 0.39113134145736694, "eval_runtime": 12.2897, "eval_samples_per_second": 1546.338, "eval_steps_per_second": 32.222, "step": 89100 }, { "epoch": 34.35452793834297, "grad_norm": 0.3821568191051483, "learning_rate": 6.258342967244702e-05, "loss": 0.3301662826538086, "step": 89150 }, { "epoch": 34.35452793834297, "eval_loss": 0.38328826427459717, "eval_runtime": 12.3372, "eval_samples_per_second": 1540.383, "eval_steps_per_second": 32.098, "step": 89150 }, { "epoch": 34.373795761078995, "grad_norm": 0.3363609313964844, "learning_rate": 6.250635838150289e-05, "loss": 0.33046215057373046, "step": 89200 }, { "epoch": 34.373795761078995, "eval_loss": 0.3920291066169739, "eval_runtime": 12.3336, "eval_samples_per_second": 1540.827, "eval_steps_per_second": 32.107, "step": 89200 }, { "epoch": 34.39306358381503, "grad_norm": 0.2989078164100647, "learning_rate": 6.242928709055877e-05, "loss": 0.3277073287963867, "step": 89250 }, { "epoch": 34.39306358381503, "eval_loss": 0.39348042011260986, "eval_runtime": 12.1101, "eval_samples_per_second": 1569.269, "eval_steps_per_second": 32.7, "step": 89250 }, { "epoch": 34.41233140655106, "grad_norm": 0.3429989516735077, "learning_rate": 6.235221579961464e-05, "loss": 0.32709880828857424, "step": 89300 }, { "epoch": 34.41233140655106, "eval_loss": 0.39232122898101807, "eval_runtime": 12.5426, "eval_samples_per_second": 1515.162, "eval_steps_per_second": 31.573, "step": 89300 }, { "epoch": 34.43159922928709, "grad_norm": 0.3355536460876465, "learning_rate": 6.227514450867053e-05, "loss": 0.3259059143066406, "step": 89350 }, { "epoch": 34.43159922928709, "eval_loss": 0.38773906230926514, "eval_runtime": 12.2009, "eval_samples_per_second": 1557.589, "eval_steps_per_second": 32.457, "step": 89350 }, { "epoch": 34.45086705202312, "grad_norm": 0.35940131545066833, "learning_rate": 6.21980732177264e-05, "loss": 0.32811698913574217, "step": 89400 }, { "epoch": 34.45086705202312, "eval_loss": 0.3857577443122864, "eval_runtime": 12.3418, "eval_samples_per_second": 1539.81, "eval_steps_per_second": 32.086, "step": 89400 }, { "epoch": 34.470134874759154, "grad_norm": 0.36457574367523193, "learning_rate": 6.212100192678227e-05, "loss": 0.3315275192260742, "step": 89450 }, { "epoch": 34.470134874759154, "eval_loss": 0.3796592056751251, "eval_runtime": 12.3082, "eval_samples_per_second": 1544.013, "eval_steps_per_second": 32.174, "step": 89450 }, { "epoch": 34.48940269749518, "grad_norm": 0.36117640137672424, "learning_rate": 6.204393063583816e-05, "loss": 0.33032405853271485, "step": 89500 }, { "epoch": 34.48940269749518, "eval_loss": 0.3864496946334839, "eval_runtime": 12.194, "eval_samples_per_second": 1558.465, "eval_steps_per_second": 32.475, "step": 89500 }, { "epoch": 34.50867052023121, "grad_norm": 0.3881984353065491, "learning_rate": 6.196685934489404e-05, "loss": 0.32124412536621094, "step": 89550 }, { "epoch": 34.50867052023121, "eval_loss": 0.3878605365753174, "eval_runtime": 12.3965, "eval_samples_per_second": 1533.012, "eval_steps_per_second": 31.944, "step": 89550 }, { "epoch": 34.527938342967246, "grad_norm": 0.3459565043449402, "learning_rate": 6.18897880539499e-05, "loss": 0.33038829803466796, "step": 89600 }, { "epoch": 34.527938342967246, "eval_loss": 0.3955438733100891, "eval_runtime": 12.3185, "eval_samples_per_second": 1542.721, "eval_steps_per_second": 32.147, "step": 89600 }, { "epoch": 34.54720616570327, "grad_norm": 0.3655318319797516, "learning_rate": 6.181271676300578e-05, "loss": 0.33093067169189455, "step": 89650 }, { "epoch": 34.54720616570327, "eval_loss": 0.39360445737838745, "eval_runtime": 12.341, "eval_samples_per_second": 1539.903, "eval_steps_per_second": 32.088, "step": 89650 }, { "epoch": 34.566473988439306, "grad_norm": 0.2933152914047241, "learning_rate": 6.173564547206167e-05, "loss": 0.3302872467041016, "step": 89700 }, { "epoch": 34.566473988439306, "eval_loss": 0.39164769649505615, "eval_runtime": 12.5135, "eval_samples_per_second": 1518.682, "eval_steps_per_second": 31.646, "step": 89700 }, { "epoch": 34.58574181117534, "grad_norm": 0.3143720030784607, "learning_rate": 6.165857418111754e-05, "loss": 0.3234867095947266, "step": 89750 }, { "epoch": 34.58574181117534, "eval_loss": 0.39063483476638794, "eval_runtime": 12.311, "eval_samples_per_second": 1543.658, "eval_steps_per_second": 32.166, "step": 89750 }, { "epoch": 34.605009633911365, "grad_norm": 0.3133572041988373, "learning_rate": 6.158150289017341e-05, "loss": 0.3273264312744141, "step": 89800 }, { "epoch": 34.605009633911365, "eval_loss": 0.3919639587402344, "eval_runtime": 12.1459, "eval_samples_per_second": 1564.643, "eval_steps_per_second": 32.604, "step": 89800 }, { "epoch": 34.6242774566474, "grad_norm": 0.35064980387687683, "learning_rate": 6.15044315992293e-05, "loss": 0.32805973052978515, "step": 89850 }, { "epoch": 34.6242774566474, "eval_loss": 0.3880475163459778, "eval_runtime": 12.1916, "eval_samples_per_second": 1558.782, "eval_steps_per_second": 32.481, "step": 89850 }, { "epoch": 34.64354527938343, "grad_norm": 0.3478328585624695, "learning_rate": 6.142736030828517e-05, "loss": 0.33214874267578126, "step": 89900 }, { "epoch": 34.64354527938343, "eval_loss": 0.38948604464530945, "eval_runtime": 12.7378, "eval_samples_per_second": 1491.937, "eval_steps_per_second": 31.089, "step": 89900 }, { "epoch": 34.66281310211946, "grad_norm": 0.338631808757782, "learning_rate": 6.135028901734104e-05, "loss": 0.32986270904541015, "step": 89950 }, { "epoch": 34.66281310211946, "eval_loss": 0.3854893445968628, "eval_runtime": 14.0709, "eval_samples_per_second": 1350.586, "eval_steps_per_second": 28.143, "step": 89950 }, { "epoch": 34.68208092485549, "grad_norm": 0.31339335441589355, "learning_rate": 6.127321772639692e-05, "loss": 0.32953269958496095, "step": 90000 }, { "epoch": 34.68208092485549, "eval_loss": 0.39659199118614197, "eval_runtime": 12.3831, "eval_samples_per_second": 1534.67, "eval_steps_per_second": 31.979, "step": 90000 }, { "epoch": 34.701348747591524, "grad_norm": 0.3332389295101166, "learning_rate": 6.11961464354528e-05, "loss": 0.32313148498535155, "step": 90050 }, { "epoch": 34.701348747591524, "eval_loss": 0.3969087302684784, "eval_runtime": 12.1892, "eval_samples_per_second": 1559.081, "eval_steps_per_second": 32.488, "step": 90050 }, { "epoch": 34.72061657032755, "grad_norm": 0.3547174334526062, "learning_rate": 6.111907514450868e-05, "loss": 0.326370849609375, "step": 90100 }, { "epoch": 34.72061657032755, "eval_loss": 0.3893473744392395, "eval_runtime": 12.0368, "eval_samples_per_second": 1578.819, "eval_steps_per_second": 32.899, "step": 90100 }, { "epoch": 34.73988439306358, "grad_norm": 0.3665701448917389, "learning_rate": 6.104200385356455e-05, "loss": 0.32903564453125, "step": 90150 }, { "epoch": 34.73988439306358, "eval_loss": 0.3856891691684723, "eval_runtime": 12.3661, "eval_samples_per_second": 1536.78, "eval_steps_per_second": 32.023, "step": 90150 }, { "epoch": 34.75915221579962, "grad_norm": 0.31533849239349365, "learning_rate": 6.096493256262042e-05, "loss": 0.32621543884277343, "step": 90200 }, { "epoch": 34.75915221579962, "eval_loss": 0.3823149800300598, "eval_runtime": 12.3929, "eval_samples_per_second": 1533.461, "eval_steps_per_second": 31.954, "step": 90200 }, { "epoch": 34.77842003853564, "grad_norm": 0.3025828003883362, "learning_rate": 6.088786127167631e-05, "loss": 0.32941116333007814, "step": 90250 }, { "epoch": 34.77842003853564, "eval_loss": 0.3869825303554535, "eval_runtime": 12.4228, "eval_samples_per_second": 1529.766, "eval_steps_per_second": 31.877, "step": 90250 }, { "epoch": 34.797687861271676, "grad_norm": 0.31869152188301086, "learning_rate": 6.0810789980732186e-05, "loss": 0.33194770812988283, "step": 90300 }, { "epoch": 34.797687861271676, "eval_loss": 0.3920426070690155, "eval_runtime": 12.3898, "eval_samples_per_second": 1533.84, "eval_steps_per_second": 31.962, "step": 90300 }, { "epoch": 34.81695568400771, "grad_norm": 0.3232773542404175, "learning_rate": 6.073371868978806e-05, "loss": 0.32447792053222657, "step": 90350 }, { "epoch": 34.81695568400771, "eval_loss": 0.3923782706260681, "eval_runtime": 12.4679, "eval_samples_per_second": 1524.23, "eval_steps_per_second": 31.761, "step": 90350 }, { "epoch": 34.836223506743735, "grad_norm": 0.3161788284778595, "learning_rate": 6.0656647398843935e-05, "loss": 0.3290896224975586, "step": 90400 }, { "epoch": 34.836223506743735, "eval_loss": 0.3905608057975769, "eval_runtime": 12.2522, "eval_samples_per_second": 1551.067, "eval_steps_per_second": 32.321, "step": 90400 }, { "epoch": 34.85549132947977, "grad_norm": 0.33408594131469727, "learning_rate": 6.0579576107899806e-05, "loss": 0.3300537872314453, "step": 90450 }, { "epoch": 34.85549132947977, "eval_loss": 0.38030725717544556, "eval_runtime": 12.384, "eval_samples_per_second": 1534.56, "eval_steps_per_second": 31.977, "step": 90450 }, { "epoch": 34.8747591522158, "grad_norm": 0.33147674798965454, "learning_rate": 6.0502504816955685e-05, "loss": 0.32585433959960936, "step": 90500 }, { "epoch": 34.8747591522158, "eval_loss": 0.39042437076568604, "eval_runtime": 12.3772, "eval_samples_per_second": 1535.406, "eval_steps_per_second": 31.994, "step": 90500 }, { "epoch": 34.89402697495183, "grad_norm": 0.31088000535964966, "learning_rate": 6.042543352601156e-05, "loss": 0.3288770294189453, "step": 90550 }, { "epoch": 34.89402697495183, "eval_loss": 0.39041152596473694, "eval_runtime": 12.3935, "eval_samples_per_second": 1533.391, "eval_steps_per_second": 31.952, "step": 90550 }, { "epoch": 34.91329479768786, "grad_norm": 0.35791653394699097, "learning_rate": 6.0348362235067434e-05, "loss": 0.32824466705322264, "step": 90600 }, { "epoch": 34.91329479768786, "eval_loss": 0.38882195949554443, "eval_runtime": 12.7039, "eval_samples_per_second": 1495.92, "eval_steps_per_second": 31.172, "step": 90600 }, { "epoch": 34.932562620423894, "grad_norm": 0.3813866376876831, "learning_rate": 6.027129094412331e-05, "loss": 0.33194889068603517, "step": 90650 }, { "epoch": 34.932562620423894, "eval_loss": 0.3908965289592743, "eval_runtime": 12.2893, "eval_samples_per_second": 1546.383, "eval_steps_per_second": 32.223, "step": 90650 }, { "epoch": 34.95183044315992, "grad_norm": 0.39697086811065674, "learning_rate": 6.01942196531792e-05, "loss": 0.32998329162597656, "step": 90700 }, { "epoch": 34.95183044315992, "eval_loss": 0.3902999758720398, "eval_runtime": 12.1996, "eval_samples_per_second": 1557.758, "eval_steps_per_second": 32.46, "step": 90700 }, { "epoch": 34.971098265895954, "grad_norm": 0.33773136138916016, "learning_rate": 6.011714836223507e-05, "loss": 0.3237565231323242, "step": 90750 }, { "epoch": 34.971098265895954, "eval_loss": 0.38959982991218567, "eval_runtime": 12.3033, "eval_samples_per_second": 1544.621, "eval_steps_per_second": 32.186, "step": 90750 }, { "epoch": 34.99036608863199, "grad_norm": 0.32897379994392395, "learning_rate": 6.0040077071290947e-05, "loss": 0.32759632110595704, "step": 90800 }, { "epoch": 34.99036608863199, "eval_loss": 0.3908528983592987, "eval_runtime": 12.307, "eval_samples_per_second": 1544.162, "eval_steps_per_second": 32.177, "step": 90800 }, { "epoch": 35.00963391136801, "grad_norm": 0.30499255657196045, "learning_rate": 5.9963005780346825e-05, "loss": 0.3259908676147461, "step": 90850 }, { "epoch": 35.00963391136801, "eval_loss": 0.3861997127532959, "eval_runtime": 12.3386, "eval_samples_per_second": 1540.205, "eval_steps_per_second": 32.094, "step": 90850 }, { "epoch": 35.028901734104046, "grad_norm": 0.3199129104614258, "learning_rate": 5.9885934489402696e-05, "loss": 0.32529090881347655, "step": 90900 }, { "epoch": 35.028901734104046, "eval_loss": 0.38671115040779114, "eval_runtime": 12.3103, "eval_samples_per_second": 1543.742, "eval_steps_per_second": 32.168, "step": 90900 }, { "epoch": 35.04816955684008, "grad_norm": 0.32943421602249146, "learning_rate": 5.9808863198458574e-05, "loss": 0.33128475189208983, "step": 90950 }, { "epoch": 35.04816955684008, "eval_loss": 0.389596164226532, "eval_runtime": 12.048, "eval_samples_per_second": 1577.354, "eval_steps_per_second": 32.868, "step": 90950 }, { "epoch": 35.067437379576106, "grad_norm": 0.35097113251686096, "learning_rate": 5.973179190751446e-05, "loss": 0.3298822021484375, "step": 91000 }, { "epoch": 35.067437379576106, "eval_loss": 0.39984816312789917, "eval_runtime": 12.3414, "eval_samples_per_second": 1539.862, "eval_steps_per_second": 32.087, "step": 91000 }, { "epoch": 35.08670520231214, "grad_norm": 0.31923091411590576, "learning_rate": 5.9654720616570324e-05, "loss": 0.32701454162597654, "step": 91050 }, { "epoch": 35.08670520231214, "eval_loss": 0.39509665966033936, "eval_runtime": 12.3174, "eval_samples_per_second": 1542.856, "eval_steps_per_second": 32.15, "step": 91050 }, { "epoch": 35.10597302504817, "grad_norm": 0.41551530361175537, "learning_rate": 5.957764932562621e-05, "loss": 0.32488456726074216, "step": 91100 }, { "epoch": 35.10597302504817, "eval_loss": 0.38820841908454895, "eval_runtime": 12.3594, "eval_samples_per_second": 1537.616, "eval_steps_per_second": 32.04, "step": 91100 }, { "epoch": 35.1252408477842, "grad_norm": 0.29955896735191345, "learning_rate": 5.9500578034682087e-05, "loss": 0.32864936828613284, "step": 91150 }, { "epoch": 35.1252408477842, "eval_loss": 0.39003986120224, "eval_runtime": 12.7879, "eval_samples_per_second": 1486.096, "eval_steps_per_second": 30.967, "step": 91150 }, { "epoch": 35.14450867052023, "grad_norm": 0.3296370208263397, "learning_rate": 5.942350674373796e-05, "loss": 0.3297565460205078, "step": 91200 }, { "epoch": 35.14450867052023, "eval_loss": 0.3889846205711365, "eval_runtime": 13.5767, "eval_samples_per_second": 1399.752, "eval_steps_per_second": 29.168, "step": 91200 }, { "epoch": 35.163776493256265, "grad_norm": 0.3236866891384125, "learning_rate": 5.9346435452793836e-05, "loss": 0.3270408248901367, "step": 91250 }, { "epoch": 35.163776493256265, "eval_loss": 0.39141765236854553, "eval_runtime": 13.1976, "eval_samples_per_second": 1439.959, "eval_steps_per_second": 30.005, "step": 91250 }, { "epoch": 35.18304431599229, "grad_norm": 0.3340907394886017, "learning_rate": 5.9269364161849714e-05, "loss": 0.3284273147583008, "step": 91300 }, { "epoch": 35.18304431599229, "eval_loss": 0.38769540190696716, "eval_runtime": 15.2685, "eval_samples_per_second": 1244.658, "eval_steps_per_second": 25.936, "step": 91300 }, { "epoch": 35.202312138728324, "grad_norm": 0.31380167603492737, "learning_rate": 5.9192292870905585e-05, "loss": 0.3278533935546875, "step": 91350 }, { "epoch": 35.202312138728324, "eval_loss": 0.3916812241077423, "eval_runtime": 15.4841, "eval_samples_per_second": 1227.321, "eval_steps_per_second": 25.575, "step": 91350 }, { "epoch": 35.22157996146436, "grad_norm": 0.36638548970222473, "learning_rate": 5.9115221579961464e-05, "loss": 0.32475303649902343, "step": 91400 }, { "epoch": 35.22157996146436, "eval_loss": 0.3889930248260498, "eval_runtime": 12.2452, "eval_samples_per_second": 1551.961, "eval_steps_per_second": 32.339, "step": 91400 }, { "epoch": 35.24084778420038, "grad_norm": 0.3512418270111084, "learning_rate": 5.903815028901735e-05, "loss": 0.327381591796875, "step": 91450 }, { "epoch": 35.24084778420038, "eval_loss": 0.3847394287586212, "eval_runtime": 12.3906, "eval_samples_per_second": 1533.748, "eval_steps_per_second": 31.96, "step": 91450 }, { "epoch": 35.26011560693642, "grad_norm": 0.355015367269516, "learning_rate": 5.896107899807321e-05, "loss": 0.33271141052246095, "step": 91500 }, { "epoch": 35.26011560693642, "eval_loss": 0.39291587471961975, "eval_runtime": 12.0747, "eval_samples_per_second": 1573.868, "eval_steps_per_second": 32.796, "step": 91500 }, { "epoch": 35.27938342967245, "grad_norm": 0.33805742859840393, "learning_rate": 5.88840077071291e-05, "loss": 0.3288734817504883, "step": 91550 }, { "epoch": 35.27938342967245, "eval_loss": 0.39325278997421265, "eval_runtime": 12.4135, "eval_samples_per_second": 1530.919, "eval_steps_per_second": 31.901, "step": 91550 }, { "epoch": 35.298651252408476, "grad_norm": 0.36293283104896545, "learning_rate": 5.8806936416184976e-05, "loss": 0.32982353210449217, "step": 91600 }, { "epoch": 35.298651252408476, "eval_loss": 0.3850092589855194, "eval_runtime": 12.408, "eval_samples_per_second": 1531.595, "eval_steps_per_second": 31.915, "step": 91600 }, { "epoch": 35.31791907514451, "grad_norm": 0.31515851616859436, "learning_rate": 5.872986512524085e-05, "loss": 0.3259614181518555, "step": 91650 }, { "epoch": 35.31791907514451, "eval_loss": 0.3886248767375946, "eval_runtime": 13.1705, "eval_samples_per_second": 1442.925, "eval_steps_per_second": 30.067, "step": 91650 }, { "epoch": 35.33718689788054, "grad_norm": 0.3731270730495453, "learning_rate": 5.8652793834296726e-05, "loss": 0.32571853637695314, "step": 91700 }, { "epoch": 35.33718689788054, "eval_loss": 0.3833027780056, "eval_runtime": 13.2403, "eval_samples_per_second": 1435.311, "eval_steps_per_second": 29.909, "step": 91700 }, { "epoch": 35.35645472061657, "grad_norm": 0.36144429445266724, "learning_rate": 5.8575722543352604e-05, "loss": 0.3298924255371094, "step": 91750 }, { "epoch": 35.35645472061657, "eval_loss": 0.38512998819351196, "eval_runtime": 12.3558, "eval_samples_per_second": 1538.066, "eval_steps_per_second": 32.05, "step": 91750 }, { "epoch": 35.3757225433526, "grad_norm": 0.32412126660346985, "learning_rate": 5.8498651252408475e-05, "loss": 0.32445358276367187, "step": 91800 }, { "epoch": 35.3757225433526, "eval_loss": 0.3858166038990021, "eval_runtime": 12.1239, "eval_samples_per_second": 1567.479, "eval_steps_per_second": 32.663, "step": 91800 }, { "epoch": 35.394990366088635, "grad_norm": 0.3005750775337219, "learning_rate": 5.842157996146436e-05, "loss": 0.32464439392089844, "step": 91850 }, { "epoch": 35.394990366088635, "eval_loss": 0.39128929376602173, "eval_runtime": 14.4714, "eval_samples_per_second": 1313.214, "eval_steps_per_second": 27.364, "step": 91850 }, { "epoch": 35.41425818882466, "grad_norm": 0.3461354374885559, "learning_rate": 5.834450867052024e-05, "loss": 0.3330879211425781, "step": 91900 }, { "epoch": 35.41425818882466, "eval_loss": 0.381767600774765, "eval_runtime": 12.2326, "eval_samples_per_second": 1553.558, "eval_steps_per_second": 32.373, "step": 91900 }, { "epoch": 35.433526011560694, "grad_norm": 0.29059550166130066, "learning_rate": 5.826743737957611e-05, "loss": 0.32488624572753905, "step": 91950 }, { "epoch": 35.433526011560694, "eval_loss": 0.37853455543518066, "eval_runtime": 12.2915, "eval_samples_per_second": 1546.108, "eval_steps_per_second": 32.217, "step": 91950 }, { "epoch": 35.45279383429673, "grad_norm": 0.3337874114513397, "learning_rate": 5.819036608863199e-05, "loss": 0.32832000732421873, "step": 92000 }, { "epoch": 35.45279383429673, "eval_loss": 0.3941405415534973, "eval_runtime": 12.2764, "eval_samples_per_second": 1548.014, "eval_steps_per_second": 32.257, "step": 92000 }, { "epoch": 35.472061657032754, "grad_norm": 0.3118699789047241, "learning_rate": 5.8113294797687866e-05, "loss": 0.3244602203369141, "step": 92050 }, { "epoch": 35.472061657032754, "eval_loss": 0.38113558292388916, "eval_runtime": 12.2262, "eval_samples_per_second": 1554.363, "eval_steps_per_second": 32.389, "step": 92050 }, { "epoch": 35.49132947976879, "grad_norm": 0.33406227827072144, "learning_rate": 5.803622350674374e-05, "loss": 0.32967243194580076, "step": 92100 }, { "epoch": 35.49132947976879, "eval_loss": 0.3898988962173462, "eval_runtime": 12.7065, "eval_samples_per_second": 1495.61, "eval_steps_per_second": 31.165, "step": 92100 }, { "epoch": 35.51059730250482, "grad_norm": 0.3175235390663147, "learning_rate": 5.7959152215799615e-05, "loss": 0.32863048553466795, "step": 92150 }, { "epoch": 35.51059730250482, "eval_loss": 0.3850865364074707, "eval_runtime": 12.3715, "eval_samples_per_second": 1536.116, "eval_steps_per_second": 32.009, "step": 92150 }, { "epoch": 35.529865125240846, "grad_norm": 0.2963683605194092, "learning_rate": 5.78820809248555e-05, "loss": 0.32394416809082033, "step": 92200 }, { "epoch": 35.529865125240846, "eval_loss": 0.3833273649215698, "eval_runtime": 12.2949, "eval_samples_per_second": 1545.678, "eval_steps_per_second": 32.208, "step": 92200 }, { "epoch": 35.54913294797688, "grad_norm": 0.31194597482681274, "learning_rate": 5.7805009633911364e-05, "loss": 0.3289098358154297, "step": 92250 }, { "epoch": 35.54913294797688, "eval_loss": 0.3901132345199585, "eval_runtime": 12.2975, "eval_samples_per_second": 1545.356, "eval_steps_per_second": 32.202, "step": 92250 }, { "epoch": 35.56840077071291, "grad_norm": 0.3243687152862549, "learning_rate": 5.772793834296725e-05, "loss": 0.32677871704101563, "step": 92300 }, { "epoch": 35.56840077071291, "eval_loss": 0.38990917801856995, "eval_runtime": 12.1749, "eval_samples_per_second": 1560.918, "eval_steps_per_second": 32.526, "step": 92300 }, { "epoch": 35.58766859344894, "grad_norm": 0.3599238097667694, "learning_rate": 5.765086705202313e-05, "loss": 0.32910957336425783, "step": 92350 }, { "epoch": 35.58766859344894, "eval_loss": 0.3900713622570038, "eval_runtime": 12.1114, "eval_samples_per_second": 1569.106, "eval_steps_per_second": 32.697, "step": 92350 }, { "epoch": 35.60693641618497, "grad_norm": 0.3442700207233429, "learning_rate": 5.7573795761079e-05, "loss": 0.3289793014526367, "step": 92400 }, { "epoch": 35.60693641618497, "eval_loss": 0.38326600193977356, "eval_runtime": 12.3602, "eval_samples_per_second": 1537.51, "eval_steps_per_second": 32.038, "step": 92400 }, { "epoch": 35.626204238921005, "grad_norm": 0.33132287859916687, "learning_rate": 5.749672447013488e-05, "loss": 0.3262133026123047, "step": 92450 }, { "epoch": 35.626204238921005, "eval_loss": 0.38108164072036743, "eval_runtime": 12.2261, "eval_samples_per_second": 1554.382, "eval_steps_per_second": 32.39, "step": 92450 }, { "epoch": 35.64547206165703, "grad_norm": 0.33836987614631653, "learning_rate": 5.7419653179190755e-05, "loss": 0.32867305755615234, "step": 92500 }, { "epoch": 35.64547206165703, "eval_loss": 0.3930107355117798, "eval_runtime": 14.1268, "eval_samples_per_second": 1345.243, "eval_steps_per_second": 28.032, "step": 92500 }, { "epoch": 35.664739884393065, "grad_norm": 0.33436036109924316, "learning_rate": 5.7342581888246626e-05, "loss": 0.3263105010986328, "step": 92550 }, { "epoch": 35.664739884393065, "eval_loss": 0.3842018246650696, "eval_runtime": 12.9671, "eval_samples_per_second": 1465.555, "eval_steps_per_second": 30.539, "step": 92550 }, { "epoch": 35.6840077071291, "grad_norm": 0.3616037368774414, "learning_rate": 5.7265510597302504e-05, "loss": 0.32735801696777345, "step": 92600 }, { "epoch": 35.6840077071291, "eval_loss": 0.38384687900543213, "eval_runtime": 15.0327, "eval_samples_per_second": 1264.175, "eval_steps_per_second": 26.343, "step": 92600 }, { "epoch": 35.703275529865124, "grad_norm": 0.3618532121181488, "learning_rate": 5.718843930635839e-05, "loss": 0.32403541564941407, "step": 92650 }, { "epoch": 35.703275529865124, "eval_loss": 0.38063809275627136, "eval_runtime": 12.5768, "eval_samples_per_second": 1511.039, "eval_steps_per_second": 31.487, "step": 92650 }, { "epoch": 35.72254335260116, "grad_norm": 0.3044779598712921, "learning_rate": 5.7111368015414254e-05, "loss": 0.3256887435913086, "step": 92700 }, { "epoch": 35.72254335260116, "eval_loss": 0.3875041902065277, "eval_runtime": 13.5825, "eval_samples_per_second": 1399.149, "eval_steps_per_second": 29.155, "step": 92700 }, { "epoch": 35.74181117533719, "grad_norm": 0.3083705008029938, "learning_rate": 5.703429672447014e-05, "loss": 0.3272967147827148, "step": 92750 }, { "epoch": 35.74181117533719, "eval_loss": 0.38704925775527954, "eval_runtime": 12.357, "eval_samples_per_second": 1537.919, "eval_steps_per_second": 32.047, "step": 92750 }, { "epoch": 35.761078998073216, "grad_norm": 0.29405614733695984, "learning_rate": 5.695722543352602e-05, "loss": 0.32405284881591795, "step": 92800 }, { "epoch": 35.761078998073216, "eval_loss": 0.382814884185791, "eval_runtime": 12.3481, "eval_samples_per_second": 1539.028, "eval_steps_per_second": 32.07, "step": 92800 }, { "epoch": 35.78034682080925, "grad_norm": 0.32914501428604126, "learning_rate": 5.688015414258189e-05, "loss": 0.32709091186523437, "step": 92850 }, { "epoch": 35.78034682080925, "eval_loss": 0.3882371485233307, "eval_runtime": 12.2451, "eval_samples_per_second": 1551.966, "eval_steps_per_second": 32.339, "step": 92850 }, { "epoch": 35.79961464354528, "grad_norm": 0.325853168964386, "learning_rate": 5.6803082851637766e-05, "loss": 0.327569580078125, "step": 92900 }, { "epoch": 35.79961464354528, "eval_loss": 0.38445788621902466, "eval_runtime": 12.2704, "eval_samples_per_second": 1548.769, "eval_steps_per_second": 32.273, "step": 92900 }, { "epoch": 35.81888246628131, "grad_norm": 0.33827540278434753, "learning_rate": 5.672601156069365e-05, "loss": 0.3308378601074219, "step": 92950 }, { "epoch": 35.81888246628131, "eval_loss": 0.3859719932079315, "eval_runtime": 12.2862, "eval_samples_per_second": 1546.773, "eval_steps_per_second": 32.231, "step": 92950 }, { "epoch": 35.83815028901734, "grad_norm": 0.3714035451412201, "learning_rate": 5.6648940269749516e-05, "loss": 0.32649620056152345, "step": 93000 }, { "epoch": 35.83815028901734, "eval_loss": 0.39290741086006165, "eval_runtime": 12.2103, "eval_samples_per_second": 1556.385, "eval_steps_per_second": 32.432, "step": 93000 }, { "epoch": 35.857418111753375, "grad_norm": 0.33440831303596497, "learning_rate": 5.65718689788054e-05, "loss": 0.3265763473510742, "step": 93050 }, { "epoch": 35.857418111753375, "eval_loss": 0.3897450864315033, "eval_runtime": 12.2236, "eval_samples_per_second": 1554.703, "eval_steps_per_second": 32.396, "step": 93050 }, { "epoch": 35.8766859344894, "grad_norm": 0.3327382802963257, "learning_rate": 5.649479768786128e-05, "loss": 0.32586151123046875, "step": 93100 }, { "epoch": 35.8766859344894, "eval_loss": 0.3865887522697449, "eval_runtime": 12.2404, "eval_samples_per_second": 1552.558, "eval_steps_per_second": 32.352, "step": 93100 }, { "epoch": 35.895953757225435, "grad_norm": 0.3006040155887604, "learning_rate": 5.641772639691715e-05, "loss": 0.33076980590820315, "step": 93150 }, { "epoch": 35.895953757225435, "eval_loss": 0.3848879635334015, "eval_runtime": 12.2674, "eval_samples_per_second": 1549.143, "eval_steps_per_second": 32.281, "step": 93150 }, { "epoch": 35.91522157996146, "grad_norm": 0.3177235722541809, "learning_rate": 5.634065510597303e-05, "loss": 0.33281124114990235, "step": 93200 }, { "epoch": 35.91522157996146, "eval_loss": 0.3872844874858856, "eval_runtime": 12.0788, "eval_samples_per_second": 1573.34, "eval_steps_per_second": 32.785, "step": 93200 }, { "epoch": 35.934489402697494, "grad_norm": 0.39580488204956055, "learning_rate": 5.6263583815028906e-05, "loss": 0.3290985870361328, "step": 93250 }, { "epoch": 35.934489402697494, "eval_loss": 0.37895816564559937, "eval_runtime": 12.4102, "eval_samples_per_second": 1531.321, "eval_steps_per_second": 31.909, "step": 93250 }, { "epoch": 35.95375722543353, "grad_norm": 0.3073520362377167, "learning_rate": 5.618651252408478e-05, "loss": 0.3269797897338867, "step": 93300 }, { "epoch": 35.95375722543353, "eval_loss": 0.39324769377708435, "eval_runtime": 12.4935, "eval_samples_per_second": 1521.113, "eval_steps_per_second": 31.697, "step": 93300 }, { "epoch": 35.97302504816955, "grad_norm": 0.29328101873397827, "learning_rate": 5.6109441233140656e-05, "loss": 0.3280996322631836, "step": 93350 }, { "epoch": 35.97302504816955, "eval_loss": 0.3873334527015686, "eval_runtime": 12.4753, "eval_samples_per_second": 1523.326, "eval_steps_per_second": 31.743, "step": 93350 }, { "epoch": 35.99229287090559, "grad_norm": 0.3160848617553711, "learning_rate": 5.603236994219654e-05, "loss": 0.3308690643310547, "step": 93400 }, { "epoch": 35.99229287090559, "eval_loss": 0.393159419298172, "eval_runtime": 12.2344, "eval_samples_per_second": 1553.323, "eval_steps_per_second": 32.368, "step": 93400 }, { "epoch": 36.01156069364162, "grad_norm": 0.3511022627353668, "learning_rate": 5.5955298651252405e-05, "loss": 0.3265379333496094, "step": 93450 }, { "epoch": 36.01156069364162, "eval_loss": 0.38520848751068115, "eval_runtime": 12.2812, "eval_samples_per_second": 1547.411, "eval_steps_per_second": 32.245, "step": 93450 }, { "epoch": 36.030828516377646, "grad_norm": 0.3043035864830017, "learning_rate": 5.587822736030829e-05, "loss": 0.33146865844726564, "step": 93500 }, { "epoch": 36.030828516377646, "eval_loss": 0.38520219922065735, "eval_runtime": 12.223, "eval_samples_per_second": 1554.778, "eval_steps_per_second": 32.398, "step": 93500 }, { "epoch": 36.05009633911368, "grad_norm": 0.3311373293399811, "learning_rate": 5.580115606936417e-05, "loss": 0.3260406494140625, "step": 93550 }, { "epoch": 36.05009633911368, "eval_loss": 0.38183778524398804, "eval_runtime": 12.486, "eval_samples_per_second": 1522.023, "eval_steps_per_second": 31.715, "step": 93550 }, { "epoch": 36.06936416184971, "grad_norm": 0.3456331491470337, "learning_rate": 5.572408477842004e-05, "loss": 0.3265385055541992, "step": 93600 }, { "epoch": 36.06936416184971, "eval_loss": 0.3883320391178131, "eval_runtime": 12.3792, "eval_samples_per_second": 1535.162, "eval_steps_per_second": 31.989, "step": 93600 }, { "epoch": 36.08863198458574, "grad_norm": 0.34360605478286743, "learning_rate": 5.564701348747592e-05, "loss": 0.3263379669189453, "step": 93650 }, { "epoch": 36.08863198458574, "eval_loss": 0.3762940466403961, "eval_runtime": 12.3737, "eval_samples_per_second": 1535.838, "eval_steps_per_second": 32.003, "step": 93650 }, { "epoch": 36.10789980732177, "grad_norm": 0.33395975828170776, "learning_rate": 5.5569942196531796e-05, "loss": 0.32583274841308596, "step": 93700 }, { "epoch": 36.10789980732177, "eval_loss": 0.3868345320224762, "eval_runtime": 12.536, "eval_samples_per_second": 1515.954, "eval_steps_per_second": 31.589, "step": 93700 }, { "epoch": 36.127167630057805, "grad_norm": 0.34991687536239624, "learning_rate": 5.549287090558767e-05, "loss": 0.327344970703125, "step": 93750 }, { "epoch": 36.127167630057805, "eval_loss": 0.38920578360557556, "eval_runtime": 12.1391, "eval_samples_per_second": 1565.518, "eval_steps_per_second": 32.622, "step": 93750 }, { "epoch": 36.14643545279383, "grad_norm": 0.3359109163284302, "learning_rate": 5.5415799614643545e-05, "loss": 0.32641555786132814, "step": 93800 }, { "epoch": 36.14643545279383, "eval_loss": 0.3866998255252838, "eval_runtime": 12.4461, "eval_samples_per_second": 1526.899, "eval_steps_per_second": 31.817, "step": 93800 }, { "epoch": 36.165703275529864, "grad_norm": 0.3192787766456604, "learning_rate": 5.533872832369943e-05, "loss": 0.32495429992675784, "step": 93850 }, { "epoch": 36.165703275529864, "eval_loss": 0.3829503059387207, "eval_runtime": 12.3669, "eval_samples_per_second": 1536.678, "eval_steps_per_second": 32.021, "step": 93850 }, { "epoch": 36.1849710982659, "grad_norm": 0.3170911371707916, "learning_rate": 5.52616570327553e-05, "loss": 0.32794769287109377, "step": 93900 }, { "epoch": 36.1849710982659, "eval_loss": 0.3765464723110199, "eval_runtime": 12.7249, "eval_samples_per_second": 1493.447, "eval_steps_per_second": 31.12, "step": 93900 }, { "epoch": 36.204238921001924, "grad_norm": 0.31286895275115967, "learning_rate": 5.518458574181118e-05, "loss": 0.32770332336425784, "step": 93950 }, { "epoch": 36.204238921001924, "eval_loss": 0.38451334834098816, "eval_runtime": 12.4078, "eval_samples_per_second": 1531.618, "eval_steps_per_second": 31.915, "step": 93950 }, { "epoch": 36.22350674373796, "grad_norm": 0.3234644830226898, "learning_rate": 5.510751445086706e-05, "loss": 0.326754150390625, "step": 94000 }, { "epoch": 36.22350674373796, "eval_loss": 0.3835383355617523, "eval_runtime": 12.5931, "eval_samples_per_second": 1509.084, "eval_steps_per_second": 31.446, "step": 94000 }, { "epoch": 36.24277456647399, "grad_norm": 0.3357420861721039, "learning_rate": 5.503044315992293e-05, "loss": 0.32602882385253906, "step": 94050 }, { "epoch": 36.24277456647399, "eval_loss": 0.39288952946662903, "eval_runtime": 12.0659, "eval_samples_per_second": 1575.02, "eval_steps_per_second": 32.82, "step": 94050 }, { "epoch": 36.262042389210016, "grad_norm": 0.3226640522480011, "learning_rate": 5.495337186897881e-05, "loss": 0.3301510238647461, "step": 94100 }, { "epoch": 36.262042389210016, "eval_loss": 0.3775346279144287, "eval_runtime": 12.2981, "eval_samples_per_second": 1545.284, "eval_steps_per_second": 32.2, "step": 94100 }, { "epoch": 36.28131021194605, "grad_norm": 0.3303546905517578, "learning_rate": 5.487630057803469e-05, "loss": 0.32282970428466795, "step": 94150 }, { "epoch": 36.28131021194605, "eval_loss": 0.3815577030181885, "eval_runtime": 12.4126, "eval_samples_per_second": 1531.027, "eval_steps_per_second": 31.903, "step": 94150 }, { "epoch": 36.30057803468208, "grad_norm": 0.29724082350730896, "learning_rate": 5.479922928709056e-05, "loss": 0.3254724884033203, "step": 94200 }, { "epoch": 36.30057803468208, "eval_loss": 0.38169682025909424, "eval_runtime": 12.3522, "eval_samples_per_second": 1538.505, "eval_steps_per_second": 32.059, "step": 94200 }, { "epoch": 36.31984585741811, "grad_norm": 0.33230093121528625, "learning_rate": 5.472215799614644e-05, "loss": 0.32769027709960935, "step": 94250 }, { "epoch": 36.31984585741811, "eval_loss": 0.388942688703537, "eval_runtime": 12.3638, "eval_samples_per_second": 1537.07, "eval_steps_per_second": 32.029, "step": 94250 }, { "epoch": 36.33911368015414, "grad_norm": 0.30012890696525574, "learning_rate": 5.464508670520232e-05, "loss": 0.32327945709228517, "step": 94300 }, { "epoch": 36.33911368015414, "eval_loss": 0.3808019757270813, "eval_runtime": 12.251, "eval_samples_per_second": 1551.218, "eval_steps_per_second": 32.324, "step": 94300 }, { "epoch": 36.358381502890175, "grad_norm": 0.3199288845062256, "learning_rate": 5.456801541425819e-05, "loss": 0.32635009765625, "step": 94350 }, { "epoch": 36.358381502890175, "eval_loss": 0.376923531293869, "eval_runtime": 12.3273, "eval_samples_per_second": 1541.62, "eval_steps_per_second": 32.124, "step": 94350 }, { "epoch": 36.3776493256262, "grad_norm": 0.33416637778282166, "learning_rate": 5.449094412331407e-05, "loss": 0.3258469772338867, "step": 94400 }, { "epoch": 36.3776493256262, "eval_loss": 0.3840233385562897, "eval_runtime": 12.3615, "eval_samples_per_second": 1537.351, "eval_steps_per_second": 32.035, "step": 94400 }, { "epoch": 36.396917148362235, "grad_norm": 0.3255612850189209, "learning_rate": 5.441387283236995e-05, "loss": 0.3294960403442383, "step": 94450 }, { "epoch": 36.396917148362235, "eval_loss": 0.3855111598968506, "eval_runtime": 12.2807, "eval_samples_per_second": 1547.471, "eval_steps_per_second": 32.246, "step": 94450 }, { "epoch": 36.41618497109827, "grad_norm": 0.325446218252182, "learning_rate": 5.433680154142582e-05, "loss": 0.3185654830932617, "step": 94500 }, { "epoch": 36.41618497109827, "eval_loss": 0.39206087589263916, "eval_runtime": 12.3528, "eval_samples_per_second": 1538.435, "eval_steps_per_second": 32.057, "step": 94500 }, { "epoch": 36.435452793834294, "grad_norm": 0.32647112011909485, "learning_rate": 5.42597302504817e-05, "loss": 0.32712139129638673, "step": 94550 }, { "epoch": 36.435452793834294, "eval_loss": 0.37852707505226135, "eval_runtime": 15.7338, "eval_samples_per_second": 1207.847, "eval_steps_per_second": 25.169, "step": 94550 }, { "epoch": 36.45472061657033, "grad_norm": 0.29917794466018677, "learning_rate": 5.418265895953758e-05, "loss": 0.32862945556640627, "step": 94600 }, { "epoch": 36.45472061657033, "eval_loss": 0.3839617669582367, "eval_runtime": 12.1882, "eval_samples_per_second": 1559.219, "eval_steps_per_second": 32.491, "step": 94600 }, { "epoch": 36.47398843930636, "grad_norm": 0.3462665379047394, "learning_rate": 5.4105587668593446e-05, "loss": 0.3298581314086914, "step": 94650 }, { "epoch": 36.47398843930636, "eval_loss": 0.390325665473938, "eval_runtime": 12.2589, "eval_samples_per_second": 1550.223, "eval_steps_per_second": 32.303, "step": 94650 }, { "epoch": 36.49325626204239, "grad_norm": 0.34780147671699524, "learning_rate": 5.402851637764933e-05, "loss": 0.32415672302246096, "step": 94700 }, { "epoch": 36.49325626204239, "eval_loss": 0.38648584485054016, "eval_runtime": 12.3687, "eval_samples_per_second": 1536.456, "eval_steps_per_second": 32.016, "step": 94700 }, { "epoch": 36.51252408477842, "grad_norm": 0.45523908734321594, "learning_rate": 5.395144508670521e-05, "loss": 0.3282521820068359, "step": 94750 }, { "epoch": 36.51252408477842, "eval_loss": 0.37696152925491333, "eval_runtime": 12.5088, "eval_samples_per_second": 1519.253, "eval_steps_per_second": 31.658, "step": 94750 }, { "epoch": 36.53179190751445, "grad_norm": 0.3533797562122345, "learning_rate": 5.387437379576108e-05, "loss": 0.3251274871826172, "step": 94800 }, { "epoch": 36.53179190751445, "eval_loss": 0.38812553882598877, "eval_runtime": 12.2857, "eval_samples_per_second": 1546.844, "eval_steps_per_second": 32.233, "step": 94800 }, { "epoch": 36.55105973025048, "grad_norm": 0.35232996940612793, "learning_rate": 5.379730250481696e-05, "loss": 0.3264775848388672, "step": 94850 }, { "epoch": 36.55105973025048, "eval_loss": 0.3858183026313782, "eval_runtime": 12.2714, "eval_samples_per_second": 1548.643, "eval_steps_per_second": 32.27, "step": 94850 }, { "epoch": 36.57032755298651, "grad_norm": 0.2999281883239746, "learning_rate": 5.3720231213872844e-05, "loss": 0.32776565551757814, "step": 94900 }, { "epoch": 36.57032755298651, "eval_loss": 0.382784366607666, "eval_runtime": 12.0659, "eval_samples_per_second": 1575.019, "eval_steps_per_second": 32.82, "step": 94900 }, { "epoch": 36.589595375722546, "grad_norm": 0.29279327392578125, "learning_rate": 5.364315992292871e-05, "loss": 0.32732784271240234, "step": 94950 }, { "epoch": 36.589595375722546, "eval_loss": 0.38369306921958923, "eval_runtime": 12.3017, "eval_samples_per_second": 1544.824, "eval_steps_per_second": 32.191, "step": 94950 }, { "epoch": 36.60886319845857, "grad_norm": 0.342729389667511, "learning_rate": 5.356608863198459e-05, "loss": 0.32045135498046873, "step": 95000 }, { "epoch": 36.60886319845857, "eval_loss": 0.3898096978664398, "eval_runtime": 12.3343, "eval_samples_per_second": 1540.75, "eval_steps_per_second": 32.106, "step": 95000 }, { "epoch": 36.628131021194605, "grad_norm": 0.29523158073425293, "learning_rate": 5.348901734104047e-05, "loss": 0.31970123291015623, "step": 95050 }, { "epoch": 36.628131021194605, "eval_loss": 0.3814075291156769, "eval_runtime": 12.3294, "eval_samples_per_second": 1541.352, "eval_steps_per_second": 32.118, "step": 95050 }, { "epoch": 36.64739884393064, "grad_norm": 0.34183481335639954, "learning_rate": 5.341194605009634e-05, "loss": 0.32104393005371096, "step": 95100 }, { "epoch": 36.64739884393064, "eval_loss": 0.38014283776283264, "eval_runtime": 12.2094, "eval_samples_per_second": 1556.503, "eval_steps_per_second": 32.434, "step": 95100 }, { "epoch": 36.666666666666664, "grad_norm": 0.3634852468967438, "learning_rate": 5.333487475915222e-05, "loss": 0.3273652648925781, "step": 95150 }, { "epoch": 36.666666666666664, "eval_loss": 0.3862748146057129, "eval_runtime": 12.5884, "eval_samples_per_second": 1509.638, "eval_steps_per_second": 31.457, "step": 95150 }, { "epoch": 36.6859344894027, "grad_norm": 0.3237438201904297, "learning_rate": 5.325780346820809e-05, "loss": 0.32427139282226564, "step": 95200 }, { "epoch": 36.6859344894027, "eval_loss": 0.3862474858760834, "eval_runtime": 12.0351, "eval_samples_per_second": 1579.053, "eval_steps_per_second": 32.904, "step": 95200 }, { "epoch": 36.70520231213873, "grad_norm": 0.31385213136672974, "learning_rate": 5.318073217726397e-05, "loss": 0.32432716369628906, "step": 95250 }, { "epoch": 36.70520231213873, "eval_loss": 0.387704998254776, "eval_runtime": 12.3737, "eval_samples_per_second": 1535.84, "eval_steps_per_second": 32.003, "step": 95250 }, { "epoch": 36.72447013487476, "grad_norm": 0.35812127590179443, "learning_rate": 5.310366088631985e-05, "loss": 0.32345924377441404, "step": 95300 }, { "epoch": 36.72447013487476, "eval_loss": 0.389710396528244, "eval_runtime": 12.3519, "eval_samples_per_second": 1538.554, "eval_steps_per_second": 32.06, "step": 95300 }, { "epoch": 36.74373795761079, "grad_norm": 0.3412971496582031, "learning_rate": 5.302658959537572e-05, "loss": 0.32830253601074216, "step": 95350 }, { "epoch": 36.74373795761079, "eval_loss": 0.3921198546886444, "eval_runtime": 12.4349, "eval_samples_per_second": 1528.275, "eval_steps_per_second": 31.846, "step": 95350 }, { "epoch": 36.76300578034682, "grad_norm": 0.36888304352760315, "learning_rate": 5.29495183044316e-05, "loss": 0.32585922241210935, "step": 95400 }, { "epoch": 36.76300578034682, "eval_loss": 0.3946174681186676, "eval_runtime": 12.219, "eval_samples_per_second": 1555.283, "eval_steps_per_second": 32.409, "step": 95400 }, { "epoch": 36.78227360308285, "grad_norm": 0.33024862408638, "learning_rate": 5.287244701348748e-05, "loss": 0.3294694900512695, "step": 95450 }, { "epoch": 36.78227360308285, "eval_loss": 0.38429775834083557, "eval_runtime": 12.3568, "eval_samples_per_second": 1537.938, "eval_steps_per_second": 32.047, "step": 95450 }, { "epoch": 36.80154142581888, "grad_norm": 0.38877061009407043, "learning_rate": 5.279537572254335e-05, "loss": 0.3219462966918945, "step": 95500 }, { "epoch": 36.80154142581888, "eval_loss": 0.39081257581710815, "eval_runtime": 12.5277, "eval_samples_per_second": 1516.96, "eval_steps_per_second": 31.61, "step": 95500 }, { "epoch": 36.820809248554916, "grad_norm": 0.31259748339653015, "learning_rate": 5.271830443159923e-05, "loss": 0.3246589660644531, "step": 95550 }, { "epoch": 36.820809248554916, "eval_loss": 0.39233821630477905, "eval_runtime": 12.4552, "eval_samples_per_second": 1525.784, "eval_steps_per_second": 31.794, "step": 95550 }, { "epoch": 36.84007707129094, "grad_norm": 0.3273388743400574, "learning_rate": 5.264123314065511e-05, "loss": 0.3221525573730469, "step": 95600 }, { "epoch": 36.84007707129094, "eval_loss": 0.3852018117904663, "eval_runtime": 12.3809, "eval_samples_per_second": 1534.94, "eval_steps_per_second": 31.985, "step": 95600 }, { "epoch": 36.859344894026975, "grad_norm": 0.32565194368362427, "learning_rate": 5.256416184971098e-05, "loss": 0.323927001953125, "step": 95650 }, { "epoch": 36.859344894026975, "eval_loss": 0.39105942845344543, "eval_runtime": 12.3875, "eval_samples_per_second": 1534.128, "eval_steps_per_second": 31.968, "step": 95650 }, { "epoch": 36.87861271676301, "grad_norm": 0.35268473625183105, "learning_rate": 5.248709055876686e-05, "loss": 0.3210253524780273, "step": 95700 }, { "epoch": 36.87861271676301, "eval_loss": 0.3887671232223511, "eval_runtime": 12.4487, "eval_samples_per_second": 1526.581, "eval_steps_per_second": 31.81, "step": 95700 }, { "epoch": 36.897880539499035, "grad_norm": 0.2997651994228363, "learning_rate": 5.241001926782274e-05, "loss": 0.32775646209716797, "step": 95750 }, { "epoch": 36.897880539499035, "eval_loss": 0.3886624574661255, "eval_runtime": 12.0794, "eval_samples_per_second": 1573.263, "eval_steps_per_second": 32.783, "step": 95750 }, { "epoch": 36.91714836223507, "grad_norm": 0.3446139991283417, "learning_rate": 5.233294797687861e-05, "loss": 0.3292439651489258, "step": 95800 }, { "epoch": 36.91714836223507, "eval_loss": 0.389981210231781, "eval_runtime": 12.2286, "eval_samples_per_second": 1554.064, "eval_steps_per_second": 32.383, "step": 95800 }, { "epoch": 36.9364161849711, "grad_norm": 0.4170444905757904, "learning_rate": 5.2255876685934494e-05, "loss": 0.3252682113647461, "step": 95850 }, { "epoch": 36.9364161849711, "eval_loss": 0.38564082980155945, "eval_runtime": 12.2854, "eval_samples_per_second": 1546.875, "eval_steps_per_second": 32.233, "step": 95850 }, { "epoch": 36.95568400770713, "grad_norm": 0.31309282779693604, "learning_rate": 5.217880539499037e-05, "loss": 0.32634246826171875, "step": 95900 }, { "epoch": 36.95568400770713, "eval_loss": 0.38342446088790894, "eval_runtime": 14.694, "eval_samples_per_second": 1293.316, "eval_steps_per_second": 26.95, "step": 95900 }, { "epoch": 36.97495183044316, "grad_norm": 0.34742844104766846, "learning_rate": 5.210173410404624e-05, "loss": 0.3275069427490234, "step": 95950 }, { "epoch": 36.97495183044316, "eval_loss": 0.37761858105659485, "eval_runtime": 12.3406, "eval_samples_per_second": 1539.959, "eval_steps_per_second": 32.089, "step": 95950 }, { "epoch": 36.994219653179194, "grad_norm": 0.31900736689567566, "learning_rate": 5.202466281310212e-05, "loss": 0.3241277313232422, "step": 96000 }, { "epoch": 36.994219653179194, "eval_loss": 0.38885417580604553, "eval_runtime": 12.4014, "eval_samples_per_second": 1532.405, "eval_steps_per_second": 31.932, "step": 96000 }, { "epoch": 37.01348747591522, "grad_norm": 0.34900158643722534, "learning_rate": 5.1947591522158e-05, "loss": 0.3257408142089844, "step": 96050 }, { "epoch": 37.01348747591522, "eval_loss": 0.3759954273700714, "eval_runtime": 12.5794, "eval_samples_per_second": 1510.719, "eval_steps_per_second": 31.48, "step": 96050 }, { "epoch": 37.03275529865125, "grad_norm": 0.2962462306022644, "learning_rate": 5.187052023121387e-05, "loss": 0.32645645141601565, "step": 96100 }, { "epoch": 37.03275529865125, "eval_loss": 0.38972207903862, "eval_runtime": 12.3901, "eval_samples_per_second": 1533.803, "eval_steps_per_second": 31.961, "step": 96100 }, { "epoch": 37.052023121387286, "grad_norm": 0.34481170773506165, "learning_rate": 5.179344894026975e-05, "loss": 0.324489860534668, "step": 96150 }, { "epoch": 37.052023121387286, "eval_loss": 0.3866436779499054, "eval_runtime": 12.3203, "eval_samples_per_second": 1542.497, "eval_steps_per_second": 32.142, "step": 96150 }, { "epoch": 37.07129094412331, "grad_norm": 0.3174974322319031, "learning_rate": 5.1716377649325634e-05, "loss": 0.32341217041015624, "step": 96200 }, { "epoch": 37.07129094412331, "eval_loss": 0.38022610545158386, "eval_runtime": 12.2065, "eval_samples_per_second": 1556.877, "eval_steps_per_second": 32.442, "step": 96200 }, { "epoch": 37.090558766859345, "grad_norm": 0.34391117095947266, "learning_rate": 5.16393063583815e-05, "loss": 0.31973785400390625, "step": 96250 }, { "epoch": 37.090558766859345, "eval_loss": 0.38111862540245056, "eval_runtime": 12.2069, "eval_samples_per_second": 1556.83, "eval_steps_per_second": 32.441, "step": 96250 }, { "epoch": 37.10982658959538, "grad_norm": 0.3417455852031708, "learning_rate": 5.1562235067437383e-05, "loss": 0.3256679916381836, "step": 96300 }, { "epoch": 37.10982658959538, "eval_loss": 0.389210045337677, "eval_runtime": 12.1192, "eval_samples_per_second": 1568.088, "eval_steps_per_second": 32.675, "step": 96300 }, { "epoch": 37.129094412331405, "grad_norm": 0.32720425724983215, "learning_rate": 5.148516377649326e-05, "loss": 0.32634754180908204, "step": 96350 }, { "epoch": 37.129094412331405, "eval_loss": 0.3814832270145416, "eval_runtime": 12.3673, "eval_samples_per_second": 1536.636, "eval_steps_per_second": 32.02, "step": 96350 }, { "epoch": 37.14836223506744, "grad_norm": 0.3985839784145355, "learning_rate": 5.140809248554913e-05, "loss": 0.3246780014038086, "step": 96400 }, { "epoch": 37.14836223506744, "eval_loss": 0.3840966820716858, "eval_runtime": 12.2215, "eval_samples_per_second": 1554.967, "eval_steps_per_second": 32.402, "step": 96400 }, { "epoch": 37.16763005780347, "grad_norm": 0.3049646317958832, "learning_rate": 5.133102119460501e-05, "loss": 0.3254716491699219, "step": 96450 }, { "epoch": 37.16763005780347, "eval_loss": 0.3893960118293762, "eval_runtime": 12.2394, "eval_samples_per_second": 1552.687, "eval_steps_per_second": 32.354, "step": 96450 }, { "epoch": 37.1868978805395, "grad_norm": 0.34614068269729614, "learning_rate": 5.125394990366089e-05, "loss": 0.3244920349121094, "step": 96500 }, { "epoch": 37.1868978805395, "eval_loss": 0.3806404173374176, "eval_runtime": 12.2161, "eval_samples_per_second": 1555.65, "eval_steps_per_second": 32.416, "step": 96500 }, { "epoch": 37.20616570327553, "grad_norm": 0.3153478503227234, "learning_rate": 5.117687861271676e-05, "loss": 0.3276688766479492, "step": 96550 }, { "epoch": 37.20616570327553, "eval_loss": 0.3887615501880646, "eval_runtime": 14.7598, "eval_samples_per_second": 1287.556, "eval_steps_per_second": 26.83, "step": 96550 }, { "epoch": 37.225433526011564, "grad_norm": 0.31658944487571716, "learning_rate": 5.109980732177264e-05, "loss": 0.3285929489135742, "step": 96600 }, { "epoch": 37.225433526011564, "eval_loss": 0.38573041558265686, "eval_runtime": 12.0832, "eval_samples_per_second": 1572.763, "eval_steps_per_second": 32.773, "step": 96600 }, { "epoch": 37.24470134874759, "grad_norm": 0.30299386382102966, "learning_rate": 5.1022736030828523e-05, "loss": 0.32333892822265625, "step": 96650 }, { "epoch": 37.24470134874759, "eval_loss": 0.38760167360305786, "eval_runtime": 12.3466, "eval_samples_per_second": 1539.212, "eval_steps_per_second": 32.074, "step": 96650 }, { "epoch": 37.26396917148362, "grad_norm": 0.33455100655555725, "learning_rate": 5.094566473988439e-05, "loss": 0.3257026672363281, "step": 96700 }, { "epoch": 37.26396917148362, "eval_loss": 0.3787505030632019, "eval_runtime": 12.3074, "eval_samples_per_second": 1544.11, "eval_steps_per_second": 32.176, "step": 96700 }, { "epoch": 37.283236994219656, "grad_norm": 0.31294846534729004, "learning_rate": 5.086859344894027e-05, "loss": 0.32590877532958984, "step": 96750 }, { "epoch": 37.283236994219656, "eval_loss": 0.3792029917240143, "eval_runtime": 12.2103, "eval_samples_per_second": 1556.395, "eval_steps_per_second": 32.432, "step": 96750 }, { "epoch": 37.30250481695568, "grad_norm": 0.35594484210014343, "learning_rate": 5.079152215799615e-05, "loss": 0.3226111602783203, "step": 96800 }, { "epoch": 37.30250481695568, "eval_loss": 0.38425570726394653, "eval_runtime": 12.2199, "eval_samples_per_second": 1555.167, "eval_steps_per_second": 32.406, "step": 96800 }, { "epoch": 37.321772639691716, "grad_norm": 0.33544138073921204, "learning_rate": 5.071445086705202e-05, "loss": 0.3194586944580078, "step": 96850 }, { "epoch": 37.321772639691716, "eval_loss": 0.37934601306915283, "eval_runtime": 12.2795, "eval_samples_per_second": 1547.618, "eval_steps_per_second": 32.249, "step": 96850 }, { "epoch": 37.34104046242775, "grad_norm": 0.3462146520614624, "learning_rate": 5.06373795761079e-05, "loss": 0.3198286819458008, "step": 96900 }, { "epoch": 37.34104046242775, "eval_loss": 0.3800748586654663, "eval_runtime": 12.0756, "eval_samples_per_second": 1573.757, "eval_steps_per_second": 32.794, "step": 96900 }, { "epoch": 37.360308285163775, "grad_norm": 0.3504232168197632, "learning_rate": 5.0560308285163785e-05, "loss": 0.3279592132568359, "step": 96950 }, { "epoch": 37.360308285163775, "eval_loss": 0.38064518570899963, "eval_runtime": 12.3062, "eval_samples_per_second": 1544.267, "eval_steps_per_second": 32.179, "step": 96950 }, { "epoch": 37.37957610789981, "grad_norm": 0.3228347897529602, "learning_rate": 5.048323699421965e-05, "loss": 0.32754165649414063, "step": 97000 }, { "epoch": 37.37957610789981, "eval_loss": 0.38433125615119934, "eval_runtime": 12.2953, "eval_samples_per_second": 1545.631, "eval_steps_per_second": 32.207, "step": 97000 }, { "epoch": 37.39884393063584, "grad_norm": 0.34631940722465515, "learning_rate": 5.0406165703275535e-05, "loss": 0.3239423751831055, "step": 97050 }, { "epoch": 37.39884393063584, "eval_loss": 0.3743809461593628, "eval_runtime": 14.2104, "eval_samples_per_second": 1337.333, "eval_steps_per_second": 27.867, "step": 97050 }, { "epoch": 37.41811175337187, "grad_norm": 0.36051034927368164, "learning_rate": 5.032909441233141e-05, "loss": 0.3266269302368164, "step": 97100 }, { "epoch": 37.41811175337187, "eval_loss": 0.3809479773044586, "eval_runtime": 12.3925, "eval_samples_per_second": 1533.512, "eval_steps_per_second": 31.955, "step": 97100 }, { "epoch": 37.4373795761079, "grad_norm": 0.31952711939811707, "learning_rate": 5.0252023121387284e-05, "loss": 0.325677490234375, "step": 97150 }, { "epoch": 37.4373795761079, "eval_loss": 0.3840530514717102, "eval_runtime": 12.2016, "eval_samples_per_second": 1557.501, "eval_steps_per_second": 32.455, "step": 97150 }, { "epoch": 37.456647398843934, "grad_norm": 0.33152368664741516, "learning_rate": 5.017495183044316e-05, "loss": 0.31972469329833986, "step": 97200 }, { "epoch": 37.456647398843934, "eval_loss": 0.38957932591438293, "eval_runtime": 12.5216, "eval_samples_per_second": 1517.703, "eval_steps_per_second": 31.625, "step": 97200 }, { "epoch": 37.47591522157996, "grad_norm": 0.33769261837005615, "learning_rate": 5.009788053949904e-05, "loss": 0.32045345306396483, "step": 97250 }, { "epoch": 37.47591522157996, "eval_loss": 0.38402286171913147, "eval_runtime": 12.2158, "eval_samples_per_second": 1555.689, "eval_steps_per_second": 32.417, "step": 97250 }, { "epoch": 37.49518304431599, "grad_norm": 0.3398774266242981, "learning_rate": 5.002080924855491e-05, "loss": 0.3204893493652344, "step": 97300 }, { "epoch": 37.49518304431599, "eval_loss": 0.3847043812274933, "eval_runtime": 12.2406, "eval_samples_per_second": 1552.542, "eval_steps_per_second": 32.351, "step": 97300 }, { "epoch": 37.51445086705202, "grad_norm": 0.36315369606018066, "learning_rate": 4.994373795761079e-05, "loss": 0.3243041229248047, "step": 97350 }, { "epoch": 37.51445086705202, "eval_loss": 0.3888672888278961, "eval_runtime": 12.2969, "eval_samples_per_second": 1545.434, "eval_steps_per_second": 32.203, "step": 97350 }, { "epoch": 37.53371868978805, "grad_norm": 0.3414086699485779, "learning_rate": 4.986666666666667e-05, "loss": 0.3193508720397949, "step": 97400 }, { "epoch": 37.53371868978805, "eval_loss": 0.3886553943157196, "eval_runtime": 12.2383, "eval_samples_per_second": 1552.835, "eval_steps_per_second": 32.358, "step": 97400 }, { "epoch": 37.552986512524086, "grad_norm": 0.31254100799560547, "learning_rate": 4.9789595375722546e-05, "loss": 0.3215216827392578, "step": 97450 }, { "epoch": 37.552986512524086, "eval_loss": 0.3860328197479248, "eval_runtime": 12.1657, "eval_samples_per_second": 1562.091, "eval_steps_per_second": 32.55, "step": 97450 }, { "epoch": 37.57225433526011, "grad_norm": 0.3267834782600403, "learning_rate": 4.9712524084778424e-05, "loss": 0.31902631759643557, "step": 97500 }, { "epoch": 37.57225433526011, "eval_loss": 0.3819849193096161, "eval_runtime": 12.2397, "eval_samples_per_second": 1552.651, "eval_steps_per_second": 32.354, "step": 97500 }, { "epoch": 37.591522157996145, "grad_norm": 0.3192937672138214, "learning_rate": 4.9635452793834296e-05, "loss": 0.32604610443115234, "step": 97550 }, { "epoch": 37.591522157996145, "eval_loss": 0.3839585483074188, "eval_runtime": 12.2846, "eval_samples_per_second": 1546.974, "eval_steps_per_second": 32.235, "step": 97550 }, { "epoch": 37.61078998073218, "grad_norm": 0.3499439060688019, "learning_rate": 4.955838150289018e-05, "loss": 0.32703079223632814, "step": 97600 }, { "epoch": 37.61078998073218, "eval_loss": 0.3943444788455963, "eval_runtime": 12.3437, "eval_samples_per_second": 1539.567, "eval_steps_per_second": 32.081, "step": 97600 }, { "epoch": 37.630057803468205, "grad_norm": 0.3226899802684784, "learning_rate": 4.948131021194605e-05, "loss": 0.31930496215820314, "step": 97650 }, { "epoch": 37.630057803468205, "eval_loss": 0.385478675365448, "eval_runtime": 12.3096, "eval_samples_per_second": 1543.835, "eval_steps_per_second": 32.17, "step": 97650 }, { "epoch": 37.64932562620424, "grad_norm": 0.32752248644828796, "learning_rate": 4.940423892100193e-05, "loss": 0.3245128631591797, "step": 97700 }, { "epoch": 37.64932562620424, "eval_loss": 0.388528972864151, "eval_runtime": 12.2743, "eval_samples_per_second": 1548.272, "eval_steps_per_second": 32.262, "step": 97700 }, { "epoch": 37.66859344894027, "grad_norm": 0.29661041498184204, "learning_rate": 4.932716763005781e-05, "loss": 0.32444320678710936, "step": 97750 }, { "epoch": 37.66859344894027, "eval_loss": 0.38179507851600647, "eval_runtime": 12.096, "eval_samples_per_second": 1571.092, "eval_steps_per_second": 32.738, "step": 97750 }, { "epoch": 37.6878612716763, "grad_norm": 0.31166842579841614, "learning_rate": 4.925009633911368e-05, "loss": 0.3311893081665039, "step": 97800 }, { "epoch": 37.6878612716763, "eval_loss": 0.3872106373310089, "eval_runtime": 12.2488, "eval_samples_per_second": 1551.494, "eval_steps_per_second": 32.33, "step": 97800 }, { "epoch": 37.70712909441233, "grad_norm": 0.3044958710670471, "learning_rate": 4.917302504816956e-05, "loss": 0.32142173767089843, "step": 97850 }, { "epoch": 37.70712909441233, "eval_loss": 0.38128209114074707, "eval_runtime": 12.2393, "eval_samples_per_second": 1552.706, "eval_steps_per_second": 32.355, "step": 97850 }, { "epoch": 37.726396917148364, "grad_norm": 0.3460747301578522, "learning_rate": 4.9095953757225436e-05, "loss": 0.32231548309326175, "step": 97900 }, { "epoch": 37.726396917148364, "eval_loss": 0.38306543231010437, "eval_runtime": 12.289, "eval_samples_per_second": 1546.425, "eval_steps_per_second": 32.224, "step": 97900 }, { "epoch": 37.74566473988439, "grad_norm": 0.29722946882247925, "learning_rate": 4.9018882466281314e-05, "loss": 0.32386127471923826, "step": 97950 }, { "epoch": 37.74566473988439, "eval_loss": 0.3828566372394562, "eval_runtime": 12.2211, "eval_samples_per_second": 1555.014, "eval_steps_per_second": 32.403, "step": 97950 }, { "epoch": 37.76493256262042, "grad_norm": 0.3174768388271332, "learning_rate": 4.8941811175337185e-05, "loss": 0.32597251892089846, "step": 98000 }, { "epoch": 37.76493256262042, "eval_loss": 0.3903764486312866, "eval_runtime": 12.2263, "eval_samples_per_second": 1554.356, "eval_steps_per_second": 32.389, "step": 98000 }, { "epoch": 37.784200385356456, "grad_norm": 0.36756718158721924, "learning_rate": 4.886473988439307e-05, "loss": 0.325155029296875, "step": 98050 }, { "epoch": 37.784200385356456, "eval_loss": 0.3849199116230011, "eval_runtime": 12.4217, "eval_samples_per_second": 1529.897, "eval_steps_per_second": 31.88, "step": 98050 }, { "epoch": 37.80346820809248, "grad_norm": 0.3421362340450287, "learning_rate": 4.878766859344894e-05, "loss": 0.32327789306640625, "step": 98100 }, { "epoch": 37.80346820809248, "eval_loss": 0.3815121054649353, "eval_runtime": 12.3018, "eval_samples_per_second": 1544.814, "eval_steps_per_second": 32.19, "step": 98100 }, { "epoch": 37.822736030828516, "grad_norm": 0.34832021594047546, "learning_rate": 4.871059730250482e-05, "loss": 0.3235137939453125, "step": 98150 }, { "epoch": 37.822736030828516, "eval_loss": 0.38436585664749146, "eval_runtime": 12.3585, "eval_samples_per_second": 1537.732, "eval_steps_per_second": 32.043, "step": 98150 }, { "epoch": 37.84200385356455, "grad_norm": 0.3324728310108185, "learning_rate": 4.86335260115607e-05, "loss": 0.32023536682128906, "step": 98200 }, { "epoch": 37.84200385356455, "eval_loss": 0.3770127296447754, "eval_runtime": 12.3647, "eval_samples_per_second": 1536.954, "eval_steps_per_second": 32.027, "step": 98200 }, { "epoch": 37.861271676300575, "grad_norm": 0.3780096471309662, "learning_rate": 4.8556454720616576e-05, "loss": 0.3257052993774414, "step": 98250 }, { "epoch": 37.861271676300575, "eval_loss": 0.38297420740127563, "eval_runtime": 12.5184, "eval_samples_per_second": 1518.089, "eval_steps_per_second": 31.634, "step": 98250 }, { "epoch": 37.88053949903661, "grad_norm": 0.34781530499458313, "learning_rate": 4.847938342967245e-05, "loss": 0.32463050842285157, "step": 98300 }, { "epoch": 37.88053949903661, "eval_loss": 0.38354015350341797, "eval_runtime": 12.1471, "eval_samples_per_second": 1564.487, "eval_steps_per_second": 32.6, "step": 98300 }, { "epoch": 37.89980732177264, "grad_norm": 0.2958250343799591, "learning_rate": 4.8402312138728325e-05, "loss": 0.32621566772460936, "step": 98350 }, { "epoch": 37.89980732177264, "eval_loss": 0.3823920488357544, "eval_runtime": 12.2316, "eval_samples_per_second": 1553.686, "eval_steps_per_second": 32.375, "step": 98350 }, { "epoch": 37.91907514450867, "grad_norm": 0.34966540336608887, "learning_rate": 4.83252408477842e-05, "loss": 0.32661674499511717, "step": 98400 }, { "epoch": 37.91907514450867, "eval_loss": 0.382882684469223, "eval_runtime": 12.2949, "eval_samples_per_second": 1545.682, "eval_steps_per_second": 32.208, "step": 98400 }, { "epoch": 37.9383429672447, "grad_norm": 0.3552516996860504, "learning_rate": 4.824816955684008e-05, "loss": 0.3297817611694336, "step": 98450 }, { "epoch": 37.9383429672447, "eval_loss": 0.3845005929470062, "eval_runtime": 13.2852, "eval_samples_per_second": 1430.463, "eval_steps_per_second": 29.808, "step": 98450 }, { "epoch": 37.957610789980734, "grad_norm": 0.3460848033428192, "learning_rate": 4.817109826589596e-05, "loss": 0.324522590637207, "step": 98500 }, { "epoch": 37.957610789980734, "eval_loss": 0.38350972533226013, "eval_runtime": 12.2518, "eval_samples_per_second": 1551.124, "eval_steps_per_second": 32.322, "step": 98500 }, { "epoch": 37.97687861271676, "grad_norm": 0.3577321171760559, "learning_rate": 4.809402697495183e-05, "loss": 0.3220881271362305, "step": 98550 }, { "epoch": 37.97687861271676, "eval_loss": 0.3847104012966156, "eval_runtime": 12.236, "eval_samples_per_second": 1553.118, "eval_steps_per_second": 32.363, "step": 98550 }, { "epoch": 37.99614643545279, "grad_norm": 0.3706432580947876, "learning_rate": 4.801695568400771e-05, "loss": 0.32336318969726563, "step": 98600 }, { "epoch": 37.99614643545279, "eval_loss": 0.38202714920043945, "eval_runtime": 12.0923, "eval_samples_per_second": 1571.58, "eval_steps_per_second": 32.748, "step": 98600 }, { "epoch": 38.01541425818883, "grad_norm": 0.3555866777896881, "learning_rate": 4.793988439306359e-05, "loss": 0.3258026885986328, "step": 98650 }, { "epoch": 38.01541425818883, "eval_loss": 0.3847752809524536, "eval_runtime": 12.4581, "eval_samples_per_second": 1525.437, "eval_steps_per_second": 31.787, "step": 98650 }, { "epoch": 38.03468208092485, "grad_norm": 0.39026984572410583, "learning_rate": 4.7862813102119465e-05, "loss": 0.32374229431152346, "step": 98700 }, { "epoch": 38.03468208092485, "eval_loss": 0.3835687041282654, "eval_runtime": 12.4773, "eval_samples_per_second": 1523.081, "eval_steps_per_second": 31.738, "step": 98700 }, { "epoch": 38.053949903660886, "grad_norm": 0.3030640184879303, "learning_rate": 4.7785741811175337e-05, "loss": 0.3252749252319336, "step": 98750 }, { "epoch": 38.053949903660886, "eval_loss": 0.3859764337539673, "eval_runtime": 12.3832, "eval_samples_per_second": 1534.665, "eval_steps_per_second": 31.979, "step": 98750 }, { "epoch": 38.07321772639692, "grad_norm": 0.34602755308151245, "learning_rate": 4.7708670520231215e-05, "loss": 0.32335662841796875, "step": 98800 }, { "epoch": 38.07321772639692, "eval_loss": 0.38637015223503113, "eval_runtime": 12.3624, "eval_samples_per_second": 1537.239, "eval_steps_per_second": 32.033, "step": 98800 }, { "epoch": 38.092485549132945, "grad_norm": 0.2917664051055908, "learning_rate": 4.763159922928709e-05, "loss": 0.3256111145019531, "step": 98850 }, { "epoch": 38.092485549132945, "eval_loss": 0.3872085213661194, "eval_runtime": 12.273, "eval_samples_per_second": 1548.443, "eval_steps_per_second": 32.266, "step": 98850 }, { "epoch": 38.11175337186898, "grad_norm": 0.32019370794296265, "learning_rate": 4.755452793834297e-05, "loss": 0.32635734558105467, "step": 98900 }, { "epoch": 38.11175337186898, "eval_loss": 0.3883402347564697, "eval_runtime": 12.2395, "eval_samples_per_second": 1552.676, "eval_steps_per_second": 32.354, "step": 98900 }, { "epoch": 38.13102119460501, "grad_norm": 0.33041924238204956, "learning_rate": 4.747745664739884e-05, "loss": 0.3242264938354492, "step": 98950 }, { "epoch": 38.13102119460501, "eval_loss": 0.3778841197490692, "eval_runtime": 12.4063, "eval_samples_per_second": 1531.807, "eval_steps_per_second": 31.919, "step": 98950 }, { "epoch": 38.15028901734104, "grad_norm": 0.3569977581501007, "learning_rate": 4.740038535645473e-05, "loss": 0.3254582595825195, "step": 99000 }, { "epoch": 38.15028901734104, "eval_loss": 0.38275086879730225, "eval_runtime": 12.2438, "eval_samples_per_second": 1552.13, "eval_steps_per_second": 32.343, "step": 99000 }, { "epoch": 38.16955684007707, "grad_norm": 0.3465358316898346, "learning_rate": 4.73233140655106e-05, "loss": 0.32075416564941406, "step": 99050 }, { "epoch": 38.16955684007707, "eval_loss": 0.38666999340057373, "eval_runtime": 12.3708, "eval_samples_per_second": 1536.199, "eval_steps_per_second": 32.011, "step": 99050 }, { "epoch": 38.188824662813104, "grad_norm": 0.3328206241130829, "learning_rate": 4.7246242774566477e-05, "loss": 0.32292030334472654, "step": 99100 }, { "epoch": 38.188824662813104, "eval_loss": 0.3776254653930664, "eval_runtime": 12.3537, "eval_samples_per_second": 1538.33, "eval_steps_per_second": 32.055, "step": 99100 }, { "epoch": 38.20809248554913, "grad_norm": 0.3295062184333801, "learning_rate": 4.7169171483622355e-05, "loss": 0.3216088104248047, "step": 99150 }, { "epoch": 38.20809248554913, "eval_loss": 0.38292554020881653, "eval_runtime": 12.0686, "eval_samples_per_second": 1574.661, "eval_steps_per_second": 32.812, "step": 99150 }, { "epoch": 38.227360308285164, "grad_norm": 0.3398779630661011, "learning_rate": 4.7092100192678226e-05, "loss": 0.3226991271972656, "step": 99200 }, { "epoch": 38.227360308285164, "eval_loss": 0.3855631947517395, "eval_runtime": 12.3466, "eval_samples_per_second": 1539.209, "eval_steps_per_second": 32.074, "step": 99200 }, { "epoch": 38.2466281310212, "grad_norm": 0.30484309792518616, "learning_rate": 4.7015028901734104e-05, "loss": 0.32372245788574217, "step": 99250 }, { "epoch": 38.2466281310212, "eval_loss": 0.39195898175239563, "eval_runtime": 12.3236, "eval_samples_per_second": 1542.078, "eval_steps_per_second": 32.133, "step": 99250 }, { "epoch": 38.26589595375722, "grad_norm": 0.3171716332435608, "learning_rate": 4.693795761078998e-05, "loss": 0.3209469604492188, "step": 99300 }, { "epoch": 38.26589595375722, "eval_loss": 0.38332825899124146, "eval_runtime": 12.3412, "eval_samples_per_second": 1539.883, "eval_steps_per_second": 32.088, "step": 99300 }, { "epoch": 38.285163776493256, "grad_norm": 0.3266012370586395, "learning_rate": 4.686088631984586e-05, "loss": 0.32518787384033204, "step": 99350 }, { "epoch": 38.285163776493256, "eval_loss": 0.38413751125335693, "eval_runtime": 12.3351, "eval_samples_per_second": 1540.646, "eval_steps_per_second": 32.104, "step": 99350 }, { "epoch": 38.30443159922929, "grad_norm": 0.31703147292137146, "learning_rate": 4.678381502890173e-05, "loss": 0.32363300323486327, "step": 99400 }, { "epoch": 38.30443159922929, "eval_loss": 0.3785512149333954, "eval_runtime": 12.2159, "eval_samples_per_second": 1555.678, "eval_steps_per_second": 32.417, "step": 99400 }, { "epoch": 38.323699421965316, "grad_norm": 0.35744360089302063, "learning_rate": 4.670674373795762e-05, "loss": 0.3239305877685547, "step": 99450 }, { "epoch": 38.323699421965316, "eval_loss": 0.3859465420246124, "eval_runtime": 12.2533, "eval_samples_per_second": 1550.931, "eval_steps_per_second": 32.318, "step": 99450 }, { "epoch": 38.34296724470135, "grad_norm": 0.32955631613731384, "learning_rate": 4.662967244701349e-05, "loss": 0.33016937255859374, "step": 99500 }, { "epoch": 38.34296724470135, "eval_loss": 0.38089486956596375, "eval_runtime": 12.2379, "eval_samples_per_second": 1552.886, "eval_steps_per_second": 32.359, "step": 99500 }, { "epoch": 38.36223506743738, "grad_norm": 0.3499875068664551, "learning_rate": 4.6552601156069366e-05, "loss": 0.3255010223388672, "step": 99550 }, { "epoch": 38.36223506743738, "eval_loss": 0.38351306319236755, "eval_runtime": 12.2208, "eval_samples_per_second": 1555.051, "eval_steps_per_second": 32.404, "step": 99550 }, { "epoch": 38.38150289017341, "grad_norm": 0.2984645366668701, "learning_rate": 4.6475529865125244e-05, "loss": 0.32215789794921873, "step": 99600 }, { "epoch": 38.38150289017341, "eval_loss": 0.3898315727710724, "eval_runtime": 12.2402, "eval_samples_per_second": 1552.586, "eval_steps_per_second": 32.352, "step": 99600 }, { "epoch": 38.40077071290944, "grad_norm": 0.34309080243110657, "learning_rate": 4.639845857418112e-05, "loss": 0.3249150085449219, "step": 99650 }, { "epoch": 38.40077071290944, "eval_loss": 0.38251522183418274, "eval_runtime": 12.2326, "eval_samples_per_second": 1553.547, "eval_steps_per_second": 32.372, "step": 99650 }, { "epoch": 38.420038535645475, "grad_norm": 0.31838759779930115, "learning_rate": 4.6321387283236994e-05, "loss": 0.32149204254150393, "step": 99700 }, { "epoch": 38.420038535645475, "eval_loss": 0.3859882950782776, "eval_runtime": 12.2037, "eval_samples_per_second": 1557.235, "eval_steps_per_second": 32.449, "step": 99700 }, { "epoch": 38.4393063583815, "grad_norm": 0.28872817754745483, "learning_rate": 4.624431599229287e-05, "loss": 0.32126537322998044, "step": 99750 }, { "epoch": 38.4393063583815, "eval_loss": 0.38163408637046814, "eval_runtime": 12.3147, "eval_samples_per_second": 1543.195, "eval_steps_per_second": 32.157, "step": 99750 }, { "epoch": 38.458574181117534, "grad_norm": 0.3210464417934418, "learning_rate": 4.616724470134875e-05, "loss": 0.3188796615600586, "step": 99800 }, { "epoch": 38.458574181117534, "eval_loss": 0.3784565031528473, "eval_runtime": 12.3394, "eval_samples_per_second": 1540.106, "eval_steps_per_second": 32.092, "step": 99800 }, { "epoch": 38.47784200385357, "grad_norm": 0.2837672233581543, "learning_rate": 4.609017341040462e-05, "loss": 0.32909549713134767, "step": 99850 }, { "epoch": 38.47784200385357, "eval_loss": 0.3820670247077942, "eval_runtime": 12.2419, "eval_samples_per_second": 1552.374, "eval_steps_per_second": 32.348, "step": 99850 }, { "epoch": 38.49710982658959, "grad_norm": 0.3098672926425934, "learning_rate": 4.6013102119460506e-05, "loss": 0.32447128295898436, "step": 99900 }, { "epoch": 38.49710982658959, "eval_loss": 0.3817133903503418, "eval_runtime": 12.25, "eval_samples_per_second": 1551.35, "eval_steps_per_second": 32.327, "step": 99900 }, { "epoch": 38.51637764932563, "grad_norm": 0.3364885151386261, "learning_rate": 4.593603082851638e-05, "loss": 0.3253604507446289, "step": 99950 }, { "epoch": 38.51637764932563, "eval_loss": 0.3768647313117981, "eval_runtime": 12.3787, "eval_samples_per_second": 1535.215, "eval_steps_per_second": 31.99, "step": 99950 }, { "epoch": 38.53564547206166, "grad_norm": 0.3608248233795166, "learning_rate": 4.5858959537572256e-05, "loss": 0.3229799652099609, "step": 100000 }, { "epoch": 38.53564547206166, "eval_loss": 0.3777264654636383, "eval_runtime": 12.1021, "eval_samples_per_second": 1570.3, "eval_steps_per_second": 32.721, "step": 100000 }, { "epoch": 38.554913294797686, "grad_norm": 0.35184767842292786, "learning_rate": 4.5781888246628134e-05, "loss": 0.3277977752685547, "step": 100050 }, { "epoch": 38.554913294797686, "eval_loss": 0.38544660806655884, "eval_runtime": 12.437, "eval_samples_per_second": 1528.027, "eval_steps_per_second": 31.841, "step": 100050 }, { "epoch": 38.57418111753372, "grad_norm": 0.35992667078971863, "learning_rate": 4.570481695568401e-05, "loss": 0.32283931732177734, "step": 100100 }, { "epoch": 38.57418111753372, "eval_loss": 0.3858245611190796, "eval_runtime": 12.2353, "eval_samples_per_second": 1553.216, "eval_steps_per_second": 32.365, "step": 100100 }, { "epoch": 38.59344894026975, "grad_norm": 0.3286411762237549, "learning_rate": 4.562774566473988e-05, "loss": 0.3240751647949219, "step": 100150 }, { "epoch": 38.59344894026975, "eval_loss": 0.3803292214870453, "eval_runtime": 12.2497, "eval_samples_per_second": 1551.379, "eval_steps_per_second": 32.327, "step": 100150 }, { "epoch": 38.61271676300578, "grad_norm": 0.31575286388397217, "learning_rate": 4.555067437379577e-05, "loss": 0.32170925140380857, "step": 100200 }, { "epoch": 38.61271676300578, "eval_loss": 0.37952104210853577, "eval_runtime": 12.332, "eval_samples_per_second": 1541.03, "eval_steps_per_second": 32.112, "step": 100200 }, { "epoch": 38.63198458574181, "grad_norm": 0.3284974694252014, "learning_rate": 4.547360308285164e-05, "loss": 0.324201774597168, "step": 100250 }, { "epoch": 38.63198458574181, "eval_loss": 0.3713763356208801, "eval_runtime": 12.2425, "eval_samples_per_second": 1552.3, "eval_steps_per_second": 32.346, "step": 100250 }, { "epoch": 38.651252408477845, "grad_norm": 0.38102230429649353, "learning_rate": 4.539653179190752e-05, "loss": 0.3246105194091797, "step": 100300 }, { "epoch": 38.651252408477845, "eval_loss": 0.37803077697753906, "eval_runtime": 12.358, "eval_samples_per_second": 1537.795, "eval_steps_per_second": 32.044, "step": 100300 }, { "epoch": 38.67052023121387, "grad_norm": 0.3489737808704376, "learning_rate": 4.5319460500963396e-05, "loss": 0.3241348648071289, "step": 100350 }, { "epoch": 38.67052023121387, "eval_loss": 0.38748905062675476, "eval_runtime": 12.2904, "eval_samples_per_second": 1546.245, "eval_steps_per_second": 32.22, "step": 100350 }, { "epoch": 38.689788053949904, "grad_norm": 0.4744672477245331, "learning_rate": 4.5242389210019274e-05, "loss": 0.3209294891357422, "step": 100400 }, { "epoch": 38.689788053949904, "eval_loss": 0.3839319348335266, "eval_runtime": 12.2279, "eval_samples_per_second": 1554.156, "eval_steps_per_second": 32.385, "step": 100400 }, { "epoch": 38.70905587668594, "grad_norm": 0.29229164123535156, "learning_rate": 4.5165317919075145e-05, "loss": 0.32538063049316407, "step": 100450 }, { "epoch": 38.70905587668594, "eval_loss": 0.3908150792121887, "eval_runtime": 12.3165, "eval_samples_per_second": 1542.975, "eval_steps_per_second": 32.152, "step": 100450 }, { "epoch": 38.72832369942196, "grad_norm": 0.3242015242576599, "learning_rate": 4.508824662813102e-05, "loss": 0.31694921493530276, "step": 100500 }, { "epoch": 38.72832369942196, "eval_loss": 0.3879190981388092, "eval_runtime": 12.2625, "eval_samples_per_second": 1549.767, "eval_steps_per_second": 32.294, "step": 100500 }, { "epoch": 38.747591522158, "grad_norm": 0.31549715995788574, "learning_rate": 4.50111753371869e-05, "loss": 0.3191660690307617, "step": 100550 }, { "epoch": 38.747591522158, "eval_loss": 0.3904980719089508, "eval_runtime": 12.0816, "eval_samples_per_second": 1572.965, "eval_steps_per_second": 32.777, "step": 100550 }, { "epoch": 38.76685934489403, "grad_norm": 0.32977861166000366, "learning_rate": 4.493410404624277e-05, "loss": 0.31889110565185547, "step": 100600 }, { "epoch": 38.76685934489403, "eval_loss": 0.390365868806839, "eval_runtime": 12.2575, "eval_samples_per_second": 1550.397, "eval_steps_per_second": 32.307, "step": 100600 }, { "epoch": 38.786127167630056, "grad_norm": 0.3492985963821411, "learning_rate": 4.485703275529866e-05, "loss": 0.32334793090820313, "step": 100650 }, { "epoch": 38.786127167630056, "eval_loss": 0.38496458530426025, "eval_runtime": 12.2318, "eval_samples_per_second": 1553.661, "eval_steps_per_second": 32.375, "step": 100650 }, { "epoch": 38.80539499036609, "grad_norm": 0.298520565032959, "learning_rate": 4.477996146435453e-05, "loss": 0.32259029388427735, "step": 100700 }, { "epoch": 38.80539499036609, "eval_loss": 0.38052940368652344, "eval_runtime": 12.4346, "eval_samples_per_second": 1528.316, "eval_steps_per_second": 31.847, "step": 100700 }, { "epoch": 38.82466281310212, "grad_norm": 0.30582112073898315, "learning_rate": 4.470289017341041e-05, "loss": 0.32743614196777343, "step": 100750 }, { "epoch": 38.82466281310212, "eval_loss": 0.38284391164779663, "eval_runtime": 12.3842, "eval_samples_per_second": 1534.537, "eval_steps_per_second": 31.976, "step": 100750 }, { "epoch": 38.84393063583815, "grad_norm": 0.326119065284729, "learning_rate": 4.4625818882466285e-05, "loss": 0.3261922836303711, "step": 100800 }, { "epoch": 38.84393063583815, "eval_loss": 0.37484055757522583, "eval_runtime": 12.0622, "eval_samples_per_second": 1575.506, "eval_steps_per_second": 32.83, "step": 100800 }, { "epoch": 38.86319845857418, "grad_norm": 0.3174157440662384, "learning_rate": 4.454874759152216e-05, "loss": 0.322800407409668, "step": 100850 }, { "epoch": 38.86319845857418, "eval_loss": 0.3886362612247467, "eval_runtime": 12.3963, "eval_samples_per_second": 1533.034, "eval_steps_per_second": 31.945, "step": 100850 }, { "epoch": 38.882466281310215, "grad_norm": 0.2919534146785736, "learning_rate": 4.4471676300578035e-05, "loss": 0.33000770568847654, "step": 100900 }, { "epoch": 38.882466281310215, "eval_loss": 0.3803521692752838, "eval_runtime": 12.4534, "eval_samples_per_second": 1526.01, "eval_steps_per_second": 31.799, "step": 100900 }, { "epoch": 38.90173410404624, "grad_norm": 0.3095511794090271, "learning_rate": 4.439460500963392e-05, "loss": 0.32281085968017575, "step": 100950 }, { "epoch": 38.90173410404624, "eval_loss": 0.3885936141014099, "eval_runtime": 12.7089, "eval_samples_per_second": 1495.328, "eval_steps_per_second": 31.159, "step": 100950 }, { "epoch": 38.921001926782274, "grad_norm": 0.29343506693840027, "learning_rate": 4.431753371868979e-05, "loss": 0.3264060592651367, "step": 101000 }, { "epoch": 38.921001926782274, "eval_loss": 0.3808668255805969, "eval_runtime": 12.4068, "eval_samples_per_second": 1531.736, "eval_steps_per_second": 31.918, "step": 101000 }, { "epoch": 38.94026974951831, "grad_norm": 0.34628400206565857, "learning_rate": 4.424046242774567e-05, "loss": 0.317656364440918, "step": 101050 }, { "epoch": 38.94026974951831, "eval_loss": 0.3854350447654724, "eval_runtime": 12.229, "eval_samples_per_second": 1554.016, "eval_steps_per_second": 32.382, "step": 101050 }, { "epoch": 38.959537572254334, "grad_norm": 0.3029169738292694, "learning_rate": 4.416339113680155e-05, "loss": 0.3204023361206055, "step": 101100 }, { "epoch": 38.959537572254334, "eval_loss": 0.38336795568466187, "eval_runtime": 12.3216, "eval_samples_per_second": 1542.328, "eval_steps_per_second": 32.139, "step": 101100 }, { "epoch": 38.97880539499037, "grad_norm": 0.3315020203590393, "learning_rate": 4.408631984585742e-05, "loss": 0.3264346694946289, "step": 101150 }, { "epoch": 38.97880539499037, "eval_loss": 0.38037583231925964, "eval_runtime": 12.3759, "eval_samples_per_second": 1535.566, "eval_steps_per_second": 31.998, "step": 101150 }, { "epoch": 38.9980732177264, "grad_norm": 0.3432365357875824, "learning_rate": 4.4009248554913296e-05, "loss": 0.3216224670410156, "step": 101200 }, { "epoch": 38.9980732177264, "eval_loss": 0.37773019075393677, "eval_runtime": 12.4205, "eval_samples_per_second": 1530.056, "eval_steps_per_second": 31.883, "step": 101200 }, { "epoch": 39.017341040462426, "grad_norm": 0.2993318438529968, "learning_rate": 4.393217726396917e-05, "loss": 0.324326286315918, "step": 101250 }, { "epoch": 39.017341040462426, "eval_loss": 0.38116079568862915, "eval_runtime": 12.4447, "eval_samples_per_second": 1527.074, "eval_steps_per_second": 31.821, "step": 101250 }, { "epoch": 39.03660886319846, "grad_norm": 0.3017495274543762, "learning_rate": 4.385510597302505e-05, "loss": 0.3196546936035156, "step": 101300 }, { "epoch": 39.03660886319846, "eval_loss": 0.3807886242866516, "eval_runtime": 12.2813, "eval_samples_per_second": 1547.39, "eval_steps_per_second": 32.244, "step": 101300 }, { "epoch": 39.05587668593449, "grad_norm": 0.3274686336517334, "learning_rate": 4.3778034682080924e-05, "loss": 0.32668361663818357, "step": 101350 }, { "epoch": 39.05587668593449, "eval_loss": 0.3764110803604126, "eval_runtime": 13.68, "eval_samples_per_second": 1389.177, "eval_steps_per_second": 28.947, "step": 101350 }, { "epoch": 39.07514450867052, "grad_norm": 0.31717967987060547, "learning_rate": 4.37009633911368e-05, "loss": 0.32222816467285154, "step": 101400 }, { "epoch": 39.07514450867052, "eval_loss": 0.3873702883720398, "eval_runtime": 12.253, "eval_samples_per_second": 1550.963, "eval_steps_per_second": 32.319, "step": 101400 }, { "epoch": 39.09441233140655, "grad_norm": 0.3464842736721039, "learning_rate": 4.362389210019268e-05, "loss": 0.3179457092285156, "step": 101450 }, { "epoch": 39.09441233140655, "eval_loss": 0.37935277819633484, "eval_runtime": 12.5186, "eval_samples_per_second": 1518.064, "eval_steps_per_second": 31.633, "step": 101450 }, { "epoch": 39.113680154142585, "grad_norm": 0.3288393020629883, "learning_rate": 4.354682080924856e-05, "loss": 0.32986297607421877, "step": 101500 }, { "epoch": 39.113680154142585, "eval_loss": 0.3759736120700836, "eval_runtime": 12.4535, "eval_samples_per_second": 1525.993, "eval_steps_per_second": 31.798, "step": 101500 }, { "epoch": 39.13294797687861, "grad_norm": 0.34074074029922485, "learning_rate": 4.346974951830443e-05, "loss": 0.3266056823730469, "step": 101550 }, { "epoch": 39.13294797687861, "eval_loss": 0.3790600895881653, "eval_runtime": 12.4028, "eval_samples_per_second": 1532.233, "eval_steps_per_second": 31.928, "step": 101550 }, { "epoch": 39.152215799614645, "grad_norm": 0.30920347571372986, "learning_rate": 4.3392678227360315e-05, "loss": 0.32146984100341797, "step": 101600 }, { "epoch": 39.152215799614645, "eval_loss": 0.3713854253292084, "eval_runtime": 12.0632, "eval_samples_per_second": 1575.365, "eval_steps_per_second": 32.827, "step": 101600 }, { "epoch": 39.17148362235068, "grad_norm": 0.32620102167129517, "learning_rate": 4.3315606936416186e-05, "loss": 0.3185664176940918, "step": 101650 }, { "epoch": 39.17148362235068, "eval_loss": 0.38865527510643005, "eval_runtime": 12.2322, "eval_samples_per_second": 1553.606, "eval_steps_per_second": 32.374, "step": 101650 }, { "epoch": 39.190751445086704, "grad_norm": 0.3566093444824219, "learning_rate": 4.3238535645472064e-05, "loss": 0.3187283134460449, "step": 101700 }, { "epoch": 39.190751445086704, "eval_loss": 0.38413652777671814, "eval_runtime": 14.3278, "eval_samples_per_second": 1326.369, "eval_steps_per_second": 27.638, "step": 101700 }, { "epoch": 39.21001926782274, "grad_norm": 0.32487812638282776, "learning_rate": 4.316146435452794e-05, "loss": 0.3247224426269531, "step": 101750 }, { "epoch": 39.21001926782274, "eval_loss": 0.3799559772014618, "eval_runtime": 12.3067, "eval_samples_per_second": 1544.201, "eval_steps_per_second": 32.178, "step": 101750 }, { "epoch": 39.22928709055876, "grad_norm": 0.3344571590423584, "learning_rate": 4.3084393063583814e-05, "loss": 0.3228867340087891, "step": 101800 }, { "epoch": 39.22928709055876, "eval_loss": 0.3833319842815399, "eval_runtime": 12.206, "eval_samples_per_second": 1556.945, "eval_steps_per_second": 32.443, "step": 101800 }, { "epoch": 39.2485549132948, "grad_norm": 0.3208291828632355, "learning_rate": 4.300732177263969e-05, "loss": 0.32248275756835937, "step": 101850 }, { "epoch": 39.2485549132948, "eval_loss": 0.38649553060531616, "eval_runtime": 12.0942, "eval_samples_per_second": 1571.333, "eval_steps_per_second": 32.743, "step": 101850 }, { "epoch": 39.26782273603083, "grad_norm": 0.3009142279624939, "learning_rate": 4.293025048169557e-05, "loss": 0.32097896575927737, "step": 101900 }, { "epoch": 39.26782273603083, "eval_loss": 0.3844289481639862, "eval_runtime": 12.4907, "eval_samples_per_second": 1521.452, "eval_steps_per_second": 31.704, "step": 101900 }, { "epoch": 39.287090558766856, "grad_norm": 0.3313482999801636, "learning_rate": 4.285317919075145e-05, "loss": 0.32517166137695314, "step": 101950 }, { "epoch": 39.287090558766856, "eval_loss": 0.37837716937065125, "eval_runtime": 12.4086, "eval_samples_per_second": 1531.513, "eval_steps_per_second": 31.913, "step": 101950 }, { "epoch": 39.30635838150289, "grad_norm": 0.31990954279899597, "learning_rate": 4.277610789980732e-05, "loss": 0.32112472534179687, "step": 102000 }, { "epoch": 39.30635838150289, "eval_loss": 0.38244470953941345, "eval_runtime": 12.4621, "eval_samples_per_second": 1524.94, "eval_steps_per_second": 31.776, "step": 102000 }, { "epoch": 39.32562620423892, "grad_norm": 0.33063751459121704, "learning_rate": 4.2699036608863204e-05, "loss": 0.31995540618896484, "step": 102050 }, { "epoch": 39.32562620423892, "eval_loss": 0.38090038299560547, "eval_runtime": 12.4936, "eval_samples_per_second": 1521.103, "eval_steps_per_second": 31.696, "step": 102050 }, { "epoch": 39.34489402697495, "grad_norm": 0.38976553082466125, "learning_rate": 4.2621965317919075e-05, "loss": 0.32534934997558596, "step": 102100 }, { "epoch": 39.34489402697495, "eval_loss": 0.38015878200531006, "eval_runtime": 12.4662, "eval_samples_per_second": 1524.44, "eval_steps_per_second": 31.766, "step": 102100 }, { "epoch": 39.36416184971098, "grad_norm": 0.3221515119075775, "learning_rate": 4.2544894026974954e-05, "loss": 0.3267615509033203, "step": 102150 }, { "epoch": 39.36416184971098, "eval_loss": 0.3888033926486969, "eval_runtime": 12.3931, "eval_samples_per_second": 1533.432, "eval_steps_per_second": 31.953, "step": 102150 }, { "epoch": 39.383429672447015, "grad_norm": 0.3183836340904236, "learning_rate": 4.246782273603083e-05, "loss": 0.31861629486083987, "step": 102200 }, { "epoch": 39.383429672447015, "eval_loss": 0.38145601749420166, "eval_runtime": 12.4429, "eval_samples_per_second": 1527.298, "eval_steps_per_second": 31.825, "step": 102200 }, { "epoch": 39.40269749518304, "grad_norm": 0.33277371525764465, "learning_rate": 4.239075144508671e-05, "loss": 0.32680885314941405, "step": 102250 }, { "epoch": 39.40269749518304, "eval_loss": 0.3765226900577545, "eval_runtime": 12.7342, "eval_samples_per_second": 1492.357, "eval_steps_per_second": 31.097, "step": 102250 }, { "epoch": 39.421965317919074, "grad_norm": 0.32571205496788025, "learning_rate": 4.231368015414258e-05, "loss": 0.3258203125, "step": 102300 }, { "epoch": 39.421965317919074, "eval_loss": 0.3879793882369995, "eval_runtime": 12.4078, "eval_samples_per_second": 1531.613, "eval_steps_per_second": 31.915, "step": 102300 }, { "epoch": 39.44123314065511, "grad_norm": 0.3362293541431427, "learning_rate": 4.223660886319846e-05, "loss": 0.3233353042602539, "step": 102350 }, { "epoch": 39.44123314065511, "eval_loss": 0.38480839133262634, "eval_runtime": 12.4035, "eval_samples_per_second": 1532.144, "eval_steps_per_second": 31.926, "step": 102350 }, { "epoch": 39.460500963391134, "grad_norm": 0.3201095461845398, "learning_rate": 4.215953757225434e-05, "loss": 0.3215863037109375, "step": 102400 }, { "epoch": 39.460500963391134, "eval_loss": 0.3826410472393036, "eval_runtime": 12.4609, "eval_samples_per_second": 1525.094, "eval_steps_per_second": 31.779, "step": 102400 }, { "epoch": 39.47976878612717, "grad_norm": 0.2961621880531311, "learning_rate": 4.2082466281310215e-05, "loss": 0.3257990264892578, "step": 102450 }, { "epoch": 39.47976878612717, "eval_loss": 0.39047694206237793, "eval_runtime": 12.3301, "eval_samples_per_second": 1541.263, "eval_steps_per_second": 32.116, "step": 102450 }, { "epoch": 39.4990366088632, "grad_norm": 0.3392292857170105, "learning_rate": 4.2005394990366094e-05, "loss": 0.31948759078979494, "step": 102500 }, { "epoch": 39.4990366088632, "eval_loss": 0.38254043459892273, "eval_runtime": 12.4191, "eval_samples_per_second": 1530.222, "eval_steps_per_second": 31.886, "step": 102500 }, { "epoch": 39.518304431599226, "grad_norm": 0.33778369426727295, "learning_rate": 4.1928323699421965e-05, "loss": 0.3202978515625, "step": 102550 }, { "epoch": 39.518304431599226, "eval_loss": 0.3860911726951599, "eval_runtime": 12.2561, "eval_samples_per_second": 1550.569, "eval_steps_per_second": 32.31, "step": 102550 }, { "epoch": 39.53757225433526, "grad_norm": 0.2921721041202545, "learning_rate": 4.185125240847784e-05, "loss": 0.3239518356323242, "step": 102600 }, { "epoch": 39.53757225433526, "eval_loss": 0.37777528166770935, "eval_runtime": 12.2549, "eval_samples_per_second": 1550.726, "eval_steps_per_second": 32.314, "step": 102600 }, { "epoch": 39.55684007707129, "grad_norm": 0.34872519969940186, "learning_rate": 4.177418111753372e-05, "loss": 0.3196516418457031, "step": 102650 }, { "epoch": 39.55684007707129, "eval_loss": 0.3818143606185913, "eval_runtime": 12.0959, "eval_samples_per_second": 1571.106, "eval_steps_per_second": 32.738, "step": 102650 }, { "epoch": 39.57610789980732, "grad_norm": 0.3192081153392792, "learning_rate": 4.16971098265896e-05, "loss": 0.31996978759765626, "step": 102700 }, { "epoch": 39.57610789980732, "eval_loss": 0.3830792009830475, "eval_runtime": 12.3759, "eval_samples_per_second": 1535.567, "eval_steps_per_second": 31.998, "step": 102700 }, { "epoch": 39.59537572254335, "grad_norm": 0.41257357597351074, "learning_rate": 4.162003853564547e-05, "loss": 0.3226544952392578, "step": 102750 }, { "epoch": 39.59537572254335, "eval_loss": 0.38603001832962036, "eval_runtime": 12.3906, "eval_samples_per_second": 1533.741, "eval_steps_per_second": 31.96, "step": 102750 }, { "epoch": 39.614643545279385, "grad_norm": 0.3230137526988983, "learning_rate": 4.1542967244701356e-05, "loss": 0.3226910018920898, "step": 102800 }, { "epoch": 39.614643545279385, "eval_loss": 0.37526819109916687, "eval_runtime": 12.3787, "eval_samples_per_second": 1535.212, "eval_steps_per_second": 31.99, "step": 102800 }, { "epoch": 39.63391136801541, "grad_norm": 0.3533986210823059, "learning_rate": 4.146589595375723e-05, "loss": 0.3191491508483887, "step": 102850 }, { "epoch": 39.63391136801541, "eval_loss": 0.3799397051334381, "eval_runtime": 12.3548, "eval_samples_per_second": 1538.194, "eval_steps_per_second": 32.052, "step": 102850 }, { "epoch": 39.653179190751445, "grad_norm": 0.311780720949173, "learning_rate": 4.1388824662813105e-05, "loss": 0.3202433776855469, "step": 102900 }, { "epoch": 39.653179190751445, "eval_loss": 0.37823665142059326, "eval_runtime": 12.0869, "eval_samples_per_second": 1572.282, "eval_steps_per_second": 32.763, "step": 102900 }, { "epoch": 39.67244701348748, "grad_norm": 0.313098281621933, "learning_rate": 4.131175337186898e-05, "loss": 0.32604461669921875, "step": 102950 }, { "epoch": 39.67244701348748, "eval_loss": 0.37764298915863037, "eval_runtime": 12.3345, "eval_samples_per_second": 1540.723, "eval_steps_per_second": 32.105, "step": 102950 }, { "epoch": 39.691714836223504, "grad_norm": 0.3388941287994385, "learning_rate": 4.123468208092486e-05, "loss": 0.3255709457397461, "step": 103000 }, { "epoch": 39.691714836223504, "eval_loss": 0.3859165608882904, "eval_runtime": 12.3235, "eval_samples_per_second": 1542.099, "eval_steps_per_second": 32.134, "step": 103000 }, { "epoch": 39.71098265895954, "grad_norm": 0.344268798828125, "learning_rate": 4.115761078998073e-05, "loss": 0.32182464599609373, "step": 103050 }, { "epoch": 39.71098265895954, "eval_loss": 0.38200700283050537, "eval_runtime": 12.3103, "eval_samples_per_second": 1543.745, "eval_steps_per_second": 32.168, "step": 103050 }, { "epoch": 39.73025048169557, "grad_norm": 0.2921173572540283, "learning_rate": 4.108053949903661e-05, "loss": 0.32000213623046875, "step": 103100 }, { "epoch": 39.73025048169557, "eval_loss": 0.385852575302124, "eval_runtime": 12.2532, "eval_samples_per_second": 1550.939, "eval_steps_per_second": 32.318, "step": 103100 }, { "epoch": 39.7495183044316, "grad_norm": 0.32257768511772156, "learning_rate": 4.100346820809249e-05, "loss": 0.32087154388427735, "step": 103150 }, { "epoch": 39.7495183044316, "eval_loss": 0.3894372284412384, "eval_runtime": 12.2338, "eval_samples_per_second": 1553.401, "eval_steps_per_second": 32.369, "step": 103150 }, { "epoch": 39.76878612716763, "grad_norm": 0.36981362104415894, "learning_rate": 4.092639691714836e-05, "loss": 0.32586715698242186, "step": 103200 }, { "epoch": 39.76878612716763, "eval_loss": 0.3797774612903595, "eval_runtime": 12.3102, "eval_samples_per_second": 1543.757, "eval_steps_per_second": 32.168, "step": 103200 }, { "epoch": 39.78805394990366, "grad_norm": 0.346647709608078, "learning_rate": 4.0849325626204245e-05, "loss": 0.31642507553100585, "step": 103250 }, { "epoch": 39.78805394990366, "eval_loss": 0.386206716299057, "eval_runtime": 12.2366, "eval_samples_per_second": 1553.049, "eval_steps_per_second": 32.362, "step": 103250 }, { "epoch": 39.80732177263969, "grad_norm": 0.3334583640098572, "learning_rate": 4.0772254335260116e-05, "loss": 0.3208634948730469, "step": 103300 }, { "epoch": 39.80732177263969, "eval_loss": 0.37603944540023804, "eval_runtime": 12.2369, "eval_samples_per_second": 1553.007, "eval_steps_per_second": 32.361, "step": 103300 }, { "epoch": 39.82658959537572, "grad_norm": 0.3209018409252167, "learning_rate": 4.0695183044315994e-05, "loss": 0.32413253784179685, "step": 103350 }, { "epoch": 39.82658959537572, "eval_loss": 0.38689833879470825, "eval_runtime": 12.3308, "eval_samples_per_second": 1541.179, "eval_steps_per_second": 32.115, "step": 103350 }, { "epoch": 39.845857418111756, "grad_norm": 0.31634676456451416, "learning_rate": 4.061811175337187e-05, "loss": 0.32875640869140627, "step": 103400 }, { "epoch": 39.845857418111756, "eval_loss": 0.38334301114082336, "eval_runtime": 12.2351, "eval_samples_per_second": 1553.232, "eval_steps_per_second": 32.366, "step": 103400 }, { "epoch": 39.86512524084778, "grad_norm": 0.30478212237358093, "learning_rate": 4.054104046242775e-05, "loss": 0.3210894012451172, "step": 103450 }, { "epoch": 39.86512524084778, "eval_loss": 0.38237878680229187, "eval_runtime": 12.3671, "eval_samples_per_second": 1536.657, "eval_steps_per_second": 32.02, "step": 103450 }, { "epoch": 39.884393063583815, "grad_norm": 0.37053579092025757, "learning_rate": 4.046396917148362e-05, "loss": 0.3224208450317383, "step": 103500 }, { "epoch": 39.884393063583815, "eval_loss": 0.37822768092155457, "eval_runtime": 12.2839, "eval_samples_per_second": 1547.061, "eval_steps_per_second": 32.237, "step": 103500 }, { "epoch": 39.90366088631985, "grad_norm": 0.3216537535190582, "learning_rate": 4.038689788053951e-05, "loss": 0.3242170333862305, "step": 103550 }, { "epoch": 39.90366088631985, "eval_loss": 0.39103108644485474, "eval_runtime": 12.2311, "eval_samples_per_second": 1553.739, "eval_steps_per_second": 32.376, "step": 103550 }, { "epoch": 39.922928709055874, "grad_norm": 0.3812324106693268, "learning_rate": 4.030982658959538e-05, "loss": 0.3228831100463867, "step": 103600 }, { "epoch": 39.922928709055874, "eval_loss": 0.3799179494380951, "eval_runtime": 12.3454, "eval_samples_per_second": 1539.359, "eval_steps_per_second": 32.077, "step": 103600 }, { "epoch": 39.94219653179191, "grad_norm": 0.30404233932495117, "learning_rate": 4.0232755298651256e-05, "loss": 0.32548812866210936, "step": 103650 }, { "epoch": 39.94219653179191, "eval_loss": 0.3878864347934723, "eval_runtime": 12.2278, "eval_samples_per_second": 1554.168, "eval_steps_per_second": 32.385, "step": 103650 }, { "epoch": 39.96146435452794, "grad_norm": 0.39582502841949463, "learning_rate": 4.015568400770713e-05, "loss": 0.3232265472412109, "step": 103700 }, { "epoch": 39.96146435452794, "eval_loss": 0.3878537714481354, "eval_runtime": 12.1438, "eval_samples_per_second": 1564.914, "eval_steps_per_second": 32.609, "step": 103700 }, { "epoch": 39.98073217726397, "grad_norm": 0.31447532773017883, "learning_rate": 4.0078612716763006e-05, "loss": 0.31816413879394534, "step": 103750 }, { "epoch": 39.98073217726397, "eval_loss": 0.3744716942310333, "eval_runtime": 12.4156, "eval_samples_per_second": 1530.651, "eval_steps_per_second": 31.895, "step": 103750 }, { "epoch": 40.0, "grad_norm": 0.36110809445381165, "learning_rate": 4.0001541425818884e-05, "loss": 0.3274527359008789, "step": 103800 }, { "epoch": 40.0, "eval_loss": 0.38628730177879333, "eval_runtime": 12.2348, "eval_samples_per_second": 1553.275, "eval_steps_per_second": 32.367, "step": 103800 }, { "epoch": 40.01926782273603, "grad_norm": 0.3266274631023407, "learning_rate": 3.9924470134874755e-05, "loss": 0.31366506576538083, "step": 103850 }, { "epoch": 40.01926782273603, "eval_loss": 0.3741891086101532, "eval_runtime": 12.2494, "eval_samples_per_second": 1551.42, "eval_steps_per_second": 32.328, "step": 103850 }, { "epoch": 40.03853564547206, "grad_norm": 0.349526971578598, "learning_rate": 3.984739884393064e-05, "loss": 0.32355499267578125, "step": 103900 }, { "epoch": 40.03853564547206, "eval_loss": 0.3799409866333008, "eval_runtime": 12.2645, "eval_samples_per_second": 1549.511, "eval_steps_per_second": 32.288, "step": 103900 }, { "epoch": 40.05780346820809, "grad_norm": 0.38123929500579834, "learning_rate": 3.977032755298651e-05, "loss": 0.32005302429199217, "step": 103950 }, { "epoch": 40.05780346820809, "eval_loss": 0.3771795332431793, "eval_runtime": 12.0827, "eval_samples_per_second": 1572.821, "eval_steps_per_second": 32.774, "step": 103950 }, { "epoch": 40.077071290944126, "grad_norm": 0.3160003423690796, "learning_rate": 3.969325626204239e-05, "loss": 0.3180100631713867, "step": 104000 }, { "epoch": 40.077071290944126, "eval_loss": 0.37859228253364563, "eval_runtime": 12.4168, "eval_samples_per_second": 1530.511, "eval_steps_per_second": 31.892, "step": 104000 }, { "epoch": 40.09633911368015, "grad_norm": 0.3367283046245575, "learning_rate": 3.961618497109827e-05, "loss": 0.32244167327880857, "step": 104050 }, { "epoch": 40.09633911368015, "eval_loss": 0.3821028470993042, "eval_runtime": 12.4341, "eval_samples_per_second": 1528.374, "eval_steps_per_second": 31.848, "step": 104050 }, { "epoch": 40.115606936416185, "grad_norm": 0.35102951526641846, "learning_rate": 3.9539113680154146e-05, "loss": 0.3227753448486328, "step": 104100 }, { "epoch": 40.115606936416185, "eval_loss": 0.38219839334487915, "eval_runtime": 12.4014, "eval_samples_per_second": 1532.406, "eval_steps_per_second": 31.932, "step": 104100 }, { "epoch": 40.13487475915222, "grad_norm": 0.3473288416862488, "learning_rate": 3.946204238921002e-05, "loss": 0.318532772064209, "step": 104150 }, { "epoch": 40.13487475915222, "eval_loss": 0.38346588611602783, "eval_runtime": 12.4026, "eval_samples_per_second": 1532.263, "eval_steps_per_second": 31.929, "step": 104150 }, { "epoch": 40.154142581888244, "grad_norm": 0.34819597005844116, "learning_rate": 3.93849710982659e-05, "loss": 0.31824262619018556, "step": 104200 }, { "epoch": 40.154142581888244, "eval_loss": 0.3847159743309021, "eval_runtime": 12.0897, "eval_samples_per_second": 1571.913, "eval_steps_per_second": 32.755, "step": 104200 }, { "epoch": 40.17341040462428, "grad_norm": 0.31111589074134827, "learning_rate": 3.9307899807321773e-05, "loss": 0.32443962097167967, "step": 104250 }, { "epoch": 40.17341040462428, "eval_loss": 0.3816429376602173, "eval_runtime": 12.4342, "eval_samples_per_second": 1528.363, "eval_steps_per_second": 31.848, "step": 104250 }, { "epoch": 40.19267822736031, "grad_norm": 0.35325363278388977, "learning_rate": 3.923082851637765e-05, "loss": 0.32055519104003904, "step": 104300 }, { "epoch": 40.19267822736031, "eval_loss": 0.38938599824905396, "eval_runtime": 12.4583, "eval_samples_per_second": 1525.412, "eval_steps_per_second": 31.786, "step": 104300 }, { "epoch": 40.21194605009634, "grad_norm": 0.3098491430282593, "learning_rate": 3.915375722543353e-05, "loss": 0.322132568359375, "step": 104350 }, { "epoch": 40.21194605009634, "eval_loss": 0.3840576708316803, "eval_runtime": 12.4806, "eval_samples_per_second": 1522.679, "eval_steps_per_second": 31.729, "step": 104350 }, { "epoch": 40.23121387283237, "grad_norm": 0.3278464674949646, "learning_rate": 3.90766859344894e-05, "loss": 0.32437145233154296, "step": 104400 }, { "epoch": 40.23121387283237, "eval_loss": 0.38318943977355957, "eval_runtime": 12.4724, "eval_samples_per_second": 1523.682, "eval_steps_per_second": 31.75, "step": 104400 }, { "epoch": 40.2504816955684, "grad_norm": 0.2977653443813324, "learning_rate": 3.899961464354528e-05, "loss": 0.3202424621582031, "step": 104450 }, { "epoch": 40.2504816955684, "eval_loss": 0.3807383179664612, "eval_runtime": 12.4437, "eval_samples_per_second": 1527.194, "eval_steps_per_second": 31.823, "step": 104450 }, { "epoch": 40.26974951830443, "grad_norm": 0.29310402274131775, "learning_rate": 3.892254335260116e-05, "loss": 0.31799121856689455, "step": 104500 }, { "epoch": 40.26974951830443, "eval_loss": 0.38190072774887085, "eval_runtime": 12.3418, "eval_samples_per_second": 1539.806, "eval_steps_per_second": 32.086, "step": 104500 }, { "epoch": 40.28901734104046, "grad_norm": 0.3153882622718811, "learning_rate": 3.8845472061657035e-05, "loss": 0.32017051696777343, "step": 104550 }, { "epoch": 40.28901734104046, "eval_loss": 0.37600207328796387, "eval_runtime": 12.2532, "eval_samples_per_second": 1550.944, "eval_steps_per_second": 32.318, "step": 104550 }, { "epoch": 40.308285163776496, "grad_norm": 0.3544333875179291, "learning_rate": 3.876840077071291e-05, "loss": 0.31956991195678713, "step": 104600 }, { "epoch": 40.308285163776496, "eval_loss": 0.38600602746009827, "eval_runtime": 12.2377, "eval_samples_per_second": 1552.911, "eval_steps_per_second": 32.359, "step": 104600 }, { "epoch": 40.32755298651252, "grad_norm": 0.32531896233558655, "learning_rate": 3.869132947976879e-05, "loss": 0.32178291320800784, "step": 104650 }, { "epoch": 40.32755298651252, "eval_loss": 0.3814699649810791, "eval_runtime": 12.4215, "eval_samples_per_second": 1529.928, "eval_steps_per_second": 31.88, "step": 104650 }, { "epoch": 40.346820809248555, "grad_norm": 0.33429577946662903, "learning_rate": 3.861425818882466e-05, "loss": 0.3248287582397461, "step": 104700 }, { "epoch": 40.346820809248555, "eval_loss": 0.3858553469181061, "eval_runtime": 12.4469, "eval_samples_per_second": 1526.81, "eval_steps_per_second": 31.815, "step": 104700 }, { "epoch": 40.36608863198459, "grad_norm": 0.32811492681503296, "learning_rate": 3.853718689788054e-05, "loss": 0.32355613708496095, "step": 104750 }, { "epoch": 40.36608863198459, "eval_loss": 0.38816913962364197, "eval_runtime": 12.0892, "eval_samples_per_second": 1571.987, "eval_steps_per_second": 32.757, "step": 104750 }, { "epoch": 40.385356454720615, "grad_norm": 0.4136947691440582, "learning_rate": 3.846011560693642e-05, "loss": 0.3278625869750977, "step": 104800 }, { "epoch": 40.385356454720615, "eval_loss": 0.3837444484233856, "eval_runtime": 12.2478, "eval_samples_per_second": 1551.624, "eval_steps_per_second": 32.332, "step": 104800 }, { "epoch": 40.40462427745665, "grad_norm": 0.36452749371528625, "learning_rate": 3.83830443159923e-05, "loss": 0.32567424774169923, "step": 104850 }, { "epoch": 40.40462427745665, "eval_loss": 0.3863914906978607, "eval_runtime": 12.2472, "eval_samples_per_second": 1551.708, "eval_steps_per_second": 32.334, "step": 104850 }, { "epoch": 40.42389210019268, "grad_norm": 0.3123543858528137, "learning_rate": 3.830597302504817e-05, "loss": 0.3202303695678711, "step": 104900 }, { "epoch": 40.42389210019268, "eval_loss": 0.38506171107292175, "eval_runtime": 12.2421, "eval_samples_per_second": 1552.347, "eval_steps_per_second": 32.347, "step": 104900 }, { "epoch": 40.44315992292871, "grad_norm": 0.3214589059352875, "learning_rate": 3.822890173410405e-05, "loss": 0.3236609649658203, "step": 104950 }, { "epoch": 40.44315992292871, "eval_loss": 0.38689562678337097, "eval_runtime": 12.2469, "eval_samples_per_second": 1551.745, "eval_steps_per_second": 32.335, "step": 104950 }, { "epoch": 40.46242774566474, "grad_norm": 0.3081962466239929, "learning_rate": 3.8151830443159925e-05, "loss": 0.32169960021972654, "step": 105000 }, { "epoch": 40.46242774566474, "eval_loss": 0.3860403895378113, "eval_runtime": 12.1619, "eval_samples_per_second": 1562.59, "eval_steps_per_second": 32.561, "step": 105000 }, { "epoch": 40.481695568400774, "grad_norm": 0.32808494567871094, "learning_rate": 3.80747591522158e-05, "loss": 0.31748716354370116, "step": 105050 }, { "epoch": 40.481695568400774, "eval_loss": 0.3876902163028717, "eval_runtime": 12.3468, "eval_samples_per_second": 1539.185, "eval_steps_per_second": 32.073, "step": 105050 }, { "epoch": 40.5009633911368, "grad_norm": 0.3287920653820038, "learning_rate": 3.799768786127168e-05, "loss": 0.3191215133666992, "step": 105100 }, { "epoch": 40.5009633911368, "eval_loss": 0.3836974501609802, "eval_runtime": 12.2867, "eval_samples_per_second": 1546.71, "eval_steps_per_second": 32.23, "step": 105100 }, { "epoch": 40.52023121387283, "grad_norm": 0.29675307869911194, "learning_rate": 3.792061657032755e-05, "loss": 0.3161991882324219, "step": 105150 }, { "epoch": 40.52023121387283, "eval_loss": 0.386461466550827, "eval_runtime": 12.4333, "eval_samples_per_second": 1528.473, "eval_steps_per_second": 31.85, "step": 105150 }, { "epoch": 40.539499036608866, "grad_norm": 0.32069525122642517, "learning_rate": 3.784354527938343e-05, "loss": 0.31414901733398437, "step": 105200 }, { "epoch": 40.539499036608866, "eval_loss": 0.38272666931152344, "eval_runtime": 12.393, "eval_samples_per_second": 1533.449, "eval_steps_per_second": 31.954, "step": 105200 }, { "epoch": 40.55876685934489, "grad_norm": 0.3214528560638428, "learning_rate": 3.776647398843931e-05, "loss": 0.32236785888671876, "step": 105250 }, { "epoch": 40.55876685934489, "eval_loss": 0.3855484127998352, "eval_runtime": 12.4026, "eval_samples_per_second": 1532.257, "eval_steps_per_second": 31.929, "step": 105250 }, { "epoch": 40.578034682080926, "grad_norm": 0.36510491371154785, "learning_rate": 3.768940269749519e-05, "loss": 0.31991971969604494, "step": 105300 }, { "epoch": 40.578034682080926, "eval_loss": 0.3806275725364685, "eval_runtime": 12.3962, "eval_samples_per_second": 1533.051, "eval_steps_per_second": 31.945, "step": 105300 }, { "epoch": 40.59730250481696, "grad_norm": 0.3487936556339264, "learning_rate": 3.761233140655106e-05, "loss": 0.3220771026611328, "step": 105350 }, { "epoch": 40.59730250481696, "eval_loss": 0.3821215331554413, "eval_runtime": 12.497, "eval_samples_per_second": 1520.683, "eval_steps_per_second": 31.688, "step": 105350 }, { "epoch": 40.616570327552985, "grad_norm": 0.3288983106613159, "learning_rate": 3.753526011560694e-05, "loss": 0.3186056137084961, "step": 105400 }, { "epoch": 40.616570327552985, "eval_loss": 0.37835004925727844, "eval_runtime": 12.466, "eval_samples_per_second": 1524.467, "eval_steps_per_second": 31.766, "step": 105400 }, { "epoch": 40.63583815028902, "grad_norm": 0.34094274044036865, "learning_rate": 3.7458188824662814e-05, "loss": 0.3208282852172852, "step": 105450 }, { "epoch": 40.63583815028902, "eval_loss": 0.38431859016418457, "eval_runtime": 12.3739, "eval_samples_per_second": 1535.819, "eval_steps_per_second": 32.003, "step": 105450 }, { "epoch": 40.65510597302505, "grad_norm": 0.3394392728805542, "learning_rate": 3.738111753371869e-05, "loss": 0.3240556335449219, "step": 105500 }, { "epoch": 40.65510597302505, "eval_loss": 0.38029083609580994, "eval_runtime": 12.4888, "eval_samples_per_second": 1521.682, "eval_steps_per_second": 31.708, "step": 105500 }, { "epoch": 40.67437379576108, "grad_norm": 0.32136157155036926, "learning_rate": 3.730404624277457e-05, "loss": 0.3180526542663574, "step": 105550 }, { "epoch": 40.67437379576108, "eval_loss": 0.3855363130569458, "eval_runtime": 12.0636, "eval_samples_per_second": 1575.321, "eval_steps_per_second": 32.826, "step": 105550 }, { "epoch": 40.69364161849711, "grad_norm": 0.3039543330669403, "learning_rate": 3.722697495183045e-05, "loss": 0.32097305297851564, "step": 105600 }, { "epoch": 40.69364161849711, "eval_loss": 0.38883692026138306, "eval_runtime": 12.2967, "eval_samples_per_second": 1545.459, "eval_steps_per_second": 32.204, "step": 105600 }, { "epoch": 40.712909441233144, "grad_norm": 0.3315683901309967, "learning_rate": 3.714990366088632e-05, "loss": 0.3220414733886719, "step": 105650 }, { "epoch": 40.712909441233144, "eval_loss": 0.3840700387954712, "eval_runtime": 12.2609, "eval_samples_per_second": 1549.974, "eval_steps_per_second": 32.298, "step": 105650 }, { "epoch": 40.73217726396917, "grad_norm": 0.32312530279159546, "learning_rate": 3.70728323699422e-05, "loss": 0.32394115447998045, "step": 105700 }, { "epoch": 40.73217726396917, "eval_loss": 0.37681400775909424, "eval_runtime": 12.3949, "eval_samples_per_second": 1533.214, "eval_steps_per_second": 31.949, "step": 105700 }, { "epoch": 40.7514450867052, "grad_norm": 0.34498634934425354, "learning_rate": 3.6995761078998076e-05, "loss": 0.3159974479675293, "step": 105750 }, { "epoch": 40.7514450867052, "eval_loss": 0.3867703974246979, "eval_runtime": 12.5651, "eval_samples_per_second": 1512.441, "eval_steps_per_second": 31.516, "step": 105750 }, { "epoch": 40.77071290944124, "grad_norm": 0.3088548481464386, "learning_rate": 3.691868978805395e-05, "loss": 0.320888671875, "step": 105800 }, { "epoch": 40.77071290944124, "eval_loss": 0.38889122009277344, "eval_runtime": 12.0942, "eval_samples_per_second": 1571.325, "eval_steps_per_second": 32.743, "step": 105800 }, { "epoch": 40.78998073217726, "grad_norm": 0.35948511958122253, "learning_rate": 3.684161849710983e-05, "loss": 0.3241624450683594, "step": 105850 }, { "epoch": 40.78998073217726, "eval_loss": 0.3786505460739136, "eval_runtime": 12.4051, "eval_samples_per_second": 1531.953, "eval_steps_per_second": 31.922, "step": 105850 }, { "epoch": 40.809248554913296, "grad_norm": 0.32197433710098267, "learning_rate": 3.6764547206165704e-05, "loss": 0.32028190612792967, "step": 105900 }, { "epoch": 40.809248554913296, "eval_loss": 0.37357667088508606, "eval_runtime": 12.2776, "eval_samples_per_second": 1547.863, "eval_steps_per_second": 32.254, "step": 105900 }, { "epoch": 40.82851637764932, "grad_norm": 0.338074654340744, "learning_rate": 3.668747591522158e-05, "loss": 0.3254270172119141, "step": 105950 }, { "epoch": 40.82851637764932, "eval_loss": 0.37792670726776123, "eval_runtime": 12.3095, "eval_samples_per_second": 1543.842, "eval_steps_per_second": 32.17, "step": 105950 }, { "epoch": 40.847784200385355, "grad_norm": 0.3771144151687622, "learning_rate": 3.661040462427745e-05, "loss": 0.3204991912841797, "step": 106000 }, { "epoch": 40.847784200385355, "eval_loss": 0.380256712436676, "eval_runtime": 12.2812, "eval_samples_per_second": 1547.406, "eval_steps_per_second": 32.244, "step": 106000 }, { "epoch": 40.86705202312139, "grad_norm": 0.33397355675697327, "learning_rate": 3.653333333333334e-05, "loss": 0.3212562942504883, "step": 106050 }, { "epoch": 40.86705202312139, "eval_loss": 0.37218016386032104, "eval_runtime": 12.2328, "eval_samples_per_second": 1553.529, "eval_steps_per_second": 32.372, "step": 106050 }, { "epoch": 40.886319845857415, "grad_norm": 0.3158644735813141, "learning_rate": 3.645626204238921e-05, "loss": 0.32004005432128907, "step": 106100 }, { "epoch": 40.886319845857415, "eval_loss": 0.38942062854766846, "eval_runtime": 12.3433, "eval_samples_per_second": 1539.615, "eval_steps_per_second": 32.082, "step": 106100 }, { "epoch": 40.90558766859345, "grad_norm": 0.3357080817222595, "learning_rate": 3.637919075144509e-05, "loss": 0.32798770904541014, "step": 106150 }, { "epoch": 40.90558766859345, "eval_loss": 0.38499438762664795, "eval_runtime": 12.5082, "eval_samples_per_second": 1519.325, "eval_steps_per_second": 31.659, "step": 106150 }, { "epoch": 40.92485549132948, "grad_norm": 0.313538134098053, "learning_rate": 3.6302119460500966e-05, "loss": 0.3167275619506836, "step": 106200 }, { "epoch": 40.92485549132948, "eval_loss": 0.38814523816108704, "eval_runtime": 12.6795, "eval_samples_per_second": 1498.793, "eval_steps_per_second": 31.231, "step": 106200 }, { "epoch": 40.94412331406551, "grad_norm": 0.38498589396476746, "learning_rate": 3.6225048169556844e-05, "loss": 0.3219877243041992, "step": 106250 }, { "epoch": 40.94412331406551, "eval_loss": 0.38676685094833374, "eval_runtime": 12.3103, "eval_samples_per_second": 1543.746, "eval_steps_per_second": 32.168, "step": 106250 }, { "epoch": 40.96339113680154, "grad_norm": 0.3071128726005554, "learning_rate": 3.6147976878612715e-05, "loss": 0.3168501281738281, "step": 106300 }, { "epoch": 40.96339113680154, "eval_loss": 0.38437142968177795, "eval_runtime": 12.3265, "eval_samples_per_second": 1541.717, "eval_steps_per_second": 32.126, "step": 106300 }, { "epoch": 40.982658959537574, "grad_norm": 0.30703869462013245, "learning_rate": 3.607090558766859e-05, "loss": 0.3238287353515625, "step": 106350 }, { "epoch": 40.982658959537574, "eval_loss": 0.3965759575366974, "eval_runtime": 12.656, "eval_samples_per_second": 1501.581, "eval_steps_per_second": 31.29, "step": 106350 }, { "epoch": 41.0019267822736, "grad_norm": 0.3148481249809265, "learning_rate": 3.599383429672447e-05, "loss": 0.31842962265014646, "step": 106400 }, { "epoch": 41.0019267822736, "eval_loss": 0.38208281993865967, "eval_runtime": 12.2967, "eval_samples_per_second": 1545.453, "eval_steps_per_second": 32.204, "step": 106400 }, { "epoch": 41.02119460500963, "grad_norm": 0.3330551087856293, "learning_rate": 3.591676300578034e-05, "loss": 0.32364559173583984, "step": 106450 }, { "epoch": 41.02119460500963, "eval_loss": 0.38698723912239075, "eval_runtime": 12.3771, "eval_samples_per_second": 1535.419, "eval_steps_per_second": 31.995, "step": 106450 }, { "epoch": 41.040462427745666, "grad_norm": 0.33233508467674255, "learning_rate": 3.583969171483623e-05, "loss": 0.320278205871582, "step": 106500 }, { "epoch": 41.040462427745666, "eval_loss": 0.3878120183944702, "eval_runtime": 12.2361, "eval_samples_per_second": 1553.109, "eval_steps_per_second": 32.363, "step": 106500 }, { "epoch": 41.05973025048169, "grad_norm": 0.3435944616794586, "learning_rate": 3.57626204238921e-05, "loss": 0.3155942916870117, "step": 106550 }, { "epoch": 41.05973025048169, "eval_loss": 0.3839638829231262, "eval_runtime": 12.2533, "eval_samples_per_second": 1550.924, "eval_steps_per_second": 32.318, "step": 106550 }, { "epoch": 41.078998073217726, "grad_norm": 0.3221978545188904, "learning_rate": 3.568554913294798e-05, "loss": 0.31455358505249026, "step": 106600 }, { "epoch": 41.078998073217726, "eval_loss": 0.386629581451416, "eval_runtime": 12.1232, "eval_samples_per_second": 1567.568, "eval_steps_per_second": 32.665, "step": 106600 }, { "epoch": 41.09826589595376, "grad_norm": 0.3219228684902191, "learning_rate": 3.5608477842003855e-05, "loss": 0.3197637939453125, "step": 106650 }, { "epoch": 41.09826589595376, "eval_loss": 0.38509875535964966, "eval_runtime": 12.302, "eval_samples_per_second": 1544.788, "eval_steps_per_second": 32.19, "step": 106650 }, { "epoch": 41.117533718689785, "grad_norm": 0.3049073815345764, "learning_rate": 3.553140655105973e-05, "loss": 0.32550098419189455, "step": 106700 }, { "epoch": 41.117533718689785, "eval_loss": 0.3814330995082855, "eval_runtime": 12.3028, "eval_samples_per_second": 1544.686, "eval_steps_per_second": 32.188, "step": 106700 }, { "epoch": 41.13680154142582, "grad_norm": 0.3011324405670166, "learning_rate": 3.5454335260115605e-05, "loss": 0.3229445648193359, "step": 106750 }, { "epoch": 41.13680154142582, "eval_loss": 0.38609635829925537, "eval_runtime": 12.2581, "eval_samples_per_second": 1550.328, "eval_steps_per_second": 32.305, "step": 106750 }, { "epoch": 41.15606936416185, "grad_norm": 0.32302746176719666, "learning_rate": 3.537726396917149e-05, "loss": 0.32338638305664064, "step": 106800 }, { "epoch": 41.15606936416185, "eval_loss": 0.38782382011413574, "eval_runtime": 12.357, "eval_samples_per_second": 1537.914, "eval_steps_per_second": 32.047, "step": 106800 }, { "epoch": 41.17533718689788, "grad_norm": 0.2978875935077667, "learning_rate": 3.530019267822736e-05, "loss": 0.3192987060546875, "step": 106850 }, { "epoch": 41.17533718689788, "eval_loss": 0.38025543093681335, "eval_runtime": 12.0886, "eval_samples_per_second": 1572.056, "eval_steps_per_second": 32.758, "step": 106850 }, { "epoch": 41.19460500963391, "grad_norm": 0.3582557141780853, "learning_rate": 3.522312138728324e-05, "loss": 0.32335552215576174, "step": 106900 }, { "epoch": 41.19460500963391, "eval_loss": 0.37870898842811584, "eval_runtime": 12.4774, "eval_samples_per_second": 1523.068, "eval_steps_per_second": 31.737, "step": 106900 }, { "epoch": 41.213872832369944, "grad_norm": 0.3042924404144287, "learning_rate": 3.514605009633912e-05, "loss": 0.3180807113647461, "step": 106950 }, { "epoch": 41.213872832369944, "eval_loss": 0.381035715341568, "eval_runtime": 12.3869, "eval_samples_per_second": 1534.206, "eval_steps_per_second": 31.969, "step": 106950 }, { "epoch": 41.23314065510597, "grad_norm": 0.3434860408306122, "learning_rate": 3.5068978805394995e-05, "loss": 0.32142364501953125, "step": 107000 }, { "epoch": 41.23314065510597, "eval_loss": 0.3754729628562927, "eval_runtime": 12.2775, "eval_samples_per_second": 1547.871, "eval_steps_per_second": 32.254, "step": 107000 }, { "epoch": 41.252408477842, "grad_norm": 0.3284461796283722, "learning_rate": 3.4991907514450867e-05, "loss": 0.3201649856567383, "step": 107050 }, { "epoch": 41.252408477842, "eval_loss": 0.38105666637420654, "eval_runtime": 12.2964, "eval_samples_per_second": 1545.498, "eval_steps_per_second": 32.205, "step": 107050 }, { "epoch": 41.27167630057804, "grad_norm": 0.3368210196495056, "learning_rate": 3.4914836223506745e-05, "loss": 0.3262393569946289, "step": 107100 }, { "epoch": 41.27167630057804, "eval_loss": 0.3824920952320099, "eval_runtime": 12.106, "eval_samples_per_second": 1569.8, "eval_steps_per_second": 32.711, "step": 107100 }, { "epoch": 41.29094412331406, "grad_norm": 0.3505050837993622, "learning_rate": 3.483776493256262e-05, "loss": 0.3265857696533203, "step": 107150 }, { "epoch": 41.29094412331406, "eval_loss": 0.39019477367401123, "eval_runtime": 12.3809, "eval_samples_per_second": 1534.944, "eval_steps_per_second": 31.985, "step": 107150 }, { "epoch": 41.310211946050096, "grad_norm": 0.34869706630706787, "learning_rate": 3.4760693641618494e-05, "loss": 0.3198947525024414, "step": 107200 }, { "epoch": 41.310211946050096, "eval_loss": 0.3832409381866455, "eval_runtime": 12.4314, "eval_samples_per_second": 1528.708, "eval_steps_per_second": 31.855, "step": 107200 }, { "epoch": 41.32947976878613, "grad_norm": 0.3720039129257202, "learning_rate": 3.468362235067438e-05, "loss": 0.322225341796875, "step": 107250 }, { "epoch": 41.32947976878613, "eval_loss": 0.3793172240257263, "eval_runtime": 12.3057, "eval_samples_per_second": 1544.323, "eval_steps_per_second": 32.18, "step": 107250 }, { "epoch": 41.348747591522155, "grad_norm": 0.3124217391014099, "learning_rate": 3.460655105973025e-05, "loss": 0.3174454307556152, "step": 107300 }, { "epoch": 41.348747591522155, "eval_loss": 0.3856099545955658, "eval_runtime": 12.2324, "eval_samples_per_second": 1553.576, "eval_steps_per_second": 32.373, "step": 107300 }, { "epoch": 41.36801541425819, "grad_norm": 0.3079361617565155, "learning_rate": 3.452947976878613e-05, "loss": 0.3198431968688965, "step": 107350 }, { "epoch": 41.36801541425819, "eval_loss": 0.3796779215335846, "eval_runtime": 12.3286, "eval_samples_per_second": 1541.454, "eval_steps_per_second": 32.12, "step": 107350 }, { "epoch": 41.38728323699422, "grad_norm": 0.37831681966781616, "learning_rate": 3.445240847784201e-05, "loss": 0.31653919219970705, "step": 107400 }, { "epoch": 41.38728323699422, "eval_loss": 0.3780270516872406, "eval_runtime": 12.105, "eval_samples_per_second": 1569.925, "eval_steps_per_second": 32.714, "step": 107400 }, { "epoch": 41.40655105973025, "grad_norm": 0.3444681763648987, "learning_rate": 3.4375337186897885e-05, "loss": 0.3219400787353516, "step": 107450 }, { "epoch": 41.40655105973025, "eval_loss": 0.3815130591392517, "eval_runtime": 12.2628, "eval_samples_per_second": 1549.731, "eval_steps_per_second": 32.293, "step": 107450 }, { "epoch": 41.42581888246628, "grad_norm": 0.3525955379009247, "learning_rate": 3.4298265895953756e-05, "loss": 0.3196061325073242, "step": 107500 }, { "epoch": 41.42581888246628, "eval_loss": 0.3792777359485626, "eval_runtime": 12.3746, "eval_samples_per_second": 1535.732, "eval_steps_per_second": 32.001, "step": 107500 }, { "epoch": 41.445086705202314, "grad_norm": 0.36031386256217957, "learning_rate": 3.422119460500964e-05, "loss": 0.3182371139526367, "step": 107550 }, { "epoch": 41.445086705202314, "eval_loss": 0.37890419363975525, "eval_runtime": 12.416, "eval_samples_per_second": 1530.6, "eval_steps_per_second": 31.894, "step": 107550 }, { "epoch": 41.46435452793834, "grad_norm": 0.3652874827384949, "learning_rate": 3.414412331406551e-05, "loss": 0.31935115814208986, "step": 107600 }, { "epoch": 41.46435452793834, "eval_loss": 0.3820631206035614, "eval_runtime": 12.4483, "eval_samples_per_second": 1526.631, "eval_steps_per_second": 31.812, "step": 107600 }, { "epoch": 41.48362235067437, "grad_norm": 0.3051069378852844, "learning_rate": 3.406705202312139e-05, "loss": 0.32301666259765627, "step": 107650 }, { "epoch": 41.48362235067437, "eval_loss": 0.37775692343711853, "eval_runtime": 12.535, "eval_samples_per_second": 1516.076, "eval_steps_per_second": 31.592, "step": 107650 }, { "epoch": 41.50289017341041, "grad_norm": 0.3388849198818207, "learning_rate": 3.398998073217727e-05, "loss": 0.3177818870544434, "step": 107700 }, { "epoch": 41.50289017341041, "eval_loss": 0.37817448377609253, "eval_runtime": 12.4748, "eval_samples_per_second": 1523.389, "eval_steps_per_second": 31.744, "step": 107700 }, { "epoch": 41.52215799614643, "grad_norm": 0.3293895423412323, "learning_rate": 3.391290944123314e-05, "loss": 0.3166658401489258, "step": 107750 }, { "epoch": 41.52215799614643, "eval_loss": 0.38053858280181885, "eval_runtime": 12.3116, "eval_samples_per_second": 1543.58, "eval_steps_per_second": 32.165, "step": 107750 }, { "epoch": 41.541425818882466, "grad_norm": 0.3314325511455536, "learning_rate": 3.383583815028902e-05, "loss": 0.3191865158081055, "step": 107800 }, { "epoch": 41.541425818882466, "eval_loss": 0.3805483281612396, "eval_runtime": 12.421, "eval_samples_per_second": 1529.987, "eval_steps_per_second": 31.881, "step": 107800 }, { "epoch": 41.5606936416185, "grad_norm": 0.3145412802696228, "learning_rate": 3.3758766859344896e-05, "loss": 0.31737548828125, "step": 107850 }, { "epoch": 41.5606936416185, "eval_loss": 0.3857901692390442, "eval_runtime": 12.3537, "eval_samples_per_second": 1538.329, "eval_steps_per_second": 32.055, "step": 107850 }, { "epoch": 41.579961464354525, "grad_norm": 0.3384053111076355, "learning_rate": 3.3681695568400774e-05, "loss": 0.32235820770263673, "step": 107900 }, { "epoch": 41.579961464354525, "eval_loss": 0.3770088255405426, "eval_runtime": 12.5223, "eval_samples_per_second": 1517.611, "eval_steps_per_second": 31.624, "step": 107900 }, { "epoch": 41.59922928709056, "grad_norm": 0.40733498334884644, "learning_rate": 3.3604624277456646e-05, "loss": 0.3216871643066406, "step": 107950 }, { "epoch": 41.59922928709056, "eval_loss": 0.37324270606040955, "eval_runtime": 12.1253, "eval_samples_per_second": 1567.305, "eval_steps_per_second": 32.659, "step": 107950 }, { "epoch": 41.61849710982659, "grad_norm": 0.3516022861003876, "learning_rate": 3.352755298651253e-05, "loss": 0.3190634727478027, "step": 108000 }, { "epoch": 41.61849710982659, "eval_loss": 0.3836805820465088, "eval_runtime": 12.5358, "eval_samples_per_second": 1515.983, "eval_steps_per_second": 31.59, "step": 108000 }, { "epoch": 41.63776493256262, "grad_norm": 0.2945832312107086, "learning_rate": 3.34504816955684e-05, "loss": 0.3219845962524414, "step": 108050 }, { "epoch": 41.63776493256262, "eval_loss": 0.3808353841304779, "eval_runtime": 12.291, "eval_samples_per_second": 1546.177, "eval_steps_per_second": 32.219, "step": 108050 }, { "epoch": 41.65703275529865, "grad_norm": 0.2826516926288605, "learning_rate": 3.337341040462428e-05, "loss": 0.3143598556518555, "step": 108100 }, { "epoch": 41.65703275529865, "eval_loss": 0.37924185395240784, "eval_runtime": 12.2884, "eval_samples_per_second": 1546.505, "eval_steps_per_second": 32.226, "step": 108100 }, { "epoch": 41.676300578034684, "grad_norm": 0.2827834486961365, "learning_rate": 3.329633911368016e-05, "loss": 0.31906490325927733, "step": 108150 }, { "epoch": 41.676300578034684, "eval_loss": 0.3784290552139282, "eval_runtime": 12.327, "eval_samples_per_second": 1541.654, "eval_steps_per_second": 32.125, "step": 108150 }, { "epoch": 41.69556840077071, "grad_norm": 0.3203343152999878, "learning_rate": 3.3219267822736036e-05, "loss": 0.3210406494140625, "step": 108200 }, { "epoch": 41.69556840077071, "eval_loss": 0.38286808133125305, "eval_runtime": 12.2541, "eval_samples_per_second": 1550.83, "eval_steps_per_second": 32.316, "step": 108200 }, { "epoch": 41.714836223506744, "grad_norm": 0.2929115891456604, "learning_rate": 3.314219653179191e-05, "loss": 0.3148599624633789, "step": 108250 }, { "epoch": 41.714836223506744, "eval_loss": 0.380614310503006, "eval_runtime": 12.3385, "eval_samples_per_second": 1540.216, "eval_steps_per_second": 32.095, "step": 108250 }, { "epoch": 41.73410404624278, "grad_norm": 0.31150946021080017, "learning_rate": 3.3065125240847786e-05, "loss": 0.31989408493041993, "step": 108300 }, { "epoch": 41.73410404624278, "eval_loss": 0.3814939558506012, "eval_runtime": 12.4737, "eval_samples_per_second": 1523.526, "eval_steps_per_second": 31.747, "step": 108300 }, { "epoch": 41.7533718689788, "grad_norm": 0.3645922541618347, "learning_rate": 3.2988053949903664e-05, "loss": 0.31892583847045897, "step": 108350 }, { "epoch": 41.7533718689788, "eval_loss": 0.3855477273464203, "eval_runtime": 12.3812, "eval_samples_per_second": 1534.908, "eval_steps_per_second": 31.984, "step": 108350 }, { "epoch": 41.772639691714836, "grad_norm": 0.299631804227829, "learning_rate": 3.2910982658959535e-05, "loss": 0.3205451965332031, "step": 108400 }, { "epoch": 41.772639691714836, "eval_loss": 0.3781200349330902, "eval_runtime": 12.3853, "eval_samples_per_second": 1534.401, "eval_steps_per_second": 31.973, "step": 108400 }, { "epoch": 41.79190751445087, "grad_norm": 0.3361656665802002, "learning_rate": 3.283391136801541e-05, "loss": 0.32404342651367185, "step": 108450 }, { "epoch": 41.79190751445087, "eval_loss": 0.38088908791542053, "eval_runtime": 12.4369, "eval_samples_per_second": 1528.038, "eval_steps_per_second": 31.841, "step": 108450 }, { "epoch": 41.811175337186896, "grad_norm": 0.31612628698349, "learning_rate": 3.275684007707129e-05, "loss": 0.3205419158935547, "step": 108500 }, { "epoch": 41.811175337186896, "eval_loss": 0.37974169850349426, "eval_runtime": 12.7676, "eval_samples_per_second": 1488.455, "eval_steps_per_second": 31.016, "step": 108500 }, { "epoch": 41.83044315992293, "grad_norm": 0.3353908658027649, "learning_rate": 3.267976878612717e-05, "loss": 0.3197814559936523, "step": 108550 }, { "epoch": 41.83044315992293, "eval_loss": 0.38052797317504883, "eval_runtime": 12.3475, "eval_samples_per_second": 1539.099, "eval_steps_per_second": 32.071, "step": 108550 }, { "epoch": 41.84971098265896, "grad_norm": 0.35925713181495667, "learning_rate": 3.260269749518304e-05, "loss": 0.3183636474609375, "step": 108600 }, { "epoch": 41.84971098265896, "eval_loss": 0.37766048312187195, "eval_runtime": 12.2785, "eval_samples_per_second": 1547.745, "eval_steps_per_second": 32.251, "step": 108600 }, { "epoch": 41.86897880539499, "grad_norm": 0.34208741784095764, "learning_rate": 3.2525626204238926e-05, "loss": 0.3202581024169922, "step": 108650 }, { "epoch": 41.86897880539499, "eval_loss": 0.382755309343338, "eval_runtime": 12.434, "eval_samples_per_second": 1528.394, "eval_steps_per_second": 31.848, "step": 108650 }, { "epoch": 41.88824662813102, "grad_norm": 0.36012858152389526, "learning_rate": 3.24485549132948e-05, "loss": 0.31214366912841796, "step": 108700 }, { "epoch": 41.88824662813102, "eval_loss": 0.3791314363479614, "eval_runtime": 12.4869, "eval_samples_per_second": 1521.909, "eval_steps_per_second": 31.713, "step": 108700 }, { "epoch": 41.907514450867055, "grad_norm": 0.32895681262016296, "learning_rate": 3.2371483622350675e-05, "loss": 0.3176622009277344, "step": 108750 }, { "epoch": 41.907514450867055, "eval_loss": 0.3746427297592163, "eval_runtime": 12.8035, "eval_samples_per_second": 1484.277, "eval_steps_per_second": 30.929, "step": 108750 }, { "epoch": 41.92678227360308, "grad_norm": 0.2973894774913788, "learning_rate": 3.229441233140655e-05, "loss": 0.31213085174560545, "step": 108800 }, { "epoch": 41.92678227360308, "eval_loss": 0.3860454857349396, "eval_runtime": 12.4473, "eval_samples_per_second": 1526.751, "eval_steps_per_second": 31.814, "step": 108800 }, { "epoch": 41.946050096339114, "grad_norm": 0.31518200039863586, "learning_rate": 3.221734104046243e-05, "loss": 0.3210936737060547, "step": 108850 }, { "epoch": 41.946050096339114, "eval_loss": 0.3788556456565857, "eval_runtime": 12.3706, "eval_samples_per_second": 1536.226, "eval_steps_per_second": 32.011, "step": 108850 }, { "epoch": 41.96531791907515, "grad_norm": 0.28458794951438904, "learning_rate": 3.21402697495183e-05, "loss": 0.3197541618347168, "step": 108900 }, { "epoch": 41.96531791907515, "eval_loss": 0.3828953206539154, "eval_runtime": 12.4091, "eval_samples_per_second": 1531.462, "eval_steps_per_second": 31.912, "step": 108900 }, { "epoch": 41.98458574181117, "grad_norm": 0.291456937789917, "learning_rate": 3.206319845857418e-05, "loss": 0.3169184875488281, "step": 108950 }, { "epoch": 41.98458574181117, "eval_loss": 0.37738215923309326, "eval_runtime": 12.4235, "eval_samples_per_second": 1529.678, "eval_steps_per_second": 31.875, "step": 108950 }, { "epoch": 42.00385356454721, "grad_norm": 0.3108833134174347, "learning_rate": 3.198612716763006e-05, "loss": 0.32060745239257815, "step": 109000 }, { "epoch": 42.00385356454721, "eval_loss": 0.3842185437679291, "eval_runtime": 12.3836, "eval_samples_per_second": 1534.61, "eval_steps_per_second": 31.978, "step": 109000 }, { "epoch": 42.02312138728324, "grad_norm": 0.30624985694885254, "learning_rate": 3.190905587668594e-05, "loss": 0.31886093139648436, "step": 109050 }, { "epoch": 42.02312138728324, "eval_loss": 0.38095179200172424, "eval_runtime": 12.5228, "eval_samples_per_second": 1517.551, "eval_steps_per_second": 31.622, "step": 109050 }, { "epoch": 42.042389210019266, "grad_norm": 0.36061665415763855, "learning_rate": 3.1831984585741815e-05, "loss": 0.3173149871826172, "step": 109100 }, { "epoch": 42.042389210019266, "eval_loss": 0.37469109892845154, "eval_runtime": 12.4895, "eval_samples_per_second": 1521.598, "eval_steps_per_second": 31.707, "step": 109100 }, { "epoch": 42.0616570327553, "grad_norm": 0.35089370608329773, "learning_rate": 3.1754913294797686e-05, "loss": 0.31964347839355467, "step": 109150 }, { "epoch": 42.0616570327553, "eval_loss": 0.38157930970191956, "eval_runtime": 12.3115, "eval_samples_per_second": 1543.602, "eval_steps_per_second": 32.165, "step": 109150 }, { "epoch": 42.08092485549133, "grad_norm": 0.3018496334552765, "learning_rate": 3.1677842003853565e-05, "loss": 0.3253359603881836, "step": 109200 }, { "epoch": 42.08092485549133, "eval_loss": 0.3858645260334015, "eval_runtime": 12.2991, "eval_samples_per_second": 1545.151, "eval_steps_per_second": 32.197, "step": 109200 }, { "epoch": 42.10019267822736, "grad_norm": 0.3420887887477875, "learning_rate": 3.160077071290944e-05, "loss": 0.3180549621582031, "step": 109250 }, { "epoch": 42.10019267822736, "eval_loss": 0.3782184422016144, "eval_runtime": 12.0864, "eval_samples_per_second": 1572.352, "eval_steps_per_second": 32.764, "step": 109250 }, { "epoch": 42.11946050096339, "grad_norm": 0.3380780816078186, "learning_rate": 3.152369942196532e-05, "loss": 0.3155638122558594, "step": 109300 }, { "epoch": 42.11946050096339, "eval_loss": 0.37580451369285583, "eval_runtime": 12.5421, "eval_samples_per_second": 1515.221, "eval_steps_per_second": 31.574, "step": 109300 }, { "epoch": 42.138728323699425, "grad_norm": 0.3526953160762787, "learning_rate": 3.144662813102119e-05, "loss": 0.3194369888305664, "step": 109350 }, { "epoch": 42.138728323699425, "eval_loss": 0.3798079192638397, "eval_runtime": 12.5266, "eval_samples_per_second": 1517.091, "eval_steps_per_second": 31.613, "step": 109350 }, { "epoch": 42.15799614643545, "grad_norm": 0.3088805377483368, "learning_rate": 3.136955684007708e-05, "loss": 0.31903331756591796, "step": 109400 }, { "epoch": 42.15799614643545, "eval_loss": 0.37884044647216797, "eval_runtime": 12.5334, "eval_samples_per_second": 1516.265, "eval_steps_per_second": 31.596, "step": 109400 }, { "epoch": 42.177263969171484, "grad_norm": 0.3531946539878845, "learning_rate": 3.129248554913295e-05, "loss": 0.32236461639404296, "step": 109450 }, { "epoch": 42.177263969171484, "eval_loss": 0.3829404413700104, "eval_runtime": 12.2989, "eval_samples_per_second": 1545.183, "eval_steps_per_second": 32.198, "step": 109450 }, { "epoch": 42.19653179190752, "grad_norm": 0.3350399434566498, "learning_rate": 3.1215414258188827e-05, "loss": 0.31794368743896484, "step": 109500 }, { "epoch": 42.19653179190752, "eval_loss": 0.383730947971344, "eval_runtime": 12.1261, "eval_samples_per_second": 1567.197, "eval_steps_per_second": 32.657, "step": 109500 }, { "epoch": 42.215799614643544, "grad_norm": 0.3273240923881531, "learning_rate": 3.1138342967244705e-05, "loss": 0.3228916549682617, "step": 109550 }, { "epoch": 42.215799614643544, "eval_loss": 0.3863111138343811, "eval_runtime": 12.4981, "eval_samples_per_second": 1520.546, "eval_steps_per_second": 31.685, "step": 109550 }, { "epoch": 42.23506743737958, "grad_norm": 0.35053551197052, "learning_rate": 3.106127167630058e-05, "loss": 0.32228919982910154, "step": 109600 }, { "epoch": 42.23506743737958, "eval_loss": 0.38563820719718933, "eval_runtime": 14.6806, "eval_samples_per_second": 1294.495, "eval_steps_per_second": 26.974, "step": 109600 }, { "epoch": 42.25433526011561, "grad_norm": 0.3232199549674988, "learning_rate": 3.0984200385356454e-05, "loss": 0.31871078491210936, "step": 109650 }, { "epoch": 42.25433526011561, "eval_loss": 0.3843730390071869, "eval_runtime": 12.3832, "eval_samples_per_second": 1534.66, "eval_steps_per_second": 31.979, "step": 109650 }, { "epoch": 42.273603082851636, "grad_norm": 0.3924799859523773, "learning_rate": 3.090712909441233e-05, "loss": 0.317854118347168, "step": 109700 }, { "epoch": 42.273603082851636, "eval_loss": 0.37854647636413574, "eval_runtime": 12.4809, "eval_samples_per_second": 1522.649, "eval_steps_per_second": 31.729, "step": 109700 }, { "epoch": 42.29287090558767, "grad_norm": 0.3098836839199066, "learning_rate": 3.083005780346821e-05, "loss": 0.31687973022460936, "step": 109750 }, { "epoch": 42.29287090558767, "eval_loss": 0.37412339448928833, "eval_runtime": 12.5096, "eval_samples_per_second": 1519.152, "eval_steps_per_second": 31.656, "step": 109750 }, { "epoch": 42.3121387283237, "grad_norm": 0.3456902801990509, "learning_rate": 3.075298651252408e-05, "loss": 0.3186471939086914, "step": 109800 }, { "epoch": 42.3121387283237, "eval_loss": 0.38042643666267395, "eval_runtime": 13.6377, "eval_samples_per_second": 1393.487, "eval_steps_per_second": 29.037, "step": 109800 }, { "epoch": 42.33140655105973, "grad_norm": 0.34090447425842285, "learning_rate": 3.0675915221579967e-05, "loss": 0.3140385055541992, "step": 109850 }, { "epoch": 42.33140655105973, "eval_loss": 0.37783604860305786, "eval_runtime": 12.4384, "eval_samples_per_second": 1527.848, "eval_steps_per_second": 31.837, "step": 109850 }, { "epoch": 42.35067437379576, "grad_norm": 0.3179584741592407, "learning_rate": 3.059884393063584e-05, "loss": 0.3199280548095703, "step": 109900 }, { "epoch": 42.35067437379576, "eval_loss": 0.38107818365097046, "eval_runtime": 12.4464, "eval_samples_per_second": 1526.863, "eval_steps_per_second": 31.816, "step": 109900 }, { "epoch": 42.369942196531795, "grad_norm": 0.2910359501838684, "learning_rate": 3.0521772639691716e-05, "loss": 0.3152055931091309, "step": 109950 }, { "epoch": 42.369942196531795, "eval_loss": 0.3838849365711212, "eval_runtime": 12.47, "eval_samples_per_second": 1523.974, "eval_steps_per_second": 31.756, "step": 109950 }, { "epoch": 42.38921001926782, "grad_norm": 0.33358311653137207, "learning_rate": 3.0444701348747594e-05, "loss": 0.317266845703125, "step": 110000 }, { "epoch": 42.38921001926782, "eval_loss": 0.3769039511680603, "eval_runtime": 12.7178, "eval_samples_per_second": 1494.279, "eval_steps_per_second": 31.137, "step": 110000 }, { "epoch": 42.408477842003855, "grad_norm": 0.2981705963611603, "learning_rate": 3.036763005780347e-05, "loss": 0.3181655502319336, "step": 110050 }, { "epoch": 42.408477842003855, "eval_loss": 0.3693256080150604, "eval_runtime": 12.4886, "eval_samples_per_second": 1521.704, "eval_steps_per_second": 31.709, "step": 110050 }, { "epoch": 42.42774566473989, "grad_norm": 0.3446158468723297, "learning_rate": 3.0290558766859344e-05, "loss": 0.316566219329834, "step": 110100 }, { "epoch": 42.42774566473989, "eval_loss": 0.3809666931629181, "eval_runtime": 12.1009, "eval_samples_per_second": 1570.468, "eval_steps_per_second": 32.725, "step": 110100 }, { "epoch": 42.447013487475914, "grad_norm": 0.33954381942749023, "learning_rate": 3.0213487475915225e-05, "loss": 0.3172496032714844, "step": 110150 }, { "epoch": 42.447013487475914, "eval_loss": 0.381475567817688, "eval_runtime": 12.4267, "eval_samples_per_second": 1529.29, "eval_steps_per_second": 31.867, "step": 110150 }, { "epoch": 42.46628131021195, "grad_norm": 0.32704418897628784, "learning_rate": 3.01364161849711e-05, "loss": 0.3158487510681152, "step": 110200 }, { "epoch": 42.46628131021195, "eval_loss": 0.38283178210258484, "eval_runtime": 12.2805, "eval_samples_per_second": 1547.496, "eval_steps_per_second": 32.246, "step": 110200 }, { "epoch": 42.48554913294798, "grad_norm": 0.30864349007606506, "learning_rate": 3.0059344894026975e-05, "loss": 0.31619314193725584, "step": 110250 }, { "epoch": 42.48554913294798, "eval_loss": 0.3787330389022827, "eval_runtime": 12.4139, "eval_samples_per_second": 1530.864, "eval_steps_per_second": 31.9, "step": 110250 }, { "epoch": 42.50481695568401, "grad_norm": 0.32197991013526917, "learning_rate": 2.9982273603082856e-05, "loss": 0.31785629272460936, "step": 110300 }, { "epoch": 42.50481695568401, "eval_loss": 0.3893280625343323, "eval_runtime": 12.3865, "eval_samples_per_second": 1534.25, "eval_steps_per_second": 31.97, "step": 110300 }, { "epoch": 42.52408477842004, "grad_norm": 0.32175591588020325, "learning_rate": 2.990520231213873e-05, "loss": 0.32100486755371094, "step": 110350 }, { "epoch": 42.52408477842004, "eval_loss": 0.38473084568977356, "eval_runtime": 12.4295, "eval_samples_per_second": 1528.945, "eval_steps_per_second": 31.86, "step": 110350 }, { "epoch": 42.543352601156066, "grad_norm": 0.31348273158073425, "learning_rate": 2.9828131021194606e-05, "loss": 0.312562255859375, "step": 110400 }, { "epoch": 42.543352601156066, "eval_loss": 0.3779340088367462, "eval_runtime": 12.454, "eval_samples_per_second": 1525.934, "eval_steps_per_second": 31.797, "step": 110400 }, { "epoch": 42.5626204238921, "grad_norm": 0.27987855672836304, "learning_rate": 2.9751059730250487e-05, "loss": 0.3205829620361328, "step": 110450 }, { "epoch": 42.5626204238921, "eval_loss": 0.3805375099182129, "eval_runtime": 12.2879, "eval_samples_per_second": 1546.56, "eval_steps_per_second": 32.227, "step": 110450 }, { "epoch": 42.58188824662813, "grad_norm": 0.3323763906955719, "learning_rate": 2.9673988439306362e-05, "loss": 0.31567838668823245, "step": 110500 }, { "epoch": 42.58188824662813, "eval_loss": 0.3885968029499054, "eval_runtime": 12.2892, "eval_samples_per_second": 1546.4, "eval_steps_per_second": 32.223, "step": 110500 }, { "epoch": 42.60115606936416, "grad_norm": 0.26724180579185486, "learning_rate": 2.9596917148362236e-05, "loss": 0.31786710739135743, "step": 110550 }, { "epoch": 42.60115606936416, "eval_loss": 0.38563838601112366, "eval_runtime": 12.4463, "eval_samples_per_second": 1526.879, "eval_steps_per_second": 31.817, "step": 110550 }, { "epoch": 42.62042389210019, "grad_norm": 0.31703633069992065, "learning_rate": 2.9519845857418115e-05, "loss": 0.3186818504333496, "step": 110600 }, { "epoch": 42.62042389210019, "eval_loss": 0.37730512022972107, "eval_runtime": 12.2995, "eval_samples_per_second": 1545.108, "eval_steps_per_second": 32.197, "step": 110600 }, { "epoch": 42.639691714836225, "grad_norm": 0.304837703704834, "learning_rate": 2.944277456647399e-05, "loss": 0.3212838363647461, "step": 110650 }, { "epoch": 42.639691714836225, "eval_loss": 0.38994523882865906, "eval_runtime": 12.4493, "eval_samples_per_second": 1526.514, "eval_steps_per_second": 31.809, "step": 110650 }, { "epoch": 42.65895953757225, "grad_norm": 0.34598714113235474, "learning_rate": 2.9365703275529864e-05, "loss": 0.31941192626953124, "step": 110700 }, { "epoch": 42.65895953757225, "eval_loss": 0.38774341344833374, "eval_runtime": 12.3521, "eval_samples_per_second": 1538.526, "eval_steps_per_second": 32.059, "step": 110700 }, { "epoch": 42.678227360308284, "grad_norm": 0.2997381389141083, "learning_rate": 2.9288631984585746e-05, "loss": 0.318465576171875, "step": 110750 }, { "epoch": 42.678227360308284, "eval_loss": 0.3822665214538574, "eval_runtime": 12.3578, "eval_samples_per_second": 1537.811, "eval_steps_per_second": 32.044, "step": 110750 }, { "epoch": 42.69749518304432, "grad_norm": 0.3744599521160126, "learning_rate": 2.921156069364162e-05, "loss": 0.32336097717285156, "step": 110800 }, { "epoch": 42.69749518304432, "eval_loss": 0.3823181986808777, "eval_runtime": 12.3558, "eval_samples_per_second": 1538.064, "eval_steps_per_second": 32.05, "step": 110800 }, { "epoch": 42.716763005780344, "grad_norm": 0.2869740128517151, "learning_rate": 2.9134489402697495e-05, "loss": 0.3190762901306152, "step": 110850 }, { "epoch": 42.716763005780344, "eval_loss": 0.3804805874824524, "eval_runtime": 12.3806, "eval_samples_per_second": 1534.979, "eval_steps_per_second": 31.985, "step": 110850 }, { "epoch": 42.73603082851638, "grad_norm": 0.3735945522785187, "learning_rate": 2.905741811175337e-05, "loss": 0.3204268646240234, "step": 110900 }, { "epoch": 42.73603082851638, "eval_loss": 0.3812357187271118, "eval_runtime": 12.5143, "eval_samples_per_second": 1518.588, "eval_steps_per_second": 31.644, "step": 110900 }, { "epoch": 42.75529865125241, "grad_norm": 0.3263665437698364, "learning_rate": 2.898034682080925e-05, "loss": 0.3223684310913086, "step": 110950 }, { "epoch": 42.75529865125241, "eval_loss": 0.3767101764678955, "eval_runtime": 12.4999, "eval_samples_per_second": 1520.328, "eval_steps_per_second": 31.68, "step": 110950 }, { "epoch": 42.774566473988436, "grad_norm": 0.3200741410255432, "learning_rate": 2.8903275529865126e-05, "loss": 0.31877140045166014, "step": 111000 }, { "epoch": 42.774566473988436, "eval_loss": 0.3770425021648407, "eval_runtime": 12.5568, "eval_samples_per_second": 1513.443, "eval_steps_per_second": 31.537, "step": 111000 }, { "epoch": 42.79383429672447, "grad_norm": 0.3610881567001343, "learning_rate": 2.8826204238921e-05, "loss": 0.3213713836669922, "step": 111050 }, { "epoch": 42.79383429672447, "eval_loss": 0.38048869371414185, "eval_runtime": 12.2803, "eval_samples_per_second": 1547.517, "eval_steps_per_second": 32.247, "step": 111050 }, { "epoch": 42.8131021194605, "grad_norm": 0.33639150857925415, "learning_rate": 2.8749132947976882e-05, "loss": 0.31600414276123046, "step": 111100 }, { "epoch": 42.8131021194605, "eval_loss": 0.3815907835960388, "eval_runtime": 12.286, "eval_samples_per_second": 1546.798, "eval_steps_per_second": 32.232, "step": 111100 }, { "epoch": 42.83236994219653, "grad_norm": 0.3373524248600006, "learning_rate": 2.8672061657032757e-05, "loss": 0.31531194686889646, "step": 111150 }, { "epoch": 42.83236994219653, "eval_loss": 0.3793480098247528, "eval_runtime": 12.255, "eval_samples_per_second": 1550.718, "eval_steps_per_second": 32.313, "step": 111150 }, { "epoch": 42.85163776493256, "grad_norm": 0.31382426619529724, "learning_rate": 2.859499036608863e-05, "loss": 0.32087928771972657, "step": 111200 }, { "epoch": 42.85163776493256, "eval_loss": 0.38825127482414246, "eval_runtime": 12.277, "eval_samples_per_second": 1547.937, "eval_steps_per_second": 32.255, "step": 111200 }, { "epoch": 42.870905587668595, "grad_norm": 0.3469623625278473, "learning_rate": 2.851791907514451e-05, "loss": 0.31995315551757814, "step": 111250 }, { "epoch": 42.870905587668595, "eval_loss": 0.3831827640533447, "eval_runtime": 12.4326, "eval_samples_per_second": 1528.559, "eval_steps_per_second": 31.852, "step": 111250 }, { "epoch": 42.89017341040462, "grad_norm": 0.2518059313297272, "learning_rate": 2.8440847784200388e-05, "loss": 0.3142250442504883, "step": 111300 }, { "epoch": 42.89017341040462, "eval_loss": 0.3787035644054413, "eval_runtime": 12.4977, "eval_samples_per_second": 1520.598, "eval_steps_per_second": 31.686, "step": 111300 }, { "epoch": 42.909441233140655, "grad_norm": 0.3459382951259613, "learning_rate": 2.8363776493256263e-05, "loss": 0.31866573333740233, "step": 111350 }, { "epoch": 42.909441233140655, "eval_loss": 0.3867740035057068, "eval_runtime": 12.2045, "eval_samples_per_second": 1557.132, "eval_steps_per_second": 32.447, "step": 111350 }, { "epoch": 42.92870905587669, "grad_norm": 0.28773921728134155, "learning_rate": 2.828670520231214e-05, "loss": 0.318884391784668, "step": 111400 }, { "epoch": 42.92870905587669, "eval_loss": 0.38437360525131226, "eval_runtime": 12.2887, "eval_samples_per_second": 1546.46, "eval_steps_per_second": 32.225, "step": 111400 }, { "epoch": 42.947976878612714, "grad_norm": 0.36327308416366577, "learning_rate": 2.8209633911368015e-05, "loss": 0.31813100814819334, "step": 111450 }, { "epoch": 42.947976878612714, "eval_loss": 0.37909311056137085, "eval_runtime": 12.3348, "eval_samples_per_second": 1540.678, "eval_steps_per_second": 32.104, "step": 111450 }, { "epoch": 42.96724470134875, "grad_norm": 0.32063165307044983, "learning_rate": 2.813256262042389e-05, "loss": 0.32028106689453123, "step": 111500 }, { "epoch": 42.96724470134875, "eval_loss": 0.38737958669662476, "eval_runtime": 12.3285, "eval_samples_per_second": 1541.466, "eval_steps_per_second": 32.121, "step": 111500 }, { "epoch": 42.98651252408478, "grad_norm": 0.3203658163547516, "learning_rate": 2.805549132947977e-05, "loss": 0.31509876251220703, "step": 111550 }, { "epoch": 42.98651252408478, "eval_loss": 0.3755653202533722, "eval_runtime": 12.3131, "eval_samples_per_second": 1543.396, "eval_steps_per_second": 32.161, "step": 111550 }, { "epoch": 43.005780346820806, "grad_norm": 0.3753902316093445, "learning_rate": 2.7978420038535646e-05, "loss": 0.3226659393310547, "step": 111600 }, { "epoch": 43.005780346820806, "eval_loss": 0.37637045979499817, "eval_runtime": 12.3409, "eval_samples_per_second": 1539.921, "eval_steps_per_second": 32.088, "step": 111600 }, { "epoch": 43.02504816955684, "grad_norm": 0.3590492904186249, "learning_rate": 2.790134874759152e-05, "loss": 0.3167324447631836, "step": 111650 }, { "epoch": 43.02504816955684, "eval_loss": 0.38421276211738586, "eval_runtime": 12.1158, "eval_samples_per_second": 1568.527, "eval_steps_per_second": 32.685, "step": 111650 }, { "epoch": 43.04431599229287, "grad_norm": 0.32073840498924255, "learning_rate": 2.7824277456647403e-05, "loss": 0.3203307723999023, "step": 111700 }, { "epoch": 43.04431599229287, "eval_loss": 0.3752534091472626, "eval_runtime": 12.4478, "eval_samples_per_second": 1526.692, "eval_steps_per_second": 31.813, "step": 111700 }, { "epoch": 43.0635838150289, "grad_norm": 0.31798526644706726, "learning_rate": 2.7747206165703277e-05, "loss": 0.31652599334716797, "step": 111750 }, { "epoch": 43.0635838150289, "eval_loss": 0.382441908121109, "eval_runtime": 12.3292, "eval_samples_per_second": 1541.375, "eval_steps_per_second": 32.119, "step": 111750 }, { "epoch": 43.08285163776493, "grad_norm": 0.31354090571403503, "learning_rate": 2.7670134874759152e-05, "loss": 0.31711803436279296, "step": 111800 }, { "epoch": 43.08285163776493, "eval_loss": 0.3844451308250427, "eval_runtime": 12.2857, "eval_samples_per_second": 1546.841, "eval_steps_per_second": 32.233, "step": 111800 }, { "epoch": 43.102119460500965, "grad_norm": 0.3170780837535858, "learning_rate": 2.7593063583815034e-05, "loss": 0.31922834396362304, "step": 111850 }, { "epoch": 43.102119460500965, "eval_loss": 0.3786402940750122, "eval_runtime": 12.273, "eval_samples_per_second": 1548.441, "eval_steps_per_second": 32.266, "step": 111850 }, { "epoch": 43.12138728323699, "grad_norm": 0.3203486204147339, "learning_rate": 2.751599229287091e-05, "loss": 0.3181783676147461, "step": 111900 }, { "epoch": 43.12138728323699, "eval_loss": 0.38029271364212036, "eval_runtime": 12.3398, "eval_samples_per_second": 1540.059, "eval_steps_per_second": 32.091, "step": 111900 }, { "epoch": 43.140655105973025, "grad_norm": 0.35296279191970825, "learning_rate": 2.7438921001926783e-05, "loss": 0.3200511932373047, "step": 111950 }, { "epoch": 43.140655105973025, "eval_loss": 0.3805944323539734, "eval_runtime": 12.3401, "eval_samples_per_second": 1540.017, "eval_steps_per_second": 32.09, "step": 111950 }, { "epoch": 43.15992292870906, "grad_norm": 0.30983853340148926, "learning_rate": 2.736184971098266e-05, "loss": 0.3141751670837402, "step": 112000 }, { "epoch": 43.15992292870906, "eval_loss": 0.37521421909332275, "eval_runtime": 12.2752, "eval_samples_per_second": 1548.165, "eval_steps_per_second": 32.26, "step": 112000 }, { "epoch": 43.179190751445084, "grad_norm": 0.3252648115158081, "learning_rate": 2.7284778420038536e-05, "loss": 0.31713741302490234, "step": 112050 }, { "epoch": 43.179190751445084, "eval_loss": 0.374932199716568, "eval_runtime": 12.3078, "eval_samples_per_second": 1544.066, "eval_steps_per_second": 32.175, "step": 112050 }, { "epoch": 43.19845857418112, "grad_norm": 0.3417247533798218, "learning_rate": 2.720770712909441e-05, "loss": 0.31803604125976564, "step": 112100 }, { "epoch": 43.19845857418112, "eval_loss": 0.3779945969581604, "eval_runtime": 12.3092, "eval_samples_per_second": 1543.888, "eval_steps_per_second": 32.171, "step": 112100 }, { "epoch": 43.21772639691715, "grad_norm": 0.3117140531539917, "learning_rate": 2.7130635838150292e-05, "loss": 0.31660152435302735, "step": 112150 }, { "epoch": 43.21772639691715, "eval_loss": 0.38543426990509033, "eval_runtime": 14.3099, "eval_samples_per_second": 1328.032, "eval_steps_per_second": 27.673, "step": 112150 }, { "epoch": 43.23699421965318, "grad_norm": 0.3272820711135864, "learning_rate": 2.7053564547206167e-05, "loss": 0.31506216049194335, "step": 112200 }, { "epoch": 43.23699421965318, "eval_loss": 0.38472941517829895, "eval_runtime": 12.5252, "eval_samples_per_second": 1517.263, "eval_steps_per_second": 31.616, "step": 112200 }, { "epoch": 43.25626204238921, "grad_norm": 0.3199763000011444, "learning_rate": 2.697649325626204e-05, "loss": 0.32083282470703123, "step": 112250 }, { "epoch": 43.25626204238921, "eval_loss": 0.37817248702049255, "eval_runtime": 12.1344, "eval_samples_per_second": 1566.126, "eval_steps_per_second": 32.634, "step": 112250 }, { "epoch": 43.27552986512524, "grad_norm": 0.33522242307662964, "learning_rate": 2.6899421965317923e-05, "loss": 0.3167445755004883, "step": 112300 }, { "epoch": 43.27552986512524, "eval_loss": 0.38238993287086487, "eval_runtime": 15.2718, "eval_samples_per_second": 1244.389, "eval_steps_per_second": 25.93, "step": 112300 }, { "epoch": 43.29479768786127, "grad_norm": 0.3657277524471283, "learning_rate": 2.6822350674373798e-05, "loss": 0.3159714889526367, "step": 112350 }, { "epoch": 43.29479768786127, "eval_loss": 0.383439838886261, "eval_runtime": 12.4968, "eval_samples_per_second": 1520.71, "eval_steps_per_second": 31.688, "step": 112350 }, { "epoch": 43.3140655105973, "grad_norm": 0.3426041305065155, "learning_rate": 2.6745279383429673e-05, "loss": 0.3179323387145996, "step": 112400 }, { "epoch": 43.3140655105973, "eval_loss": 0.38316279649734497, "eval_runtime": 12.4313, "eval_samples_per_second": 1528.721, "eval_steps_per_second": 31.855, "step": 112400 }, { "epoch": 43.333333333333336, "grad_norm": 0.34740641713142395, "learning_rate": 2.6668208092485554e-05, "loss": 0.3163179016113281, "step": 112450 }, { "epoch": 43.333333333333336, "eval_loss": 0.3896893560886383, "eval_runtime": 12.502, "eval_samples_per_second": 1520.072, "eval_steps_per_second": 31.675, "step": 112450 }, { "epoch": 43.35260115606936, "grad_norm": 0.33207157254219055, "learning_rate": 2.659113680154143e-05, "loss": 0.3169574356079102, "step": 112500 }, { "epoch": 43.35260115606936, "eval_loss": 0.38549718260765076, "eval_runtime": 12.5381, "eval_samples_per_second": 1515.7, "eval_steps_per_second": 31.584, "step": 112500 }, { "epoch": 43.371868978805395, "grad_norm": 0.3526310324668884, "learning_rate": 2.6514065510597303e-05, "loss": 0.3175938987731934, "step": 112550 }, { "epoch": 43.371868978805395, "eval_loss": 0.3874591290950775, "eval_runtime": 13.4192, "eval_samples_per_second": 1416.184, "eval_steps_per_second": 29.51, "step": 112550 }, { "epoch": 43.39113680154143, "grad_norm": 0.2773512601852417, "learning_rate": 2.643699421965318e-05, "loss": 0.315380859375, "step": 112600 }, { "epoch": 43.39113680154143, "eval_loss": 0.3745221495628357, "eval_runtime": 12.3536, "eval_samples_per_second": 1538.335, "eval_steps_per_second": 32.055, "step": 112600 }, { "epoch": 43.410404624277454, "grad_norm": 0.31864434480667114, "learning_rate": 2.6359922928709056e-05, "loss": 0.3148013687133789, "step": 112650 }, { "epoch": 43.410404624277454, "eval_loss": 0.37397152185440063, "eval_runtime": 12.3665, "eval_samples_per_second": 1536.735, "eval_steps_per_second": 32.022, "step": 112650 }, { "epoch": 43.42967244701349, "grad_norm": 0.3007570505142212, "learning_rate": 2.628285163776493e-05, "loss": 0.3182581901550293, "step": 112700 }, { "epoch": 43.42967244701349, "eval_loss": 0.3869824707508087, "eval_runtime": 12.3361, "eval_samples_per_second": 1540.517, "eval_steps_per_second": 32.101, "step": 112700 }, { "epoch": 43.44894026974952, "grad_norm": 0.33048325777053833, "learning_rate": 2.6205780346820813e-05, "loss": 0.3213844299316406, "step": 112750 }, { "epoch": 43.44894026974952, "eval_loss": 0.37880247831344604, "eval_runtime": 12.3662, "eval_samples_per_second": 1536.768, "eval_steps_per_second": 32.023, "step": 112750 }, { "epoch": 43.46820809248555, "grad_norm": 0.2937198281288147, "learning_rate": 2.6128709055876687e-05, "loss": 0.32021537780761716, "step": 112800 }, { "epoch": 43.46820809248555, "eval_loss": 0.3837073743343353, "eval_runtime": 12.1133, "eval_samples_per_second": 1568.859, "eval_steps_per_second": 32.691, "step": 112800 }, { "epoch": 43.48747591522158, "grad_norm": 0.33313629031181335, "learning_rate": 2.6051637764932562e-05, "loss": 0.316108455657959, "step": 112850 }, { "epoch": 43.48747591522158, "eval_loss": 0.37402912974357605, "eval_runtime": 12.4302, "eval_samples_per_second": 1528.853, "eval_steps_per_second": 31.858, "step": 112850 }, { "epoch": 43.50674373795761, "grad_norm": 0.3646543622016907, "learning_rate": 2.5974566473988444e-05, "loss": 0.31682174682617187, "step": 112900 }, { "epoch": 43.50674373795761, "eval_loss": 0.38820523023605347, "eval_runtime": 12.3931, "eval_samples_per_second": 1533.435, "eval_steps_per_second": 31.953, "step": 112900 }, { "epoch": 43.52601156069364, "grad_norm": 0.313250869512558, "learning_rate": 2.5897495183044318e-05, "loss": 0.32004772186279296, "step": 112950 }, { "epoch": 43.52601156069364, "eval_loss": 0.3861270248889923, "eval_runtime": 12.3006, "eval_samples_per_second": 1544.963, "eval_steps_per_second": 32.193, "step": 112950 }, { "epoch": 43.54527938342967, "grad_norm": 0.30704498291015625, "learning_rate": 2.5820423892100193e-05, "loss": 0.32091072082519534, "step": 113000 }, { "epoch": 43.54527938342967, "eval_loss": 0.38011083006858826, "eval_runtime": 12.2771, "eval_samples_per_second": 1547.917, "eval_steps_per_second": 32.255, "step": 113000 }, { "epoch": 43.564547206165706, "grad_norm": 0.33439019322395325, "learning_rate": 2.5743352601156074e-05, "loss": 0.3215266036987305, "step": 113050 }, { "epoch": 43.564547206165706, "eval_loss": 0.3796263635158539, "eval_runtime": 12.4775, "eval_samples_per_second": 1523.058, "eval_steps_per_second": 31.737, "step": 113050 }, { "epoch": 43.58381502890173, "grad_norm": 0.37154215574264526, "learning_rate": 2.566628131021195e-05, "loss": 0.3165155029296875, "step": 113100 }, { "epoch": 43.58381502890173, "eval_loss": 0.384254515171051, "eval_runtime": 12.3289, "eval_samples_per_second": 1541.419, "eval_steps_per_second": 32.12, "step": 113100 }, { "epoch": 43.603082851637765, "grad_norm": 0.32764461636543274, "learning_rate": 2.5589210019267824e-05, "loss": 0.31572704315185546, "step": 113150 }, { "epoch": 43.603082851637765, "eval_loss": 0.3765805959701538, "eval_runtime": 12.4701, "eval_samples_per_second": 1523.971, "eval_steps_per_second": 31.756, "step": 113150 }, { "epoch": 43.6223506743738, "grad_norm": 0.34473854303359985, "learning_rate": 2.55121387283237e-05, "loss": 0.3203118133544922, "step": 113200 }, { "epoch": 43.6223506743738, "eval_loss": 0.374977707862854, "eval_runtime": 12.4708, "eval_samples_per_second": 1523.877, "eval_steps_per_second": 31.754, "step": 113200 }, { "epoch": 43.641618497109825, "grad_norm": 0.31350404024124146, "learning_rate": 2.5435067437379577e-05, "loss": 0.3238964080810547, "step": 113250 }, { "epoch": 43.641618497109825, "eval_loss": 0.3776686489582062, "eval_runtime": 12.4926, "eval_samples_per_second": 1521.216, "eval_steps_per_second": 31.699, "step": 113250 }, { "epoch": 43.66088631984586, "grad_norm": 0.33807340264320374, "learning_rate": 2.535799614643545e-05, "loss": 0.3165584373474121, "step": 113300 }, { "epoch": 43.66088631984586, "eval_loss": 0.37795570492744446, "eval_runtime": 12.4868, "eval_samples_per_second": 1521.925, "eval_steps_per_second": 31.713, "step": 113300 }, { "epoch": 43.68015414258189, "grad_norm": 0.33682283759117126, "learning_rate": 2.528092485549133e-05, "loss": 0.31933767318725587, "step": 113350 }, { "epoch": 43.68015414258189, "eval_loss": 0.38470983505249023, "eval_runtime": 12.4617, "eval_samples_per_second": 1524.994, "eval_steps_per_second": 31.777, "step": 113350 }, { "epoch": 43.69942196531792, "grad_norm": 0.30779871344566345, "learning_rate": 2.5203853564547208e-05, "loss": 0.3142449569702148, "step": 113400 }, { "epoch": 43.69942196531792, "eval_loss": 0.3836933672428131, "eval_runtime": 12.4225, "eval_samples_per_second": 1529.807, "eval_steps_per_second": 31.878, "step": 113400 }, { "epoch": 43.71868978805395, "grad_norm": 0.30461418628692627, "learning_rate": 2.5126782273603082e-05, "loss": 0.3198283386230469, "step": 113450 }, { "epoch": 43.71868978805395, "eval_loss": 0.37391602993011475, "eval_runtime": 12.4943, "eval_samples_per_second": 1521.017, "eval_steps_per_second": 31.695, "step": 113450 }, { "epoch": 43.737957610789984, "grad_norm": 0.3340475857257843, "learning_rate": 2.5049710982658957e-05, "loss": 0.31617103576660155, "step": 113500 }, { "epoch": 43.737957610789984, "eval_loss": 0.3759969174861908, "eval_runtime": 12.4963, "eval_samples_per_second": 1520.776, "eval_steps_per_second": 31.689, "step": 113500 }, { "epoch": 43.75722543352601, "grad_norm": 0.30790457129478455, "learning_rate": 2.497263969171484e-05, "loss": 0.31429370880126956, "step": 113550 }, { "epoch": 43.75722543352601, "eval_loss": 0.3775486648082733, "eval_runtime": 12.4582, "eval_samples_per_second": 1525.424, "eval_steps_per_second": 31.786, "step": 113550 }, { "epoch": 43.77649325626204, "grad_norm": 0.32379886507987976, "learning_rate": 2.4895568400770713e-05, "loss": 0.3175784683227539, "step": 113600 }, { "epoch": 43.77649325626204, "eval_loss": 0.37547677755355835, "eval_runtime": 12.454, "eval_samples_per_second": 1525.941, "eval_steps_per_second": 31.797, "step": 113600 }, { "epoch": 43.795761078998076, "grad_norm": 0.34801697731018066, "learning_rate": 2.481849710982659e-05, "loss": 0.32021877288818357, "step": 113650 }, { "epoch": 43.795761078998076, "eval_loss": 0.3733605146408081, "eval_runtime": 12.4378, "eval_samples_per_second": 1527.918, "eval_steps_per_second": 31.838, "step": 113650 }, { "epoch": 43.8150289017341, "grad_norm": 0.3368929922580719, "learning_rate": 2.474142581888247e-05, "loss": 0.32022247314453123, "step": 113700 }, { "epoch": 43.8150289017341, "eval_loss": 0.3831391930580139, "eval_runtime": 12.4954, "eval_samples_per_second": 1520.874, "eval_steps_per_second": 31.692, "step": 113700 }, { "epoch": 43.834296724470136, "grad_norm": 0.31170088052749634, "learning_rate": 2.4664354527938344e-05, "loss": 0.3166335678100586, "step": 113750 }, { "epoch": 43.834296724470136, "eval_loss": 0.38034749031066895, "eval_runtime": 12.6132, "eval_samples_per_second": 1506.675, "eval_steps_per_second": 31.396, "step": 113750 }, { "epoch": 43.85356454720617, "grad_norm": 0.3462192118167877, "learning_rate": 2.4587283236994223e-05, "loss": 0.32179267883300783, "step": 113800 }, { "epoch": 43.85356454720617, "eval_loss": 0.3842769265174866, "eval_runtime": 12.4726, "eval_samples_per_second": 1523.664, "eval_steps_per_second": 31.75, "step": 113800 }, { "epoch": 43.872832369942195, "grad_norm": 0.29232269525527954, "learning_rate": 2.4510211946050097e-05, "loss": 0.3143365097045898, "step": 113850 }, { "epoch": 43.872832369942195, "eval_loss": 0.3813326060771942, "eval_runtime": 12.4167, "eval_samples_per_second": 1530.52, "eval_steps_per_second": 31.893, "step": 113850 }, { "epoch": 43.89210019267823, "grad_norm": 0.32410410046577454, "learning_rate": 2.4433140655105975e-05, "loss": 0.31510200500488283, "step": 113900 }, { "epoch": 43.89210019267823, "eval_loss": 0.38154327869415283, "eval_runtime": 12.5593, "eval_samples_per_second": 1513.138, "eval_steps_per_second": 31.53, "step": 113900 }, { "epoch": 43.91136801541426, "grad_norm": 0.2706957757472992, "learning_rate": 2.435606936416185e-05, "loss": 0.319453125, "step": 113950 }, { "epoch": 43.91136801541426, "eval_loss": 0.37070074677467346, "eval_runtime": 12.6003, "eval_samples_per_second": 1508.22, "eval_steps_per_second": 31.428, "step": 113950 }, { "epoch": 43.93063583815029, "grad_norm": 0.3278619647026062, "learning_rate": 2.4278998073217725e-05, "loss": 0.3179777526855469, "step": 114000 }, { "epoch": 43.93063583815029, "eval_loss": 0.38294580578804016, "eval_runtime": 12.2334, "eval_samples_per_second": 1553.451, "eval_steps_per_second": 32.37, "step": 114000 }, { "epoch": 43.94990366088632, "grad_norm": 0.32492053508758545, "learning_rate": 2.4201926782273603e-05, "loss": 0.3126980400085449, "step": 114050 }, { "epoch": 43.94990366088632, "eval_loss": 0.3827511966228485, "eval_runtime": 12.5035, "eval_samples_per_second": 1519.9, "eval_steps_per_second": 31.671, "step": 114050 }, { "epoch": 43.969171483622354, "grad_norm": 0.3203409016132355, "learning_rate": 2.412485549132948e-05, "loss": 0.3133608055114746, "step": 114100 }, { "epoch": 43.969171483622354, "eval_loss": 0.37782782316207886, "eval_runtime": 12.6371, "eval_samples_per_second": 1503.823, "eval_steps_per_second": 31.336, "step": 114100 }, { "epoch": 43.98843930635838, "grad_norm": 0.34199976921081543, "learning_rate": 2.4047784200385356e-05, "loss": 0.31797256469726565, "step": 114150 }, { "epoch": 43.98843930635838, "eval_loss": 0.37376561760902405, "eval_runtime": 12.3912, "eval_samples_per_second": 1533.67, "eval_steps_per_second": 31.958, "step": 114150 }, { "epoch": 44.00770712909441, "grad_norm": 0.3400188386440277, "learning_rate": 2.3970712909441234e-05, "loss": 0.32588417053222657, "step": 114200 }, { "epoch": 44.00770712909441, "eval_loss": 0.38296931982040405, "eval_runtime": 12.52, "eval_samples_per_second": 1517.896, "eval_steps_per_second": 31.629, "step": 114200 }, { "epoch": 44.02697495183045, "grad_norm": 0.32454416155815125, "learning_rate": 2.3893641618497112e-05, "loss": 0.3212978744506836, "step": 114250 }, { "epoch": 44.02697495183045, "eval_loss": 0.381841778755188, "eval_runtime": 12.5011, "eval_samples_per_second": 1520.185, "eval_steps_per_second": 31.677, "step": 114250 }, { "epoch": 44.04624277456647, "grad_norm": 0.2890488803386688, "learning_rate": 2.3816570327552987e-05, "loss": 0.3171359062194824, "step": 114300 }, { "epoch": 44.04624277456647, "eval_loss": 0.3758659362792969, "eval_runtime": 12.5758, "eval_samples_per_second": 1511.153, "eval_steps_per_second": 31.489, "step": 114300 }, { "epoch": 44.065510597302506, "grad_norm": 0.35460665822029114, "learning_rate": 2.3739499036608865e-05, "loss": 0.31742279052734373, "step": 114350 }, { "epoch": 44.065510597302506, "eval_loss": 0.3813323676586151, "eval_runtime": 12.1362, "eval_samples_per_second": 1565.894, "eval_steps_per_second": 32.63, "step": 114350 }, { "epoch": 44.08477842003854, "grad_norm": 0.34255388379096985, "learning_rate": 2.3662427745664743e-05, "loss": 0.31607976913452146, "step": 114400 }, { "epoch": 44.08477842003854, "eval_loss": 0.3697819709777832, "eval_runtime": 12.3465, "eval_samples_per_second": 1539.216, "eval_steps_per_second": 32.074, "step": 114400 }, { "epoch": 44.104046242774565, "grad_norm": 0.30093586444854736, "learning_rate": 2.3585356454720618e-05, "loss": 0.3143572998046875, "step": 114450 }, { "epoch": 44.104046242774565, "eval_loss": 0.37418079376220703, "eval_runtime": 12.2827, "eval_samples_per_second": 1547.216, "eval_steps_per_second": 32.24, "step": 114450 }, { "epoch": 44.1233140655106, "grad_norm": 0.2958231270313263, "learning_rate": 2.3508285163776496e-05, "loss": 0.31079763412475586, "step": 114500 }, { "epoch": 44.1233140655106, "eval_loss": 0.3810518682003021, "eval_runtime": 12.3287, "eval_samples_per_second": 1541.439, "eval_steps_per_second": 32.12, "step": 114500 }, { "epoch": 44.14258188824663, "grad_norm": 0.30738914012908936, "learning_rate": 2.343121387283237e-05, "loss": 0.31278305053710936, "step": 114550 }, { "epoch": 44.14258188824663, "eval_loss": 0.38032087683677673, "eval_runtime": 12.3057, "eval_samples_per_second": 1544.328, "eval_steps_per_second": 32.18, "step": 114550 }, { "epoch": 44.16184971098266, "grad_norm": 0.35766786336898804, "learning_rate": 2.335414258188825e-05, "loss": 0.31551429748535154, "step": 114600 }, { "epoch": 44.16184971098266, "eval_loss": 0.38157767057418823, "eval_runtime": 12.3393, "eval_samples_per_second": 1540.115, "eval_steps_per_second": 32.092, "step": 114600 }, { "epoch": 44.18111753371869, "grad_norm": 0.34718501567840576, "learning_rate": 2.3277071290944123e-05, "loss": 0.3153957176208496, "step": 114650 }, { "epoch": 44.18111753371869, "eval_loss": 0.3756404519081116, "eval_runtime": 12.5173, "eval_samples_per_second": 1518.214, "eval_steps_per_second": 31.636, "step": 114650 }, { "epoch": 44.20038535645472, "grad_norm": 0.32358890771865845, "learning_rate": 2.32e-05, "loss": 0.3179976463317871, "step": 114700 }, { "epoch": 44.20038535645472, "eval_loss": 0.37536728382110596, "eval_runtime": 12.1314, "eval_samples_per_second": 1566.518, "eval_steps_per_second": 32.643, "step": 114700 }, { "epoch": 44.21965317919075, "grad_norm": 0.32825395464897156, "learning_rate": 2.3122928709055876e-05, "loss": 0.31601613998413086, "step": 114750 }, { "epoch": 44.21965317919075, "eval_loss": 0.38237783312797546, "eval_runtime": 12.3067, "eval_samples_per_second": 1544.204, "eval_steps_per_second": 32.178, "step": 114750 }, { "epoch": 44.238921001926784, "grad_norm": 0.3121371567249298, "learning_rate": 2.3045857418111754e-05, "loss": 0.31773641586303714, "step": 114800 }, { "epoch": 44.238921001926784, "eval_loss": 0.3720148205757141, "eval_runtime": 12.3139, "eval_samples_per_second": 1543.302, "eval_steps_per_second": 32.159, "step": 114800 }, { "epoch": 44.25818882466281, "grad_norm": 0.3189791440963745, "learning_rate": 2.2968786127167632e-05, "loss": 0.3213134765625, "step": 114850 }, { "epoch": 44.25818882466281, "eval_loss": 0.38044869899749756, "eval_runtime": 12.3, "eval_samples_per_second": 1545.045, "eval_steps_per_second": 32.195, "step": 114850 }, { "epoch": 44.27745664739884, "grad_norm": 0.3310971260070801, "learning_rate": 2.2891714836223507e-05, "loss": 0.31446453094482424, "step": 114900 }, { "epoch": 44.27745664739884, "eval_loss": 0.37627846002578735, "eval_runtime": 12.2811, "eval_samples_per_second": 1547.413, "eval_steps_per_second": 32.245, "step": 114900 }, { "epoch": 44.296724470134876, "grad_norm": 0.3539133667945862, "learning_rate": 2.2814643545279385e-05, "loss": 0.31611907958984375, "step": 114950 }, { "epoch": 44.296724470134876, "eval_loss": 0.38566428422927856, "eval_runtime": 12.2988, "eval_samples_per_second": 1545.188, "eval_steps_per_second": 32.198, "step": 114950 }, { "epoch": 44.3159922928709, "grad_norm": 0.3207661807537079, "learning_rate": 2.273757225433526e-05, "loss": 0.3182981491088867, "step": 115000 }, { "epoch": 44.3159922928709, "eval_loss": 0.3758276402950287, "eval_runtime": 12.297, "eval_samples_per_second": 1545.412, "eval_steps_per_second": 32.203, "step": 115000 }, { "epoch": 44.335260115606935, "grad_norm": 0.3211921751499176, "learning_rate": 2.2660500963391138e-05, "loss": 0.3178274154663086, "step": 115050 }, { "epoch": 44.335260115606935, "eval_loss": 0.3828713595867157, "eval_runtime": 12.1325, "eval_samples_per_second": 1566.37, "eval_steps_per_second": 32.64, "step": 115050 }, { "epoch": 44.35452793834297, "grad_norm": 0.29858505725860596, "learning_rate": 2.2583429672447016e-05, "loss": 0.31215078353881837, "step": 115100 }, { "epoch": 44.35452793834297, "eval_loss": 0.3781922161579132, "eval_runtime": 12.2792, "eval_samples_per_second": 1547.655, "eval_steps_per_second": 32.25, "step": 115100 }, { "epoch": 44.373795761078995, "grad_norm": 0.31394973397254944, "learning_rate": 2.250635838150289e-05, "loss": 0.31790409088134763, "step": 115150 }, { "epoch": 44.373795761078995, "eval_loss": 0.3757091164588928, "eval_runtime": 12.2708, "eval_samples_per_second": 1548.72, "eval_steps_per_second": 32.272, "step": 115150 }, { "epoch": 44.39306358381503, "grad_norm": 0.30410200357437134, "learning_rate": 2.242928709055877e-05, "loss": 0.31762611389160156, "step": 115200 }, { "epoch": 44.39306358381503, "eval_loss": 0.37595054507255554, "eval_runtime": 12.2758, "eval_samples_per_second": 1548.089, "eval_steps_per_second": 32.259, "step": 115200 }, { "epoch": 44.41233140655106, "grad_norm": 0.31472137570381165, "learning_rate": 2.2352215799614644e-05, "loss": 0.31871557235717773, "step": 115250 }, { "epoch": 44.41233140655106, "eval_loss": 0.3779892027378082, "eval_runtime": 12.3237, "eval_samples_per_second": 1542.066, "eval_steps_per_second": 32.133, "step": 115250 }, { "epoch": 44.43159922928709, "grad_norm": 0.34107232093811035, "learning_rate": 2.227514450867052e-05, "loss": 0.3216377258300781, "step": 115300 }, { "epoch": 44.43159922928709, "eval_loss": 0.37444043159484863, "eval_runtime": 12.2746, "eval_samples_per_second": 1548.24, "eval_steps_per_second": 32.262, "step": 115300 }, { "epoch": 44.45086705202312, "grad_norm": 0.3111012578010559, "learning_rate": 2.2198073217726397e-05, "loss": 0.3164173698425293, "step": 115350 }, { "epoch": 44.45086705202312, "eval_loss": 0.39405733346939087, "eval_runtime": 12.3775, "eval_samples_per_second": 1535.368, "eval_steps_per_second": 31.994, "step": 115350 }, { "epoch": 44.470134874759154, "grad_norm": 0.30981308221817017, "learning_rate": 2.2121001926782275e-05, "loss": 0.32107887268066404, "step": 115400 }, { "epoch": 44.470134874759154, "eval_loss": 0.3811919689178467, "eval_runtime": 12.1151, "eval_samples_per_second": 1568.621, "eval_steps_per_second": 32.686, "step": 115400 }, { "epoch": 44.48940269749518, "grad_norm": 0.370969295501709, "learning_rate": 2.204393063583815e-05, "loss": 0.32387351989746094, "step": 115450 }, { "epoch": 44.48940269749518, "eval_loss": 0.37636277079582214, "eval_runtime": 12.3303, "eval_samples_per_second": 1541.239, "eval_steps_per_second": 32.116, "step": 115450 }, { "epoch": 44.50867052023121, "grad_norm": 0.28018200397491455, "learning_rate": 2.1966859344894028e-05, "loss": 0.3198374176025391, "step": 115500 }, { "epoch": 44.50867052023121, "eval_loss": 0.38411301374435425, "eval_runtime": 12.4702, "eval_samples_per_second": 1523.95, "eval_steps_per_second": 31.756, "step": 115500 }, { "epoch": 44.527938342967246, "grad_norm": 0.3382354974746704, "learning_rate": 2.1889788053949906e-05, "loss": 0.31281515121459963, "step": 115550 }, { "epoch": 44.527938342967246, "eval_loss": 0.3786512613296509, "eval_runtime": 12.287, "eval_samples_per_second": 1546.681, "eval_steps_per_second": 32.229, "step": 115550 }, { "epoch": 44.54720616570327, "grad_norm": 0.33646029233932495, "learning_rate": 2.181271676300578e-05, "loss": 0.3162067794799805, "step": 115600 }, { "epoch": 44.54720616570327, "eval_loss": 0.3754532039165497, "eval_runtime": 12.3134, "eval_samples_per_second": 1543.364, "eval_steps_per_second": 32.16, "step": 115600 }, { "epoch": 44.566473988439306, "grad_norm": 0.3216371238231659, "learning_rate": 2.173564547206166e-05, "loss": 0.3195106887817383, "step": 115650 }, { "epoch": 44.566473988439306, "eval_loss": 0.3834928572177887, "eval_runtime": 12.2806, "eval_samples_per_second": 1547.479, "eval_steps_per_second": 32.246, "step": 115650 }, { "epoch": 44.58574181117534, "grad_norm": 0.3572232127189636, "learning_rate": 2.1658574181117537e-05, "loss": 0.31717910766601565, "step": 115700 }, { "epoch": 44.58574181117534, "eval_loss": 0.3754159212112427, "eval_runtime": 12.2831, "eval_samples_per_second": 1547.163, "eval_steps_per_second": 32.239, "step": 115700 }, { "epoch": 44.605009633911365, "grad_norm": 0.3428240418434143, "learning_rate": 2.158150289017341e-05, "loss": 0.31843143463134765, "step": 115750 }, { "epoch": 44.605009633911365, "eval_loss": 0.3833730220794678, "eval_runtime": 12.0912, "eval_samples_per_second": 1571.721, "eval_steps_per_second": 32.751, "step": 115750 }, { "epoch": 44.6242774566474, "grad_norm": 0.310301274061203, "learning_rate": 2.150443159922929e-05, "loss": 0.3164694786071777, "step": 115800 }, { "epoch": 44.6242774566474, "eval_loss": 0.3843063414096832, "eval_runtime": 12.3142, "eval_samples_per_second": 1543.254, "eval_steps_per_second": 32.158, "step": 115800 }, { "epoch": 44.64354527938343, "grad_norm": 0.2950308918952942, "learning_rate": 2.1427360308285164e-05, "loss": 0.31810392379760744, "step": 115850 }, { "epoch": 44.64354527938343, "eval_loss": 0.3778769373893738, "eval_runtime": 12.3275, "eval_samples_per_second": 1541.589, "eval_steps_per_second": 32.123, "step": 115850 }, { "epoch": 44.66281310211946, "grad_norm": 0.34716594219207764, "learning_rate": 2.1350289017341042e-05, "loss": 0.3176814842224121, "step": 115900 }, { "epoch": 44.66281310211946, "eval_loss": 0.3806772530078888, "eval_runtime": 12.3655, "eval_samples_per_second": 1536.851, "eval_steps_per_second": 32.024, "step": 115900 }, { "epoch": 44.68208092485549, "grad_norm": 0.3874136805534363, "learning_rate": 2.1273217726396917e-05, "loss": 0.3201847839355469, "step": 115950 }, { "epoch": 44.68208092485549, "eval_loss": 0.38224098086357117, "eval_runtime": 12.2977, "eval_samples_per_second": 1545.33, "eval_steps_per_second": 32.201, "step": 115950 }, { "epoch": 44.701348747591524, "grad_norm": 0.3297894597053528, "learning_rate": 2.1196146435452795e-05, "loss": 0.31566390991210935, "step": 116000 }, { "epoch": 44.701348747591524, "eval_loss": 0.38407188653945923, "eval_runtime": 12.3046, "eval_samples_per_second": 1544.459, "eval_steps_per_second": 32.183, "step": 116000 }, { "epoch": 44.72061657032755, "grad_norm": 0.2991618812084198, "learning_rate": 2.111907514450867e-05, "loss": 0.3198761749267578, "step": 116050 }, { "epoch": 44.72061657032755, "eval_loss": 0.3870941698551178, "eval_runtime": 12.2964, "eval_samples_per_second": 1545.491, "eval_steps_per_second": 32.205, "step": 116050 }, { "epoch": 44.73988439306358, "grad_norm": 0.30083540081977844, "learning_rate": 2.1042003853564548e-05, "loss": 0.3172170257568359, "step": 116100 }, { "epoch": 44.73988439306358, "eval_loss": 0.38393253087997437, "eval_runtime": 12.2412, "eval_samples_per_second": 1552.461, "eval_steps_per_second": 32.35, "step": 116100 }, { "epoch": 44.75915221579962, "grad_norm": 0.3624337315559387, "learning_rate": 2.0964932562620426e-05, "loss": 0.321322021484375, "step": 116150 }, { "epoch": 44.75915221579962, "eval_loss": 0.37831658124923706, "eval_runtime": 12.4399, "eval_samples_per_second": 1527.67, "eval_steps_per_second": 31.833, "step": 116150 }, { "epoch": 44.77842003853564, "grad_norm": 0.3573884665966034, "learning_rate": 2.08878612716763e-05, "loss": 0.31733409881591795, "step": 116200 }, { "epoch": 44.77842003853564, "eval_loss": 0.3719310462474823, "eval_runtime": 12.4282, "eval_samples_per_second": 1529.107, "eval_steps_per_second": 31.863, "step": 116200 }, { "epoch": 44.797687861271676, "grad_norm": 0.33050408959388733, "learning_rate": 2.081078998073218e-05, "loss": 0.3175558853149414, "step": 116250 }, { "epoch": 44.797687861271676, "eval_loss": 0.36558330059051514, "eval_runtime": 12.4802, "eval_samples_per_second": 1522.737, "eval_steps_per_second": 31.73, "step": 116250 }, { "epoch": 44.81695568400771, "grad_norm": 0.27746376395225525, "learning_rate": 2.0733718689788054e-05, "loss": 0.3170751953125, "step": 116300 }, { "epoch": 44.81695568400771, "eval_loss": 0.37522512674331665, "eval_runtime": 12.4795, "eval_samples_per_second": 1522.813, "eval_steps_per_second": 31.732, "step": 116300 }, { "epoch": 44.836223506743735, "grad_norm": 0.2980521321296692, "learning_rate": 2.0656647398843932e-05, "loss": 0.314088134765625, "step": 116350 }, { "epoch": 44.836223506743735, "eval_loss": 0.3745180666446686, "eval_runtime": 12.4703, "eval_samples_per_second": 1523.945, "eval_steps_per_second": 31.756, "step": 116350 }, { "epoch": 44.85549132947977, "grad_norm": 0.301089346408844, "learning_rate": 2.057957610789981e-05, "loss": 0.31228559494018554, "step": 116400 }, { "epoch": 44.85549132947977, "eval_loss": 0.3767491579055786, "eval_runtime": 12.446, "eval_samples_per_second": 1526.912, "eval_steps_per_second": 31.817, "step": 116400 }, { "epoch": 44.8747591522158, "grad_norm": 0.3328346610069275, "learning_rate": 2.0502504816955685e-05, "loss": 0.31691558837890627, "step": 116450 }, { "epoch": 44.8747591522158, "eval_loss": 0.3786751627922058, "eval_runtime": 12.2783, "eval_samples_per_second": 1547.777, "eval_steps_per_second": 32.252, "step": 116450 }, { "epoch": 44.89402697495183, "grad_norm": 0.32480883598327637, "learning_rate": 2.0425433526011563e-05, "loss": 0.3217074584960937, "step": 116500 }, { "epoch": 44.89402697495183, "eval_loss": 0.3792929947376251, "eval_runtime": 12.3627, "eval_samples_per_second": 1537.208, "eval_steps_per_second": 32.032, "step": 116500 }, { "epoch": 44.91329479768786, "grad_norm": 0.3460099995136261, "learning_rate": 2.0348362235067438e-05, "loss": 0.31549285888671874, "step": 116550 }, { "epoch": 44.91329479768786, "eval_loss": 0.370061993598938, "eval_runtime": 12.331, "eval_samples_per_second": 1541.154, "eval_steps_per_second": 32.114, "step": 116550 }, { "epoch": 44.932562620423894, "grad_norm": 0.32472115755081177, "learning_rate": 2.0271290944123312e-05, "loss": 0.31524139404296875, "step": 116600 }, { "epoch": 44.932562620423894, "eval_loss": 0.3777141571044922, "eval_runtime": 12.3454, "eval_samples_per_second": 1539.362, "eval_steps_per_second": 32.077, "step": 116600 }, { "epoch": 44.95183044315992, "grad_norm": 0.3221111595630646, "learning_rate": 2.019421965317919e-05, "loss": 0.3171316337585449, "step": 116650 }, { "epoch": 44.95183044315992, "eval_loss": 0.37753066420555115, "eval_runtime": 12.2923, "eval_samples_per_second": 1546.004, "eval_steps_per_second": 32.215, "step": 116650 }, { "epoch": 44.971098265895954, "grad_norm": 0.29997193813323975, "learning_rate": 2.011714836223507e-05, "loss": 0.314368896484375, "step": 116700 }, { "epoch": 44.971098265895954, "eval_loss": 0.37525317072868347, "eval_runtime": 12.3, "eval_samples_per_second": 1545.039, "eval_steps_per_second": 32.195, "step": 116700 }, { "epoch": 44.99036608863199, "grad_norm": 0.28102725744247437, "learning_rate": 2.0040077071290943e-05, "loss": 0.3150864791870117, "step": 116750 }, { "epoch": 44.99036608863199, "eval_loss": 0.377450168132782, "eval_runtime": 12.3346, "eval_samples_per_second": 1540.703, "eval_steps_per_second": 32.105, "step": 116750 }, { "epoch": 45.00963391136801, "grad_norm": 0.3258819580078125, "learning_rate": 1.996300578034682e-05, "loss": 0.31344310760498045, "step": 116800 }, { "epoch": 45.00963391136801, "eval_loss": 0.3782449960708618, "eval_runtime": 12.4814, "eval_samples_per_second": 1522.583, "eval_steps_per_second": 31.727, "step": 116800 }, { "epoch": 45.028901734104046, "grad_norm": 0.3585723042488098, "learning_rate": 1.98859344894027e-05, "loss": 0.3192028617858887, "step": 116850 }, { "epoch": 45.028901734104046, "eval_loss": 0.3782693147659302, "eval_runtime": 12.5571, "eval_samples_per_second": 1513.411, "eval_steps_per_second": 31.536, "step": 116850 }, { "epoch": 45.04816955684008, "grad_norm": 0.3001513183116913, "learning_rate": 1.9808863198458574e-05, "loss": 0.3193472290039063, "step": 116900 }, { "epoch": 45.04816955684008, "eval_loss": 0.3735579252243042, "eval_runtime": 12.5087, "eval_samples_per_second": 1519.258, "eval_steps_per_second": 31.658, "step": 116900 }, { "epoch": 45.067437379576106, "grad_norm": 0.3293183147907257, "learning_rate": 1.9731791907514452e-05, "loss": 0.3122613525390625, "step": 116950 }, { "epoch": 45.067437379576106, "eval_loss": 0.3729932904243469, "eval_runtime": 12.5216, "eval_samples_per_second": 1517.692, "eval_steps_per_second": 31.625, "step": 116950 }, { "epoch": 45.08670520231214, "grad_norm": 0.309392511844635, "learning_rate": 1.965472061657033e-05, "loss": 0.314786376953125, "step": 117000 }, { "epoch": 45.08670520231214, "eval_loss": 0.3808589279651642, "eval_runtime": 12.5497, "eval_samples_per_second": 1514.301, "eval_steps_per_second": 31.555, "step": 117000 }, { "epoch": 45.10597302504817, "grad_norm": 0.36831536889076233, "learning_rate": 1.9577649325626205e-05, "loss": 0.317233943939209, "step": 117050 }, { "epoch": 45.10597302504817, "eval_loss": 0.3706683814525604, "eval_runtime": 12.5089, "eval_samples_per_second": 1519.236, "eval_steps_per_second": 31.657, "step": 117050 }, { "epoch": 45.1252408477842, "grad_norm": 0.3502669036388397, "learning_rate": 1.9500578034682083e-05, "loss": 0.31790843963623044, "step": 117100 }, { "epoch": 45.1252408477842, "eval_loss": 0.37147513031959534, "eval_runtime": 12.508, "eval_samples_per_second": 1519.353, "eval_steps_per_second": 31.66, "step": 117100 }, { "epoch": 45.14450867052023, "grad_norm": 0.3433837592601776, "learning_rate": 1.942350674373796e-05, "loss": 0.31217010498046877, "step": 117150 }, { "epoch": 45.14450867052023, "eval_loss": 0.3730934262275696, "eval_runtime": 12.4058, "eval_samples_per_second": 1531.86, "eval_steps_per_second": 31.92, "step": 117150 }, { "epoch": 45.163776493256265, "grad_norm": 0.332608699798584, "learning_rate": 1.9346435452793836e-05, "loss": 0.3132847595214844, "step": 117200 }, { "epoch": 45.163776493256265, "eval_loss": 0.3765961527824402, "eval_runtime": 12.52, "eval_samples_per_second": 1517.897, "eval_steps_per_second": 31.63, "step": 117200 }, { "epoch": 45.18304431599229, "grad_norm": 0.3030131757259369, "learning_rate": 1.926936416184971e-05, "loss": 0.3184045791625977, "step": 117250 }, { "epoch": 45.18304431599229, "eval_loss": 0.3713468611240387, "eval_runtime": 12.5602, "eval_samples_per_second": 1513.035, "eval_steps_per_second": 31.528, "step": 117250 }, { "epoch": 45.202312138728324, "grad_norm": 0.30044758319854736, "learning_rate": 1.919229287090559e-05, "loss": 0.318929500579834, "step": 117300 }, { "epoch": 45.202312138728324, "eval_loss": 0.3806862533092499, "eval_runtime": 12.6233, "eval_samples_per_second": 1505.469, "eval_steps_per_second": 31.371, "step": 117300 }, { "epoch": 45.22157996146436, "grad_norm": 0.30143222212791443, "learning_rate": 1.9115221579961464e-05, "loss": 0.31131351470947266, "step": 117350 }, { "epoch": 45.22157996146436, "eval_loss": 0.37356749176979065, "eval_runtime": 12.5051, "eval_samples_per_second": 1519.697, "eval_steps_per_second": 31.667, "step": 117350 }, { "epoch": 45.24084778420038, "grad_norm": 0.31283918023109436, "learning_rate": 1.9038150289017342e-05, "loss": 0.3175273132324219, "step": 117400 }, { "epoch": 45.24084778420038, "eval_loss": 0.37910696864128113, "eval_runtime": 12.5342, "eval_samples_per_second": 1516.176, "eval_steps_per_second": 31.594, "step": 117400 }, { "epoch": 45.26011560693642, "grad_norm": 0.290582537651062, "learning_rate": 1.8961078998073217e-05, "loss": 0.31903900146484376, "step": 117450 }, { "epoch": 45.26011560693642, "eval_loss": 0.37733322381973267, "eval_runtime": 12.3667, "eval_samples_per_second": 1536.704, "eval_steps_per_second": 32.021, "step": 117450 }, { "epoch": 45.27938342967245, "grad_norm": 0.3094509541988373, "learning_rate": 1.8884007707129095e-05, "loss": 0.31782888412475585, "step": 117500 }, { "epoch": 45.27938342967245, "eval_loss": 0.3845844864845276, "eval_runtime": 12.4066, "eval_samples_per_second": 1531.768, "eval_steps_per_second": 31.919, "step": 117500 }, { "epoch": 45.298651252408476, "grad_norm": 0.2962471842765808, "learning_rate": 1.8806936416184973e-05, "loss": 0.31795034408569334, "step": 117550 }, { "epoch": 45.298651252408476, "eval_loss": 0.3759951591491699, "eval_runtime": 12.3296, "eval_samples_per_second": 1541.332, "eval_steps_per_second": 32.118, "step": 117550 }, { "epoch": 45.31791907514451, "grad_norm": 0.3339860737323761, "learning_rate": 1.8729865125240848e-05, "loss": 0.3194940185546875, "step": 117600 }, { "epoch": 45.31791907514451, "eval_loss": 0.37709230184555054, "eval_runtime": 12.3064, "eval_samples_per_second": 1544.239, "eval_steps_per_second": 32.178, "step": 117600 }, { "epoch": 45.33718689788054, "grad_norm": 0.3004661500453949, "learning_rate": 1.8652793834296726e-05, "loss": 0.31389249801635744, "step": 117650 }, { "epoch": 45.33718689788054, "eval_loss": 0.3822256922721863, "eval_runtime": 12.4055, "eval_samples_per_second": 1531.9, "eval_steps_per_second": 31.921, "step": 117650 }, { "epoch": 45.35645472061657, "grad_norm": 0.3330288231372833, "learning_rate": 1.8575722543352604e-05, "loss": 0.31983413696289065, "step": 117700 }, { "epoch": 45.35645472061657, "eval_loss": 0.37873631715774536, "eval_runtime": 12.4745, "eval_samples_per_second": 1523.432, "eval_steps_per_second": 31.745, "step": 117700 }, { "epoch": 45.3757225433526, "grad_norm": 0.34123167395591736, "learning_rate": 1.849865125240848e-05, "loss": 0.31591054916381833, "step": 117750 }, { "epoch": 45.3757225433526, "eval_loss": 0.37288522720336914, "eval_runtime": 12.5406, "eval_samples_per_second": 1515.398, "eval_steps_per_second": 31.577, "step": 117750 }, { "epoch": 45.394990366088635, "grad_norm": 0.30284854769706726, "learning_rate": 1.8421579961464357e-05, "loss": 0.3174330520629883, "step": 117800 }, { "epoch": 45.394990366088635, "eval_loss": 0.37764304876327515, "eval_runtime": 12.308, "eval_samples_per_second": 1544.037, "eval_steps_per_second": 32.174, "step": 117800 }, { "epoch": 45.41425818882466, "grad_norm": 0.34748566150665283, "learning_rate": 1.834450867052023e-05, "loss": 0.315201530456543, "step": 117850 }, { "epoch": 45.41425818882466, "eval_loss": 0.3821808397769928, "eval_runtime": 12.2943, "eval_samples_per_second": 1545.752, "eval_steps_per_second": 32.21, "step": 117850 }, { "epoch": 45.433526011560694, "grad_norm": 0.3080303370952606, "learning_rate": 1.826743737957611e-05, "loss": 0.3116407775878906, "step": 117900 }, { "epoch": 45.433526011560694, "eval_loss": 0.37341150641441345, "eval_runtime": 12.4209, "eval_samples_per_second": 1530.006, "eval_steps_per_second": 31.882, "step": 117900 }, { "epoch": 45.45279383429673, "grad_norm": 0.32689645886421204, "learning_rate": 1.8190366088631984e-05, "loss": 0.3159679985046387, "step": 117950 }, { "epoch": 45.45279383429673, "eval_loss": 0.3727525770664215, "eval_runtime": 12.3298, "eval_samples_per_second": 1541.304, "eval_steps_per_second": 32.117, "step": 117950 }, { "epoch": 45.472061657032754, "grad_norm": 0.33535563945770264, "learning_rate": 1.8113294797687862e-05, "loss": 0.3163621520996094, "step": 118000 }, { "epoch": 45.472061657032754, "eval_loss": 0.37364575266838074, "eval_runtime": 12.388, "eval_samples_per_second": 1534.067, "eval_steps_per_second": 31.966, "step": 118000 }, { "epoch": 45.49132947976879, "grad_norm": 0.3078160881996155, "learning_rate": 1.8036223506743737e-05, "loss": 0.31522689819335936, "step": 118050 }, { "epoch": 45.49132947976879, "eval_loss": 0.37165525555610657, "eval_runtime": 12.3559, "eval_samples_per_second": 1538.051, "eval_steps_per_second": 32.049, "step": 118050 }, { "epoch": 45.51059730250482, "grad_norm": 0.30527159571647644, "learning_rate": 1.7959152215799615e-05, "loss": 0.3144795799255371, "step": 118100 }, { "epoch": 45.51059730250482, "eval_loss": 0.3778696358203888, "eval_runtime": 12.3012, "eval_samples_per_second": 1544.888, "eval_steps_per_second": 32.192, "step": 118100 }, { "epoch": 45.529865125240846, "grad_norm": 0.4187621474266052, "learning_rate": 1.7882080924855493e-05, "loss": 0.315799446105957, "step": 118150 }, { "epoch": 45.529865125240846, "eval_loss": 0.3671876788139343, "eval_runtime": 12.1717, "eval_samples_per_second": 1561.332, "eval_steps_per_second": 32.535, "step": 118150 }, { "epoch": 45.54913294797688, "grad_norm": 0.375291109085083, "learning_rate": 1.7805009633911368e-05, "loss": 0.3132997703552246, "step": 118200 }, { "epoch": 45.54913294797688, "eval_loss": 0.37737685441970825, "eval_runtime": 12.5997, "eval_samples_per_second": 1508.289, "eval_steps_per_second": 31.429, "step": 118200 }, { "epoch": 45.56840077071291, "grad_norm": 0.32900384068489075, "learning_rate": 1.7727938342967246e-05, "loss": 0.3223805618286133, "step": 118250 }, { "epoch": 45.56840077071291, "eval_loss": 0.37703344225883484, "eval_runtime": 12.6631, "eval_samples_per_second": 1500.737, "eval_steps_per_second": 31.272, "step": 118250 }, { "epoch": 45.58766859344894, "grad_norm": 0.32847416400909424, "learning_rate": 1.7650867052023124e-05, "loss": 0.31249263763427737, "step": 118300 }, { "epoch": 45.58766859344894, "eval_loss": 0.37564605474472046, "eval_runtime": 12.5441, "eval_samples_per_second": 1514.976, "eval_steps_per_second": 31.569, "step": 118300 }, { "epoch": 45.60693641618497, "grad_norm": 0.33615854382514954, "learning_rate": 1.7573795761079e-05, "loss": 0.3173861503601074, "step": 118350 }, { "epoch": 45.60693641618497, "eval_loss": 0.3782126307487488, "eval_runtime": 12.4816, "eval_samples_per_second": 1522.558, "eval_steps_per_second": 31.727, "step": 118350 }, { "epoch": 45.626204238921005, "grad_norm": 0.3105054795742035, "learning_rate": 1.7496724470134877e-05, "loss": 0.31422332763671873, "step": 118400 }, { "epoch": 45.626204238921005, "eval_loss": 0.3722893297672272, "eval_runtime": 12.5372, "eval_samples_per_second": 1515.81, "eval_steps_per_second": 31.586, "step": 118400 }, { "epoch": 45.64547206165703, "grad_norm": 0.37331879138946533, "learning_rate": 1.7419653179190755e-05, "loss": 0.31239253997802735, "step": 118450 }, { "epoch": 45.64547206165703, "eval_loss": 0.3749631941318512, "eval_runtime": 12.3863, "eval_samples_per_second": 1534.272, "eval_steps_per_second": 31.971, "step": 118450 }, { "epoch": 45.664739884393065, "grad_norm": 0.3535233438014984, "learning_rate": 1.734258188824663e-05, "loss": 0.3163831329345703, "step": 118500 }, { "epoch": 45.664739884393065, "eval_loss": 0.3767615258693695, "eval_runtime": 12.1599, "eval_samples_per_second": 1562.845, "eval_steps_per_second": 32.566, "step": 118500 }, { "epoch": 45.6840077071291, "grad_norm": 0.3013818860054016, "learning_rate": 1.7265510597302505e-05, "loss": 0.30634361267089844, "step": 118550 }, { "epoch": 45.6840077071291, "eval_loss": 0.3807436525821686, "eval_runtime": 12.5562, "eval_samples_per_second": 1513.517, "eval_steps_per_second": 31.538, "step": 118550 }, { "epoch": 45.703275529865124, "grad_norm": 0.37790995836257935, "learning_rate": 1.718843930635838e-05, "loss": 0.31531949996948244, "step": 118600 }, { "epoch": 45.703275529865124, "eval_loss": 0.3815579414367676, "eval_runtime": 12.4536, "eval_samples_per_second": 1525.988, "eval_steps_per_second": 31.798, "step": 118600 }, { "epoch": 45.72254335260116, "grad_norm": 0.3243614137172699, "learning_rate": 1.7111368015414257e-05, "loss": 0.31760574340820313, "step": 118650 }, { "epoch": 45.72254335260116, "eval_loss": 0.3905768394470215, "eval_runtime": 12.4582, "eval_samples_per_second": 1525.417, "eval_steps_per_second": 31.786, "step": 118650 }, { "epoch": 45.74181117533719, "grad_norm": 0.3511382043361664, "learning_rate": 1.7034296724470136e-05, "loss": 0.31949073791503907, "step": 118700 }, { "epoch": 45.74181117533719, "eval_loss": 0.3775988221168518, "eval_runtime": 12.4599, "eval_samples_per_second": 1525.211, "eval_steps_per_second": 31.782, "step": 118700 }, { "epoch": 45.761078998073216, "grad_norm": 0.3030148446559906, "learning_rate": 1.695722543352601e-05, "loss": 0.31495162963867185, "step": 118750 }, { "epoch": 45.761078998073216, "eval_loss": 0.3854365944862366, "eval_runtime": 12.4466, "eval_samples_per_second": 1526.848, "eval_steps_per_second": 31.816, "step": 118750 }, { "epoch": 45.78034682080925, "grad_norm": 0.3138434588909149, "learning_rate": 1.688015414258189e-05, "loss": 0.31729217529296877, "step": 118800 }, { "epoch": 45.78034682080925, "eval_loss": 0.3793911039829254, "eval_runtime": 12.4339, "eval_samples_per_second": 1528.404, "eval_steps_per_second": 31.848, "step": 118800 }, { "epoch": 45.79961464354528, "grad_norm": 0.3236357867717743, "learning_rate": 1.6803082851637767e-05, "loss": 0.31218509674072265, "step": 118850 }, { "epoch": 45.79961464354528, "eval_loss": 0.3896384537220001, "eval_runtime": 12.1683, "eval_samples_per_second": 1561.767, "eval_steps_per_second": 32.544, "step": 118850 }, { "epoch": 45.81888246628131, "grad_norm": 0.33250656723976135, "learning_rate": 1.672601156069364e-05, "loss": 0.314116325378418, "step": 118900 }, { "epoch": 45.81888246628131, "eval_loss": 0.37075406312942505, "eval_runtime": 12.3582, "eval_samples_per_second": 1537.768, "eval_steps_per_second": 32.044, "step": 118900 }, { "epoch": 45.83815028901734, "grad_norm": 0.3577900230884552, "learning_rate": 1.664894026974952e-05, "loss": 0.3181720542907715, "step": 118950 }, { "epoch": 45.83815028901734, "eval_loss": 0.3710523247718811, "eval_runtime": 12.3323, "eval_samples_per_second": 1540.995, "eval_steps_per_second": 32.111, "step": 118950 }, { "epoch": 45.857418111753375, "grad_norm": 0.2803007662296295, "learning_rate": 1.6571868978805397e-05, "loss": 0.3159703826904297, "step": 119000 }, { "epoch": 45.857418111753375, "eval_loss": 0.37411707639694214, "eval_runtime": 12.3025, "eval_samples_per_second": 1544.73, "eval_steps_per_second": 32.189, "step": 119000 }, { "epoch": 45.8766859344894, "grad_norm": 0.3209672272205353, "learning_rate": 1.6494797687861272e-05, "loss": 0.31448806762695314, "step": 119050 }, { "epoch": 45.8766859344894, "eval_loss": 0.37459367513656616, "eval_runtime": 12.3385, "eval_samples_per_second": 1540.216, "eval_steps_per_second": 32.095, "step": 119050 }, { "epoch": 45.895953757225435, "grad_norm": 0.3054676949977875, "learning_rate": 1.641772639691715e-05, "loss": 0.31506139755249024, "step": 119100 }, { "epoch": 45.895953757225435, "eval_loss": 0.3839593231678009, "eval_runtime": 12.5097, "eval_samples_per_second": 1519.14, "eval_steps_per_second": 31.655, "step": 119100 }, { "epoch": 45.91522157996146, "grad_norm": 0.2985895276069641, "learning_rate": 1.6340655105973025e-05, "loss": 0.3125262069702148, "step": 119150 }, { "epoch": 45.91522157996146, "eval_loss": 0.3747832775115967, "eval_runtime": 12.457, "eval_samples_per_second": 1525.564, "eval_steps_per_second": 31.789, "step": 119150 }, { "epoch": 45.934489402697494, "grad_norm": 0.3306261897087097, "learning_rate": 1.6263583815028903e-05, "loss": 0.3119842529296875, "step": 119200 }, { "epoch": 45.934489402697494, "eval_loss": 0.3748623728752136, "eval_runtime": 12.1334, "eval_samples_per_second": 1566.255, "eval_steps_per_second": 32.637, "step": 119200 }, { "epoch": 45.95375722543353, "grad_norm": 0.34142059087753296, "learning_rate": 1.6186512524084778e-05, "loss": 0.31405057907104494, "step": 119250 }, { "epoch": 45.95375722543353, "eval_loss": 0.3695586025714874, "eval_runtime": 12.4315, "eval_samples_per_second": 1528.696, "eval_steps_per_second": 31.855, "step": 119250 }, { "epoch": 45.97302504816955, "grad_norm": 0.3256899416446686, "learning_rate": 1.6109441233140656e-05, "loss": 0.31724159240722655, "step": 119300 }, { "epoch": 45.97302504816955, "eval_loss": 0.36734992265701294, "eval_runtime": 12.4565, "eval_samples_per_second": 1525.629, "eval_steps_per_second": 31.791, "step": 119300 }, { "epoch": 45.99229287090559, "grad_norm": 0.3388533592224121, "learning_rate": 1.603236994219653e-05, "loss": 0.30671987533569334, "step": 119350 }, { "epoch": 45.99229287090559, "eval_loss": 0.38761237263679504, "eval_runtime": 12.4727, "eval_samples_per_second": 1523.648, "eval_steps_per_second": 31.749, "step": 119350 }, { "epoch": 46.01156069364162, "grad_norm": 0.3275613486766815, "learning_rate": 1.595529865125241e-05, "loss": 0.31522891998291014, "step": 119400 }, { "epoch": 46.01156069364162, "eval_loss": 0.37896597385406494, "eval_runtime": 12.4717, "eval_samples_per_second": 1523.768, "eval_steps_per_second": 31.752, "step": 119400 }, { "epoch": 46.030828516377646, "grad_norm": 0.3095250725746155, "learning_rate": 1.5878227360308287e-05, "loss": 0.3139674758911133, "step": 119450 }, { "epoch": 46.030828516377646, "eval_loss": 0.37464839220046997, "eval_runtime": 12.4504, "eval_samples_per_second": 1526.383, "eval_steps_per_second": 31.806, "step": 119450 }, { "epoch": 46.05009633911368, "grad_norm": 0.2978660464286804, "learning_rate": 1.5801156069364162e-05, "loss": 0.32063056945800783, "step": 119500 }, { "epoch": 46.05009633911368, "eval_loss": 0.37854284048080444, "eval_runtime": 12.7367, "eval_samples_per_second": 1492.067, "eval_steps_per_second": 31.091, "step": 119500 }, { "epoch": 46.06936416184971, "grad_norm": 0.36088597774505615, "learning_rate": 1.572408477842004e-05, "loss": 0.318469295501709, "step": 119550 }, { "epoch": 46.06936416184971, "eval_loss": 0.37784332036972046, "eval_runtime": 12.1256, "eval_samples_per_second": 1567.263, "eval_steps_per_second": 32.658, "step": 119550 }, { "epoch": 46.08863198458574, "grad_norm": 0.3296859562397003, "learning_rate": 1.5647013487475918e-05, "loss": 0.31712955474853516, "step": 119600 }, { "epoch": 46.08863198458574, "eval_loss": 0.3754724860191345, "eval_runtime": 12.4457, "eval_samples_per_second": 1526.958, "eval_steps_per_second": 31.818, "step": 119600 }, { "epoch": 46.10789980732177, "grad_norm": 0.3490089178085327, "learning_rate": 1.5569942196531793e-05, "loss": 0.313203239440918, "step": 119650 }, { "epoch": 46.10789980732177, "eval_loss": 0.3770994246006012, "eval_runtime": 12.4639, "eval_samples_per_second": 1524.721, "eval_steps_per_second": 31.772, "step": 119650 }, { "epoch": 46.127167630057805, "grad_norm": 0.3349704444408417, "learning_rate": 1.549287090558767e-05, "loss": 0.3185121536254883, "step": 119700 }, { "epoch": 46.127167630057805, "eval_loss": 0.37520313262939453, "eval_runtime": 12.4678, "eval_samples_per_second": 1524.242, "eval_steps_per_second": 31.762, "step": 119700 }, { "epoch": 46.14643545279383, "grad_norm": 0.3181188106536865, "learning_rate": 1.541579961464355e-05, "loss": 0.31656131744384763, "step": 119750 }, { "epoch": 46.14643545279383, "eval_loss": 0.3778001666069031, "eval_runtime": 12.35, "eval_samples_per_second": 1538.785, "eval_steps_per_second": 32.065, "step": 119750 }, { "epoch": 46.165703275529864, "grad_norm": 0.3703780770301819, "learning_rate": 1.5338728323699424e-05, "loss": 0.3175255584716797, "step": 119800 }, { "epoch": 46.165703275529864, "eval_loss": 0.3742023706436157, "eval_runtime": 12.4645, "eval_samples_per_second": 1524.655, "eval_steps_per_second": 31.77, "step": 119800 }, { "epoch": 46.1849710982659, "grad_norm": 0.3832343518733978, "learning_rate": 1.52616570327553e-05, "loss": 0.317154598236084, "step": 119850 }, { "epoch": 46.1849710982659, "eval_loss": 0.3720446825027466, "eval_runtime": 12.3072, "eval_samples_per_second": 1544.134, "eval_steps_per_second": 32.176, "step": 119850 }, { "epoch": 46.204238921001924, "grad_norm": 0.3253882825374603, "learning_rate": 1.5184585741811175e-05, "loss": 0.3104520225524902, "step": 119900 }, { "epoch": 46.204238921001924, "eval_loss": 0.38470402359962463, "eval_runtime": 12.1648, "eval_samples_per_second": 1562.218, "eval_steps_per_second": 32.553, "step": 119900 }, { "epoch": 46.22350674373796, "grad_norm": 0.36345887184143066, "learning_rate": 1.5107514450867053e-05, "loss": 0.3150153923034668, "step": 119950 }, { "epoch": 46.22350674373796, "eval_loss": 0.37373486161231995, "eval_runtime": 12.5405, "eval_samples_per_second": 1515.412, "eval_steps_per_second": 31.578, "step": 119950 }, { "epoch": 46.24277456647399, "grad_norm": 0.310004860162735, "learning_rate": 1.503044315992293e-05, "loss": 0.3139848899841309, "step": 120000 }, { "epoch": 46.24277456647399, "eval_loss": 0.3762872517108917, "eval_runtime": 12.5164, "eval_samples_per_second": 1518.328, "eval_steps_per_second": 31.638, "step": 120000 }, { "epoch": 46.262042389210016, "grad_norm": 0.3325580954551697, "learning_rate": 1.4953371868978806e-05, "loss": 0.31378232955932617, "step": 120050 }, { "epoch": 46.262042389210016, "eval_loss": 0.38394948840141296, "eval_runtime": 12.5917, "eval_samples_per_second": 1509.253, "eval_steps_per_second": 31.449, "step": 120050 }, { "epoch": 46.28131021194605, "grad_norm": 0.335650771856308, "learning_rate": 1.4876300578034682e-05, "loss": 0.31865041732788085, "step": 120100 }, { "epoch": 46.28131021194605, "eval_loss": 0.3813934326171875, "eval_runtime": 12.5044, "eval_samples_per_second": 1519.791, "eval_steps_per_second": 31.669, "step": 120100 }, { "epoch": 46.30057803468208, "grad_norm": 0.3081348240375519, "learning_rate": 1.479922928709056e-05, "loss": 0.3125326156616211, "step": 120150 }, { "epoch": 46.30057803468208, "eval_loss": 0.38740256428718567, "eval_runtime": 12.541, "eval_samples_per_second": 1515.352, "eval_steps_per_second": 31.576, "step": 120150 }, { "epoch": 46.31984585741811, "grad_norm": 0.3427327573299408, "learning_rate": 1.4722157996146435e-05, "loss": 0.31161510467529296, "step": 120200 }, { "epoch": 46.31984585741811, "eval_loss": 0.37910690903663635, "eval_runtime": 12.6326, "eval_samples_per_second": 1504.356, "eval_steps_per_second": 31.347, "step": 120200 }, { "epoch": 46.33911368015414, "grad_norm": 0.310493528842926, "learning_rate": 1.4645086705202313e-05, "loss": 0.313527946472168, "step": 120250 }, { "epoch": 46.33911368015414, "eval_loss": 0.37870362401008606, "eval_runtime": 12.139, "eval_samples_per_second": 1565.527, "eval_steps_per_second": 32.622, "step": 120250 }, { "epoch": 46.358381502890175, "grad_norm": 0.32953211665153503, "learning_rate": 1.4568015414258191e-05, "loss": 0.3124885177612305, "step": 120300 }, { "epoch": 46.358381502890175, "eval_loss": 0.38447558879852295, "eval_runtime": 12.4645, "eval_samples_per_second": 1524.644, "eval_steps_per_second": 31.77, "step": 120300 }, { "epoch": 46.3776493256262, "grad_norm": 0.3159143626689911, "learning_rate": 1.4490944123314066e-05, "loss": 0.3152968215942383, "step": 120350 }, { "epoch": 46.3776493256262, "eval_loss": 0.3815658986568451, "eval_runtime": 12.5123, "eval_samples_per_second": 1518.83, "eval_steps_per_second": 31.649, "step": 120350 }, { "epoch": 46.396917148362235, "grad_norm": 0.31647688150405884, "learning_rate": 1.4413872832369942e-05, "loss": 0.31441253662109375, "step": 120400 }, { "epoch": 46.396917148362235, "eval_loss": 0.37881147861480713, "eval_runtime": 12.4616, "eval_samples_per_second": 1524.999, "eval_steps_per_second": 31.777, "step": 120400 }, { "epoch": 46.41618497109827, "grad_norm": 0.31065747141838074, "learning_rate": 1.433680154142582e-05, "loss": 0.31542369842529294, "step": 120450 }, { "epoch": 46.41618497109827, "eval_loss": 0.3721565008163452, "eval_runtime": 12.4604, "eval_samples_per_second": 1525.154, "eval_steps_per_second": 31.781, "step": 120450 }, { "epoch": 46.435452793834294, "grad_norm": 0.3876585364341736, "learning_rate": 1.4259730250481695e-05, "loss": 0.3196664047241211, "step": 120500 }, { "epoch": 46.435452793834294, "eval_loss": 0.3828496038913727, "eval_runtime": 12.3028, "eval_samples_per_second": 1544.684, "eval_steps_per_second": 32.188, "step": 120500 }, { "epoch": 46.45472061657033, "grad_norm": 0.3450692594051361, "learning_rate": 1.4182658959537573e-05, "loss": 0.30886001586914064, "step": 120550 }, { "epoch": 46.45472061657033, "eval_loss": 0.380370169878006, "eval_runtime": 12.4236, "eval_samples_per_second": 1529.665, "eval_steps_per_second": 31.875, "step": 120550 }, { "epoch": 46.47398843930636, "grad_norm": 0.320608526468277, "learning_rate": 1.4105587668593451e-05, "loss": 0.31883413314819337, "step": 120600 }, { "epoch": 46.47398843930636, "eval_loss": 0.3784538507461548, "eval_runtime": 12.1669, "eval_samples_per_second": 1561.941, "eval_steps_per_second": 32.547, "step": 120600 }, { "epoch": 46.49325626204239, "grad_norm": 0.3336312174797058, "learning_rate": 1.4028516377649326e-05, "loss": 0.3142364883422852, "step": 120650 }, { "epoch": 46.49325626204239, "eval_loss": 0.3761374354362488, "eval_runtime": 12.3661, "eval_samples_per_second": 1536.779, "eval_steps_per_second": 32.023, "step": 120650 }, { "epoch": 46.51252408477842, "grad_norm": 0.32600539922714233, "learning_rate": 1.3951445086705203e-05, "loss": 0.31660640716552735, "step": 120700 }, { "epoch": 46.51252408477842, "eval_loss": 0.37305355072021484, "eval_runtime": 12.3055, "eval_samples_per_second": 1544.353, "eval_steps_per_second": 32.181, "step": 120700 }, { "epoch": 46.53179190751445, "grad_norm": 0.28718996047973633, "learning_rate": 1.387437379576108e-05, "loss": 0.31169178009033205, "step": 120750 }, { "epoch": 46.53179190751445, "eval_loss": 0.37510454654693604, "eval_runtime": 12.3491, "eval_samples_per_second": 1538.899, "eval_steps_per_second": 32.067, "step": 120750 }, { "epoch": 46.55105973025048, "grad_norm": 0.32287558913230896, "learning_rate": 1.3797302504816955e-05, "loss": 0.3135261154174805, "step": 120800 }, { "epoch": 46.55105973025048, "eval_loss": 0.38073721528053284, "eval_runtime": 12.3101, "eval_samples_per_second": 1543.768, "eval_steps_per_second": 32.169, "step": 120800 }, { "epoch": 46.57032755298651, "grad_norm": 0.31558486819267273, "learning_rate": 1.3720231213872834e-05, "loss": 0.31745079040527346, "step": 120850 }, { "epoch": 46.57032755298651, "eval_loss": 0.3723495900630951, "eval_runtime": 12.3519, "eval_samples_per_second": 1538.547, "eval_steps_per_second": 32.06, "step": 120850 }, { "epoch": 46.589595375722546, "grad_norm": 0.3127133250236511, "learning_rate": 1.3643159922928712e-05, "loss": 0.3173346519470215, "step": 120900 }, { "epoch": 46.589595375722546, "eval_loss": 0.3851313889026642, "eval_runtime": 12.488, "eval_samples_per_second": 1521.785, "eval_steps_per_second": 31.711, "step": 120900 }, { "epoch": 46.60886319845857, "grad_norm": 0.3095417320728302, "learning_rate": 1.3566088631984586e-05, "loss": 0.314422607421875, "step": 120950 }, { "epoch": 46.60886319845857, "eval_loss": 0.3776075839996338, "eval_runtime": 12.1862, "eval_samples_per_second": 1559.468, "eval_steps_per_second": 32.496, "step": 120950 }, { "epoch": 46.628131021194605, "grad_norm": 0.28882572054862976, "learning_rate": 1.3489017341040463e-05, "loss": 0.3160143089294434, "step": 121000 }, { "epoch": 46.628131021194605, "eval_loss": 0.3762606978416443, "eval_runtime": 12.3457, "eval_samples_per_second": 1539.326, "eval_steps_per_second": 32.076, "step": 121000 }, { "epoch": 46.64739884393064, "grad_norm": 0.29840222001075745, "learning_rate": 1.341194605009634e-05, "loss": 0.3145107078552246, "step": 121050 }, { "epoch": 46.64739884393064, "eval_loss": 0.37033870816230774, "eval_runtime": 12.3395, "eval_samples_per_second": 1540.092, "eval_steps_per_second": 32.092, "step": 121050 }, { "epoch": 46.666666666666664, "grad_norm": 0.2936212718486786, "learning_rate": 1.3334874759152216e-05, "loss": 0.3145816040039062, "step": 121100 }, { "epoch": 46.666666666666664, "eval_loss": 0.37245556712150574, "eval_runtime": 12.3907, "eval_samples_per_second": 1533.725, "eval_steps_per_second": 31.959, "step": 121100 }, { "epoch": 46.6859344894027, "grad_norm": 0.38480520248413086, "learning_rate": 1.3257803468208094e-05, "loss": 0.31214366912841796, "step": 121150 }, { "epoch": 46.6859344894027, "eval_loss": 0.3778720796108246, "eval_runtime": 12.3198, "eval_samples_per_second": 1542.554, "eval_steps_per_second": 32.143, "step": 121150 }, { "epoch": 46.70520231213873, "grad_norm": 0.3268295228481293, "learning_rate": 1.3180732177263969e-05, "loss": 0.3179030418395996, "step": 121200 }, { "epoch": 46.70520231213873, "eval_loss": 0.3653090298175812, "eval_runtime": 12.3441, "eval_samples_per_second": 1539.515, "eval_steps_per_second": 32.08, "step": 121200 }, { "epoch": 46.72447013487476, "grad_norm": 0.32132795453071594, "learning_rate": 1.3103660886319847e-05, "loss": 0.31894248962402344, "step": 121250 }, { "epoch": 46.72447013487476, "eval_loss": 0.37288036942481995, "eval_runtime": 12.5389, "eval_samples_per_second": 1515.607, "eval_steps_per_second": 31.582, "step": 121250 }, { "epoch": 46.74373795761079, "grad_norm": 0.3105332851409912, "learning_rate": 1.3026589595375723e-05, "loss": 0.31581409454345705, "step": 121300 }, { "epoch": 46.74373795761079, "eval_loss": 0.37440770864486694, "eval_runtime": 12.1531, "eval_samples_per_second": 1563.713, "eval_steps_per_second": 32.584, "step": 121300 }, { "epoch": 46.76300578034682, "grad_norm": 0.35166361927986145, "learning_rate": 1.29495183044316e-05, "loss": 0.31351146697998045, "step": 121350 }, { "epoch": 46.76300578034682, "eval_loss": 0.3752146065235138, "eval_runtime": 12.493, "eval_samples_per_second": 1521.173, "eval_steps_per_second": 31.698, "step": 121350 }, { "epoch": 46.78227360308285, "grad_norm": 0.32759183645248413, "learning_rate": 1.2872447013487476e-05, "loss": 0.31524152755737306, "step": 121400 }, { "epoch": 46.78227360308285, "eval_loss": 0.36877521872520447, "eval_runtime": 12.5417, "eval_samples_per_second": 1515.261, "eval_steps_per_second": 31.575, "step": 121400 }, { "epoch": 46.80154142581888, "grad_norm": 0.3450312316417694, "learning_rate": 1.2795375722543354e-05, "loss": 0.3114239501953125, "step": 121450 }, { "epoch": 46.80154142581888, "eval_loss": 0.3747256100177765, "eval_runtime": 12.5302, "eval_samples_per_second": 1516.659, "eval_steps_per_second": 31.604, "step": 121450 }, { "epoch": 46.820809248554916, "grad_norm": 0.33829131722450256, "learning_rate": 1.2718304431599229e-05, "loss": 0.3108376121520996, "step": 121500 }, { "epoch": 46.820809248554916, "eval_loss": 0.3777056336402893, "eval_runtime": 12.5375, "eval_samples_per_second": 1515.777, "eval_steps_per_second": 31.585, "step": 121500 }, { "epoch": 46.84007707129094, "grad_norm": 0.32395511865615845, "learning_rate": 1.2641233140655107e-05, "loss": 0.31169723510742187, "step": 121550 }, { "epoch": 46.84007707129094, "eval_loss": 0.3801754117012024, "eval_runtime": 12.4699, "eval_samples_per_second": 1523.993, "eval_steps_per_second": 31.757, "step": 121550 }, { "epoch": 46.859344894026975, "grad_norm": 0.32372596859931946, "learning_rate": 1.2564161849710985e-05, "loss": 0.3143695640563965, "step": 121600 }, { "epoch": 46.859344894026975, "eval_loss": 0.37210366129875183, "eval_runtime": 12.4787, "eval_samples_per_second": 1522.917, "eval_steps_per_second": 31.734, "step": 121600 }, { "epoch": 46.87861271676301, "grad_norm": 0.34715986251831055, "learning_rate": 1.248709055876686e-05, "loss": 0.31454078674316405, "step": 121650 }, { "epoch": 46.87861271676301, "eval_loss": 0.3777550756931305, "eval_runtime": 12.1959, "eval_samples_per_second": 1558.228, "eval_steps_per_second": 32.47, "step": 121650 }, { "epoch": 46.897880539499035, "grad_norm": 0.34723198413848877, "learning_rate": 1.2410019267822736e-05, "loss": 0.31409460067749023, "step": 121700 }, { "epoch": 46.897880539499035, "eval_loss": 0.37000223994255066, "eval_runtime": 12.3341, "eval_samples_per_second": 1540.773, "eval_steps_per_second": 32.106, "step": 121700 }, { "epoch": 46.91714836223507, "grad_norm": 0.30443429946899414, "learning_rate": 1.2332947976878613e-05, "loss": 0.31122020721435545, "step": 121750 }, { "epoch": 46.91714836223507, "eval_loss": 0.37270689010620117, "eval_runtime": 12.4533, "eval_samples_per_second": 1526.018, "eval_steps_per_second": 31.799, "step": 121750 }, { "epoch": 46.9364161849711, "grad_norm": 0.3150986135005951, "learning_rate": 1.225587668593449e-05, "loss": 0.312861328125, "step": 121800 }, { "epoch": 46.9364161849711, "eval_loss": 0.36859381198883057, "eval_runtime": 12.4976, "eval_samples_per_second": 1520.606, "eval_steps_per_second": 31.686, "step": 121800 }, { "epoch": 46.95568400770713, "grad_norm": 0.30906933546066284, "learning_rate": 1.2178805394990367e-05, "loss": 0.3077062034606934, "step": 121850 }, { "epoch": 46.95568400770713, "eval_loss": 0.375499963760376, "eval_runtime": 12.4845, "eval_samples_per_second": 1522.214, "eval_steps_per_second": 31.719, "step": 121850 }, { "epoch": 46.97495183044316, "grad_norm": 0.29194894433021545, "learning_rate": 1.2101734104046243e-05, "loss": 0.3193522453308105, "step": 121900 }, { "epoch": 46.97495183044316, "eval_loss": 0.3723788559436798, "eval_runtime": 12.4844, "eval_samples_per_second": 1522.223, "eval_steps_per_second": 31.72, "step": 121900 }, { "epoch": 46.994219653179194, "grad_norm": 0.34389710426330566, "learning_rate": 1.202466281310212e-05, "loss": 0.31497310638427733, "step": 121950 }, { "epoch": 46.994219653179194, "eval_loss": 0.3760836124420166, "eval_runtime": 12.5024, "eval_samples_per_second": 1520.032, "eval_steps_per_second": 31.674, "step": 121950 }, { "epoch": 47.01348747591522, "grad_norm": 0.3204994201660156, "learning_rate": 1.1947591522157996e-05, "loss": 0.3132350158691406, "step": 122000 }, { "epoch": 47.01348747591522, "eval_loss": 0.37114498019218445, "eval_runtime": 12.4376, "eval_samples_per_second": 1527.945, "eval_steps_per_second": 31.839, "step": 122000 }, { "epoch": 47.03275529865125, "grad_norm": 0.2978506088256836, "learning_rate": 1.1870520231213873e-05, "loss": 0.32117912292480466, "step": 122050 }, { "epoch": 47.03275529865125, "eval_loss": 0.3666188716888428, "eval_runtime": 12.3647, "eval_samples_per_second": 1536.956, "eval_steps_per_second": 32.027, "step": 122050 }, { "epoch": 47.052023121387286, "grad_norm": 0.36784815788269043, "learning_rate": 1.179344894026975e-05, "loss": 0.31190460205078124, "step": 122100 }, { "epoch": 47.052023121387286, "eval_loss": 0.3754938840866089, "eval_runtime": 12.2996, "eval_samples_per_second": 1545.085, "eval_steps_per_second": 32.196, "step": 122100 }, { "epoch": 47.07129094412331, "grad_norm": 0.3382199704647064, "learning_rate": 1.1716377649325627e-05, "loss": 0.3156040954589844, "step": 122150 }, { "epoch": 47.07129094412331, "eval_loss": 0.3817451596260071, "eval_runtime": 12.2996, "eval_samples_per_second": 1545.096, "eval_steps_per_second": 32.196, "step": 122150 }, { "epoch": 47.090558766859345, "grad_norm": 0.31749191880226135, "learning_rate": 1.1639306358381504e-05, "loss": 0.3135897445678711, "step": 122200 }, { "epoch": 47.090558766859345, "eval_loss": 0.37815746665000916, "eval_runtime": 12.5816, "eval_samples_per_second": 1510.462, "eval_steps_per_second": 31.475, "step": 122200 }, { "epoch": 47.10982658959538, "grad_norm": 0.3640945255756378, "learning_rate": 1.156223506743738e-05, "loss": 0.3135330581665039, "step": 122250 }, { "epoch": 47.10982658959538, "eval_loss": 0.3821195662021637, "eval_runtime": 12.4622, "eval_samples_per_second": 1524.937, "eval_steps_per_second": 31.776, "step": 122250 }, { "epoch": 47.129094412331405, "grad_norm": 0.32237550616264343, "learning_rate": 1.1485163776493257e-05, "loss": 0.31968313217163086, "step": 122300 }, { "epoch": 47.129094412331405, "eval_loss": 0.3747214376926422, "eval_runtime": 12.376, "eval_samples_per_second": 1535.557, "eval_steps_per_second": 31.998, "step": 122300 }, { "epoch": 47.14836223506744, "grad_norm": 0.27705883979797363, "learning_rate": 1.1408092485549133e-05, "loss": 0.31914968490600587, "step": 122350 }, { "epoch": 47.14836223506744, "eval_loss": 0.37368497252464294, "eval_runtime": 12.2993, "eval_samples_per_second": 1545.13, "eval_steps_per_second": 32.197, "step": 122350 }, { "epoch": 47.16763005780347, "grad_norm": 0.3236149847507477, "learning_rate": 1.133102119460501e-05, "loss": 0.312783203125, "step": 122400 }, { "epoch": 47.16763005780347, "eval_loss": 0.3772985339164734, "eval_runtime": 12.571, "eval_samples_per_second": 1511.732, "eval_steps_per_second": 31.501, "step": 122400 }, { "epoch": 47.1868978805395, "grad_norm": 0.317035436630249, "learning_rate": 1.1253949903660888e-05, "loss": 0.31816802978515624, "step": 122450 }, { "epoch": 47.1868978805395, "eval_loss": 0.38027259707450867, "eval_runtime": 12.5126, "eval_samples_per_second": 1518.784, "eval_steps_per_second": 31.648, "step": 122450 }, { "epoch": 47.20616570327553, "grad_norm": 0.3020758628845215, "learning_rate": 1.1176878612716764e-05, "loss": 0.31503484725952147, "step": 122500 }, { "epoch": 47.20616570327553, "eval_loss": 0.3854586184024811, "eval_runtime": 12.511, "eval_samples_per_second": 1518.982, "eval_steps_per_second": 31.652, "step": 122500 }, { "epoch": 47.225433526011564, "grad_norm": 0.3499301075935364, "learning_rate": 1.109980732177264e-05, "loss": 0.3171534538269043, "step": 122550 }, { "epoch": 47.225433526011564, "eval_loss": 0.3777940571308136, "eval_runtime": 12.4939, "eval_samples_per_second": 1521.057, "eval_steps_per_second": 31.695, "step": 122550 }, { "epoch": 47.24470134874759, "grad_norm": 0.4409113824367523, "learning_rate": 1.1022736030828517e-05, "loss": 0.31164520263671874, "step": 122600 }, { "epoch": 47.24470134874759, "eval_loss": 0.3788793087005615, "eval_runtime": 12.9178, "eval_samples_per_second": 1471.145, "eval_steps_per_second": 30.655, "step": 122600 }, { "epoch": 47.26396917148362, "grad_norm": 0.32910701632499695, "learning_rate": 1.0945664739884393e-05, "loss": 0.321817512512207, "step": 122650 }, { "epoch": 47.26396917148362, "eval_loss": 0.3757881820201874, "eval_runtime": 12.9422, "eval_samples_per_second": 1468.377, "eval_steps_per_second": 30.598, "step": 122650 }, { "epoch": 47.283236994219656, "grad_norm": 0.3010399043560028, "learning_rate": 1.086859344894027e-05, "loss": 0.3147060775756836, "step": 122700 }, { "epoch": 47.283236994219656, "eval_loss": 0.37467846274375916, "eval_runtime": 12.5461, "eval_samples_per_second": 1514.729, "eval_steps_per_second": 31.563, "step": 122700 }, { "epoch": 47.30250481695568, "grad_norm": 0.31982433795928955, "learning_rate": 1.0791522157996146e-05, "loss": 0.31034088134765625, "step": 122750 }, { "epoch": 47.30250481695568, "eval_loss": 0.37488803267478943, "eval_runtime": 12.457, "eval_samples_per_second": 1525.572, "eval_steps_per_second": 31.789, "step": 122750 }, { "epoch": 47.321772639691716, "grad_norm": 0.3483738899230957, "learning_rate": 1.0714450867052024e-05, "loss": 0.3089602279663086, "step": 122800 }, { "epoch": 47.321772639691716, "eval_loss": 0.378061443567276, "eval_runtime": 12.506, "eval_samples_per_second": 1519.593, "eval_steps_per_second": 31.665, "step": 122800 }, { "epoch": 47.34104046242775, "grad_norm": 0.28804290294647217, "learning_rate": 1.06373795761079e-05, "loss": 0.3151264762878418, "step": 122850 }, { "epoch": 47.34104046242775, "eval_loss": 0.3834393322467804, "eval_runtime": 12.4063, "eval_samples_per_second": 1531.796, "eval_steps_per_second": 31.919, "step": 122850 }, { "epoch": 47.360308285163775, "grad_norm": 0.36090412735939026, "learning_rate": 1.0560308285163777e-05, "loss": 0.31368370056152345, "step": 122900 }, { "epoch": 47.360308285163775, "eval_loss": 0.37363961338996887, "eval_runtime": 12.3231, "eval_samples_per_second": 1542.143, "eval_steps_per_second": 32.135, "step": 122900 }, { "epoch": 47.37957610789981, "grad_norm": 0.3261222839355469, "learning_rate": 1.0483236994219653e-05, "loss": 0.31944808959960935, "step": 122950 }, { "epoch": 47.37957610789981, "eval_loss": 0.37645837664604187, "eval_runtime": 12.3426, "eval_samples_per_second": 1539.71, "eval_steps_per_second": 32.084, "step": 122950 }, { "epoch": 47.39884393063584, "grad_norm": 0.3177816867828369, "learning_rate": 1.040616570327553e-05, "loss": 0.3122319030761719, "step": 123000 }, { "epoch": 47.39884393063584, "eval_loss": 0.3712409734725952, "eval_runtime": 12.5013, "eval_samples_per_second": 1520.166, "eval_steps_per_second": 31.677, "step": 123000 }, { "epoch": 47.41811175337187, "grad_norm": 0.27910086512565613, "learning_rate": 1.0329094412331406e-05, "loss": 0.31095006942749026, "step": 123050 }, { "epoch": 47.41811175337187, "eval_loss": 0.37572693824768066, "eval_runtime": 12.673, "eval_samples_per_second": 1499.569, "eval_steps_per_second": 31.248, "step": 123050 }, { "epoch": 47.4373795761079, "grad_norm": 0.3854800760746002, "learning_rate": 1.0252023121387283e-05, "loss": 0.31259403228759763, "step": 123100 }, { "epoch": 47.4373795761079, "eval_loss": 0.3749905824661255, "eval_runtime": 12.4239, "eval_samples_per_second": 1529.627, "eval_steps_per_second": 31.874, "step": 123100 }, { "epoch": 47.456647398843934, "grad_norm": 0.31723377108573914, "learning_rate": 1.017495183044316e-05, "loss": 0.314243221282959, "step": 123150 }, { "epoch": 47.456647398843934, "eval_loss": 0.38505715131759644, "eval_runtime": 12.4525, "eval_samples_per_second": 1526.122, "eval_steps_per_second": 31.801, "step": 123150 }, { "epoch": 47.47591522157996, "grad_norm": 0.31791236996650696, "learning_rate": 1.0097880539499037e-05, "loss": 0.31486867904663085, "step": 123200 }, { "epoch": 47.47591522157996, "eval_loss": 0.3723740875720978, "eval_runtime": 12.3981, "eval_samples_per_second": 1532.811, "eval_steps_per_second": 31.94, "step": 123200 }, { "epoch": 47.49518304431599, "grad_norm": 0.3411237597465515, "learning_rate": 1.0020809248554914e-05, "loss": 0.3139814376831055, "step": 123250 }, { "epoch": 47.49518304431599, "eval_loss": 0.375340074300766, "eval_runtime": 12.3157, "eval_samples_per_second": 1543.071, "eval_steps_per_second": 32.154, "step": 123250 }, { "epoch": 47.51445086705202, "grad_norm": 0.32147324085235596, "learning_rate": 9.94373795761079e-06, "loss": 0.3131045150756836, "step": 123300 }, { "epoch": 47.51445086705202, "eval_loss": 0.3750040829181671, "eval_runtime": 12.3274, "eval_samples_per_second": 1541.611, "eval_steps_per_second": 32.124, "step": 123300 }, { "epoch": 47.53371868978805, "grad_norm": 0.33248093724250793, "learning_rate": 9.866666666666667e-06, "loss": 0.3160318565368652, "step": 123350 }, { "epoch": 47.53371868978805, "eval_loss": 0.371918261051178, "eval_runtime": 12.3316, "eval_samples_per_second": 1541.08, "eval_steps_per_second": 32.113, "step": 123350 }, { "epoch": 47.552986512524086, "grad_norm": 0.3050123453140259, "learning_rate": 9.789595375722543e-06, "loss": 0.3154665184020996, "step": 123400 }, { "epoch": 47.552986512524086, "eval_loss": 0.3776727616786957, "eval_runtime": 12.5049, "eval_samples_per_second": 1519.719, "eval_steps_per_second": 31.667, "step": 123400 }, { "epoch": 47.57225433526011, "grad_norm": 0.30728015303611755, "learning_rate": 9.712524084778421e-06, "loss": 0.31435230255126956, "step": 123450 }, { "epoch": 47.57225433526011, "eval_loss": 0.38228926062583923, "eval_runtime": 12.5663, "eval_samples_per_second": 1512.298, "eval_steps_per_second": 31.513, "step": 123450 }, { "epoch": 47.591522157996145, "grad_norm": 0.35533830523490906, "learning_rate": 9.635452793834297e-06, "loss": 0.31497184753417967, "step": 123500 }, { "epoch": 47.591522157996145, "eval_loss": 0.3677665591239929, "eval_runtime": 12.3795, "eval_samples_per_second": 1535.12, "eval_steps_per_second": 31.988, "step": 123500 }, { "epoch": 47.61078998073218, "grad_norm": 0.33300554752349854, "learning_rate": 9.558381502890174e-06, "loss": 0.3183267974853516, "step": 123550 }, { "epoch": 47.61078998073218, "eval_loss": 0.3769160509109497, "eval_runtime": 12.3663, "eval_samples_per_second": 1536.757, "eval_steps_per_second": 32.023, "step": 123550 }, { "epoch": 47.630057803468205, "grad_norm": 0.30762341618537903, "learning_rate": 9.481310211946052e-06, "loss": 0.3105363082885742, "step": 123600 }, { "epoch": 47.630057803468205, "eval_loss": 0.3688500225543976, "eval_runtime": 12.5648, "eval_samples_per_second": 1512.475, "eval_steps_per_second": 31.517, "step": 123600 }, { "epoch": 47.64932562620424, "grad_norm": 0.3103982210159302, "learning_rate": 9.404238921001927e-06, "loss": 0.31401092529296876, "step": 123650 }, { "epoch": 47.64932562620424, "eval_loss": 0.378212034702301, "eval_runtime": 12.4923, "eval_samples_per_second": 1521.26, "eval_steps_per_second": 31.7, "step": 123650 }, { "epoch": 47.66859344894027, "grad_norm": 0.3446447253227234, "learning_rate": 9.327167630057803e-06, "loss": 0.3124014663696289, "step": 123700 }, { "epoch": 47.66859344894027, "eval_loss": 0.38099050521850586, "eval_runtime": 12.2049, "eval_samples_per_second": 1557.075, "eval_steps_per_second": 32.446, "step": 123700 }, { "epoch": 47.6878612716763, "grad_norm": 0.31678974628448486, "learning_rate": 9.25009633911368e-06, "loss": 0.3147323226928711, "step": 123750 }, { "epoch": 47.6878612716763, "eval_loss": 0.3672657608985901, "eval_runtime": 12.5469, "eval_samples_per_second": 1514.639, "eval_steps_per_second": 31.562, "step": 123750 }, { "epoch": 47.70712909441233, "grad_norm": 0.3556235134601593, "learning_rate": 9.173025048169558e-06, "loss": 0.3105028533935547, "step": 123800 }, { "epoch": 47.70712909441233, "eval_loss": 0.37238574028015137, "eval_runtime": 12.3151, "eval_samples_per_second": 1543.152, "eval_steps_per_second": 32.156, "step": 123800 }, { "epoch": 47.726396917148364, "grad_norm": 0.3154390752315521, "learning_rate": 9.095953757225434e-06, "loss": 0.3173929595947266, "step": 123850 }, { "epoch": 47.726396917148364, "eval_loss": 0.37603068351745605, "eval_runtime": 12.3466, "eval_samples_per_second": 1539.213, "eval_steps_per_second": 32.074, "step": 123850 }, { "epoch": 47.74566473988439, "grad_norm": 0.3085898458957672, "learning_rate": 9.01888246628131e-06, "loss": 0.3136648941040039, "step": 123900 }, { "epoch": 47.74566473988439, "eval_loss": 0.3758414387702942, "eval_runtime": 12.4424, "eval_samples_per_second": 1527.359, "eval_steps_per_second": 31.827, "step": 123900 }, { "epoch": 47.76493256262042, "grad_norm": 0.3261592388153076, "learning_rate": 8.941811175337187e-06, "loss": 0.31411617279052734, "step": 123950 }, { "epoch": 47.76493256262042, "eval_loss": 0.3793683648109436, "eval_runtime": 12.5183, "eval_samples_per_second": 1518.097, "eval_steps_per_second": 31.634, "step": 123950 }, { "epoch": 47.784200385356456, "grad_norm": 0.2998732626438141, "learning_rate": 8.864739884393063e-06, "loss": 0.3115138626098633, "step": 124000 }, { "epoch": 47.784200385356456, "eval_loss": 0.37943366169929504, "eval_runtime": 12.5185, "eval_samples_per_second": 1518.071, "eval_steps_per_second": 31.633, "step": 124000 }, { "epoch": 47.80346820809248, "grad_norm": 0.36781153082847595, "learning_rate": 8.78766859344894e-06, "loss": 0.31497146606445314, "step": 124050 }, { "epoch": 47.80346820809248, "eval_loss": 0.378867506980896, "eval_runtime": 12.1877, "eval_samples_per_second": 1559.278, "eval_steps_per_second": 32.492, "step": 124050 }, { "epoch": 47.822736030828516, "grad_norm": 0.318486750125885, "learning_rate": 8.710597302504818e-06, "loss": 0.31960454940795896, "step": 124100 }, { "epoch": 47.822736030828516, "eval_loss": 0.37166455388069153, "eval_runtime": 12.332, "eval_samples_per_second": 1541.035, "eval_steps_per_second": 32.112, "step": 124100 }, { "epoch": 47.84200385356455, "grad_norm": 0.3715088665485382, "learning_rate": 8.633526011560694e-06, "loss": 0.3134461975097656, "step": 124150 }, { "epoch": 47.84200385356455, "eval_loss": 0.3776034414768219, "eval_runtime": 12.3474, "eval_samples_per_second": 1539.114, "eval_steps_per_second": 32.072, "step": 124150 }, { "epoch": 47.861271676300575, "grad_norm": 0.3011038899421692, "learning_rate": 8.55645472061657e-06, "loss": 0.30940603256225585, "step": 124200 }, { "epoch": 47.861271676300575, "eval_loss": 0.37839454412460327, "eval_runtime": 12.3228, "eval_samples_per_second": 1542.183, "eval_steps_per_second": 32.136, "step": 124200 }, { "epoch": 47.88053949903661, "grad_norm": 0.32909664511680603, "learning_rate": 8.479383429672449e-06, "loss": 0.3163459777832031, "step": 124250 }, { "epoch": 47.88053949903661, "eval_loss": 0.3752474784851074, "eval_runtime": 12.3606, "eval_samples_per_second": 1537.466, "eval_steps_per_second": 32.037, "step": 124250 }, { "epoch": 47.89980732177264, "grad_norm": 0.2993583679199219, "learning_rate": 8.402312138728324e-06, "loss": 0.31542232513427737, "step": 124300 }, { "epoch": 47.89980732177264, "eval_loss": 0.37116435170173645, "eval_runtime": 12.3302, "eval_samples_per_second": 1541.262, "eval_steps_per_second": 32.116, "step": 124300 }, { "epoch": 47.91907514450867, "grad_norm": 0.33805274963378906, "learning_rate": 8.3252408477842e-06, "loss": 0.30900829315185546, "step": 124350 }, { "epoch": 47.91907514450867, "eval_loss": 0.37227094173431396, "eval_runtime": 12.3223, "eval_samples_per_second": 1542.248, "eval_steps_per_second": 32.137, "step": 124350 }, { "epoch": 47.9383429672447, "grad_norm": 0.32848456501960754, "learning_rate": 8.248169556840076e-06, "loss": 0.31327919006347654, "step": 124400 }, { "epoch": 47.9383429672447, "eval_loss": 0.37457647919654846, "eval_runtime": 12.634, "eval_samples_per_second": 1504.2, "eval_steps_per_second": 31.344, "step": 124400 }, { "epoch": 47.957610789980734, "grad_norm": 0.3354359269142151, "learning_rate": 8.171098265895955e-06, "loss": 0.3124615478515625, "step": 124450 }, { "epoch": 47.957610789980734, "eval_loss": 0.3760894238948822, "eval_runtime": 12.3712, "eval_samples_per_second": 1536.143, "eval_steps_per_second": 32.01, "step": 124450 }, { "epoch": 47.97687861271676, "grad_norm": 0.3056959807872772, "learning_rate": 8.094026974951831e-06, "loss": 0.3148496055603027, "step": 124500 }, { "epoch": 47.97687861271676, "eval_loss": 0.37996163964271545, "eval_runtime": 12.3974, "eval_samples_per_second": 1532.897, "eval_steps_per_second": 31.942, "step": 124500 }, { "epoch": 47.99614643545279, "grad_norm": 0.3058818280696869, "learning_rate": 8.016955684007707e-06, "loss": 0.31361175537109376, "step": 124550 }, { "epoch": 47.99614643545279, "eval_loss": 0.37659260630607605, "eval_runtime": 12.3577, "eval_samples_per_second": 1537.828, "eval_steps_per_second": 32.045, "step": 124550 }, { "epoch": 48.01541425818883, "grad_norm": 0.30214881896972656, "learning_rate": 7.939884393063584e-06, "loss": 0.3142112731933594, "step": 124600 }, { "epoch": 48.01541425818883, "eval_loss": 0.3748052418231964, "eval_runtime": 12.3456, "eval_samples_per_second": 1539.329, "eval_steps_per_second": 32.076, "step": 124600 }, { "epoch": 48.03468208092485, "grad_norm": 0.3281204402446747, "learning_rate": 7.86281310211946e-06, "loss": 0.31600500106811524, "step": 124650 }, { "epoch": 48.03468208092485, "eval_loss": 0.3759361803531647, "eval_runtime": 12.531, "eval_samples_per_second": 1516.562, "eval_steps_per_second": 31.602, "step": 124650 }, { "epoch": 48.053949903660886, "grad_norm": 0.29310184717178345, "learning_rate": 7.785741811175337e-06, "loss": 0.31539318084716794, "step": 124700 }, { "epoch": 48.053949903660886, "eval_loss": 0.37337177991867065, "eval_runtime": 12.5834, "eval_samples_per_second": 1510.244, "eval_steps_per_second": 31.47, "step": 124700 }, { "epoch": 48.07321772639692, "grad_norm": 0.32594558596611023, "learning_rate": 7.708670520231215e-06, "loss": 0.31420583724975587, "step": 124750 }, { "epoch": 48.07321772639692, "eval_loss": 0.3766196668148041, "eval_runtime": 12.5049, "eval_samples_per_second": 1519.727, "eval_steps_per_second": 31.668, "step": 124750 }, { "epoch": 48.092485549132945, "grad_norm": 0.3111829161643982, "learning_rate": 7.631599229287091e-06, "loss": 0.3092048263549805, "step": 124800 }, { "epoch": 48.092485549132945, "eval_loss": 0.3717285990715027, "eval_runtime": 12.3271, "eval_samples_per_second": 1541.641, "eval_steps_per_second": 32.124, "step": 124800 }, { "epoch": 48.11175337186898, "grad_norm": 0.3463335335254669, "learning_rate": 7.554527938342968e-06, "loss": 0.31337240219116214, "step": 124850 }, { "epoch": 48.11175337186898, "eval_loss": 0.3713030517101288, "eval_runtime": 12.3875, "eval_samples_per_second": 1534.129, "eval_steps_per_second": 31.968, "step": 124850 }, { "epoch": 48.13102119460501, "grad_norm": 0.34456127882003784, "learning_rate": 7.477456647398843e-06, "loss": 0.314285945892334, "step": 124900 }, { "epoch": 48.13102119460501, "eval_loss": 0.37123605608940125, "eval_runtime": 12.3231, "eval_samples_per_second": 1542.139, "eval_steps_per_second": 32.135, "step": 124900 }, { "epoch": 48.15028901734104, "grad_norm": 0.38072866201400757, "learning_rate": 7.400385356454721e-06, "loss": 0.31664159774780276, "step": 124950 }, { "epoch": 48.15028901734104, "eval_loss": 0.37517061829566956, "eval_runtime": 12.3236, "eval_samples_per_second": 1542.086, "eval_steps_per_second": 32.134, "step": 124950 }, { "epoch": 48.16955684007707, "grad_norm": 0.3250613510608673, "learning_rate": 7.323314065510598e-06, "loss": 0.31553985595703127, "step": 125000 }, { "epoch": 48.16955684007707, "eval_loss": 0.37157493829727173, "eval_runtime": 12.3239, "eval_samples_per_second": 1542.044, "eval_steps_per_second": 32.133, "step": 125000 }, { "epoch": 48.188824662813104, "grad_norm": 0.3050674498081207, "learning_rate": 7.246242774566474e-06, "loss": 0.3105717086791992, "step": 125050 }, { "epoch": 48.188824662813104, "eval_loss": 0.3808838427066803, "eval_runtime": 12.3033, "eval_samples_per_second": 1544.624, "eval_steps_per_second": 32.186, "step": 125050 }, { "epoch": 48.20809248554913, "grad_norm": 0.3575219213962555, "learning_rate": 7.1691714836223514e-06, "loss": 0.31300298690795897, "step": 125100 }, { "epoch": 48.20809248554913, "eval_loss": 0.37876856327056885, "eval_runtime": 12.6707, "eval_samples_per_second": 1499.839, "eval_steps_per_second": 31.253, "step": 125100 }, { "epoch": 48.227360308285164, "grad_norm": 0.33538296818733215, "learning_rate": 7.092100192678228e-06, "loss": 0.3118548583984375, "step": 125150 }, { "epoch": 48.227360308285164, "eval_loss": 0.37486404180526733, "eval_runtime": 12.4846, "eval_samples_per_second": 1522.193, "eval_steps_per_second": 31.719, "step": 125150 }, { "epoch": 48.2466281310212, "grad_norm": 0.2990739345550537, "learning_rate": 7.015028901734104e-06, "loss": 0.313055477142334, "step": 125200 }, { "epoch": 48.2466281310212, "eval_loss": 0.36624443531036377, "eval_runtime": 12.5633, "eval_samples_per_second": 1512.655, "eval_steps_per_second": 31.52, "step": 125200 }, { "epoch": 48.26589595375722, "grad_norm": 0.32747626304626465, "learning_rate": 6.9379576107899815e-06, "loss": 0.31096950531005857, "step": 125250 }, { "epoch": 48.26589595375722, "eval_loss": 0.37779727578163147, "eval_runtime": 12.5136, "eval_samples_per_second": 1518.673, "eval_steps_per_second": 31.646, "step": 125250 }, { "epoch": 48.285163776493256, "grad_norm": 0.35663625597953796, "learning_rate": 6.860886319845858e-06, "loss": 0.3158100509643555, "step": 125300 }, { "epoch": 48.285163776493256, "eval_loss": 0.36870840191841125, "eval_runtime": 12.5138, "eval_samples_per_second": 1518.647, "eval_steps_per_second": 31.645, "step": 125300 }, { "epoch": 48.30443159922929, "grad_norm": 0.29555878043174744, "learning_rate": 6.783815028901734e-06, "loss": 0.3127567100524902, "step": 125350 }, { "epoch": 48.30443159922929, "eval_loss": 0.37664222717285156, "eval_runtime": 12.563, "eval_samples_per_second": 1512.697, "eval_steps_per_second": 31.521, "step": 125350 }, { "epoch": 48.323699421965316, "grad_norm": 0.31667178869247437, "learning_rate": 6.706743737957612e-06, "loss": 0.31104824066162107, "step": 125400 }, { "epoch": 48.323699421965316, "eval_loss": 0.37487319111824036, "eval_runtime": 12.1958, "eval_samples_per_second": 1558.245, "eval_steps_per_second": 32.47, "step": 125400 }, { "epoch": 48.34296724470135, "grad_norm": 0.3147968649864197, "learning_rate": 6.629672447013488e-06, "loss": 0.31386877059936524, "step": 125450 }, { "epoch": 48.34296724470135, "eval_loss": 0.3754046857357025, "eval_runtime": 12.3361, "eval_samples_per_second": 1540.517, "eval_steps_per_second": 32.101, "step": 125450 }, { "epoch": 48.36223506743738, "grad_norm": 0.33441001176834106, "learning_rate": 6.5526011560693645e-06, "loss": 0.3156875228881836, "step": 125500 }, { "epoch": 48.36223506743738, "eval_loss": 0.3755277693271637, "eval_runtime": 12.3134, "eval_samples_per_second": 1543.362, "eval_steps_per_second": 32.16, "step": 125500 }, { "epoch": 48.38150289017341, "grad_norm": 0.31255850195884705, "learning_rate": 6.475529865125241e-06, "loss": 0.3113642501831055, "step": 125550 }, { "epoch": 48.38150289017341, "eval_loss": 0.38143154978752136, "eval_runtime": 12.3236, "eval_samples_per_second": 1542.084, "eval_steps_per_second": 32.134, "step": 125550 }, { "epoch": 48.40077071290944, "grad_norm": 0.3218488395214081, "learning_rate": 6.398458574181118e-06, "loss": 0.3171889495849609, "step": 125600 }, { "epoch": 48.40077071290944, "eval_loss": 0.37087661027908325, "eval_runtime": 12.3421, "eval_samples_per_second": 1539.772, "eval_steps_per_second": 32.085, "step": 125600 }, { "epoch": 48.420038535645475, "grad_norm": 0.32948046922683716, "learning_rate": 6.321387283236995e-06, "loss": 0.317139949798584, "step": 125650 }, { "epoch": 48.420038535645475, "eval_loss": 0.37450286746025085, "eval_runtime": 12.331, "eval_samples_per_second": 1541.155, "eval_steps_per_second": 32.114, "step": 125650 }, { "epoch": 48.4393063583815, "grad_norm": 0.2921614944934845, "learning_rate": 6.244315992292871e-06, "loss": 0.3172334289550781, "step": 125700 }, { "epoch": 48.4393063583815, "eval_loss": 0.3785669803619385, "eval_runtime": 12.3282, "eval_samples_per_second": 1541.508, "eval_steps_per_second": 32.122, "step": 125700 }, { "epoch": 48.458574181117534, "grad_norm": 0.3041069805622101, "learning_rate": 6.1672447013487475e-06, "loss": 0.3159562301635742, "step": 125750 }, { "epoch": 48.458574181117534, "eval_loss": 0.3758064806461334, "eval_runtime": 12.1943, "eval_samples_per_second": 1558.429, "eval_steps_per_second": 32.474, "step": 125750 }, { "epoch": 48.47784200385357, "grad_norm": 0.2780841588973999, "learning_rate": 6.090173410404625e-06, "loss": 0.31322240829467773, "step": 125800 }, { "epoch": 48.47784200385357, "eval_loss": 0.3739124536514282, "eval_runtime": 12.5483, "eval_samples_per_second": 1514.466, "eval_steps_per_second": 31.558, "step": 125800 }, { "epoch": 48.49710982658959, "grad_norm": 0.31909555196762085, "learning_rate": 6.013102119460501e-06, "loss": 0.3125851440429688, "step": 125850 }, { "epoch": 48.49710982658959, "eval_loss": 0.36966589093208313, "eval_runtime": 12.5186, "eval_samples_per_second": 1518.061, "eval_steps_per_second": 31.633, "step": 125850 }, { "epoch": 48.51637764932563, "grad_norm": 0.34710410237312317, "learning_rate": 5.9360308285163776e-06, "loss": 0.31891685485839844, "step": 125900 }, { "epoch": 48.51637764932563, "eval_loss": 0.371250718832016, "eval_runtime": 12.508, "eval_samples_per_second": 1519.346, "eval_steps_per_second": 31.66, "step": 125900 }, { "epoch": 48.53564547206166, "grad_norm": 0.34938186407089233, "learning_rate": 5.858959537572255e-06, "loss": 0.31159215927124023, "step": 125950 }, { "epoch": 48.53564547206166, "eval_loss": 0.373399943113327, "eval_runtime": 12.5632, "eval_samples_per_second": 1512.668, "eval_steps_per_second": 31.521, "step": 125950 }, { "epoch": 48.554913294797686, "grad_norm": 0.3336155116558075, "learning_rate": 5.781888246628131e-06, "loss": 0.3184162521362305, "step": 126000 }, { "epoch": 48.554913294797686, "eval_loss": 0.3801146149635315, "eval_runtime": 12.5704, "eval_samples_per_second": 1511.8, "eval_steps_per_second": 31.502, "step": 126000 }, { "epoch": 48.57418111753372, "grad_norm": 0.33880093693733215, "learning_rate": 5.704816955684008e-06, "loss": 0.318155574798584, "step": 126050 }, { "epoch": 48.57418111753372, "eval_loss": 0.37473320960998535, "eval_runtime": 12.5885, "eval_samples_per_second": 1509.63, "eval_steps_per_second": 31.457, "step": 126050 }, { "epoch": 48.59344894026975, "grad_norm": 0.29124149680137634, "learning_rate": 5.627745664739885e-06, "loss": 0.3089302635192871, "step": 126100 }, { "epoch": 48.59344894026975, "eval_loss": 0.37449708580970764, "eval_runtime": 12.1816, "eval_samples_per_second": 1560.059, "eval_steps_per_second": 32.508, "step": 126100 }, { "epoch": 48.61271676300578, "grad_norm": 0.47301697731018066, "learning_rate": 5.550674373795761e-06, "loss": 0.3167064476013184, "step": 126150 }, { "epoch": 48.61271676300578, "eval_loss": 0.3779939115047455, "eval_runtime": 12.3481, "eval_samples_per_second": 1539.028, "eval_steps_per_second": 32.07, "step": 126150 }, { "epoch": 48.63198458574181, "grad_norm": 0.3405545651912689, "learning_rate": 5.473603082851638e-06, "loss": 0.31468490600585936, "step": 126200 }, { "epoch": 48.63198458574181, "eval_loss": 0.37360289692878723, "eval_runtime": 12.3825, "eval_samples_per_second": 1534.744, "eval_steps_per_second": 31.981, "step": 126200 }, { "epoch": 48.651252408477845, "grad_norm": 0.3534184694290161, "learning_rate": 5.396531791907514e-06, "loss": 0.31267950057983396, "step": 126250 }, { "epoch": 48.651252408477845, "eval_loss": 0.3719702959060669, "eval_runtime": 12.3079, "eval_samples_per_second": 1544.051, "eval_steps_per_second": 32.175, "step": 126250 }, { "epoch": 48.67052023121387, "grad_norm": 0.3424595594406128, "learning_rate": 5.3194605009633915e-06, "loss": 0.30739004135131837, "step": 126300 }, { "epoch": 48.67052023121387, "eval_loss": 0.3834350109100342, "eval_runtime": 12.4541, "eval_samples_per_second": 1525.919, "eval_steps_per_second": 31.797, "step": 126300 }, { "epoch": 48.689788053949904, "grad_norm": 0.3366718590259552, "learning_rate": 5.242389210019268e-06, "loss": 0.3116274642944336, "step": 126350 }, { "epoch": 48.689788053949904, "eval_loss": 0.37771639227867126, "eval_runtime": 12.3443, "eval_samples_per_second": 1539.493, "eval_steps_per_second": 32.08, "step": 126350 }, { "epoch": 48.70905587668594, "grad_norm": 0.33036479353904724, "learning_rate": 5.165317919075144e-06, "loss": 0.3173628234863281, "step": 126400 }, { "epoch": 48.70905587668594, "eval_loss": 0.37642914056777954, "eval_runtime": 12.3455, "eval_samples_per_second": 1539.353, "eval_steps_per_second": 32.077, "step": 126400 }, { "epoch": 48.72832369942196, "grad_norm": 0.292422890663147, "learning_rate": 5.088246628131022e-06, "loss": 0.31515012741088866, "step": 126450 }, { "epoch": 48.72832369942196, "eval_loss": 0.37340351939201355, "eval_runtime": 12.1672, "eval_samples_per_second": 1561.903, "eval_steps_per_second": 32.546, "step": 126450 }, { "epoch": 48.747591522158, "grad_norm": 0.3555428683757782, "learning_rate": 5.011175337186899e-06, "loss": 0.3121352767944336, "step": 126500 }, { "epoch": 48.747591522158, "eval_loss": 0.3738767206668854, "eval_runtime": 12.5319, "eval_samples_per_second": 1516.452, "eval_steps_per_second": 31.599, "step": 126500 }, { "epoch": 48.76685934489403, "grad_norm": 0.332850843667984, "learning_rate": 4.9341040462427744e-06, "loss": 0.3154208374023437, "step": 126550 }, { "epoch": 48.76685934489403, "eval_loss": 0.37823599576950073, "eval_runtime": 12.5204, "eval_samples_per_second": 1517.844, "eval_steps_per_second": 31.628, "step": 126550 }, { "epoch": 48.786127167630056, "grad_norm": 0.3141418695449829, "learning_rate": 4.857032755298652e-06, "loss": 0.3136017990112305, "step": 126600 }, { "epoch": 48.786127167630056, "eval_loss": 0.3705299198627472, "eval_runtime": 12.5854, "eval_samples_per_second": 1510.004, "eval_steps_per_second": 31.465, "step": 126600 }, { "epoch": 48.80539499036609, "grad_norm": 0.32747882604599, "learning_rate": 4.779961464354528e-06, "loss": 0.3118405723571777, "step": 126650 }, { "epoch": 48.80539499036609, "eval_loss": 0.37061241269111633, "eval_runtime": 12.346, "eval_samples_per_second": 1539.29, "eval_steps_per_second": 32.075, "step": 126650 }, { "epoch": 48.82466281310212, "grad_norm": 0.32652801275253296, "learning_rate": 4.7028901734104045e-06, "loss": 0.3130314636230469, "step": 126700 }, { "epoch": 48.82466281310212, "eval_loss": 0.3777045011520386, "eval_runtime": 12.3807, "eval_samples_per_second": 1534.972, "eval_steps_per_second": 31.985, "step": 126700 }, { "epoch": 48.84393063583815, "grad_norm": 0.35098356008529663, "learning_rate": 4.625818882466282e-06, "loss": 0.313006649017334, "step": 126750 }, { "epoch": 48.84393063583815, "eval_loss": 0.38011181354522705, "eval_runtime": 12.4316, "eval_samples_per_second": 1528.679, "eval_steps_per_second": 31.854, "step": 126750 }, { "epoch": 48.86319845857418, "grad_norm": 0.3428313136100769, "learning_rate": 4.548747591522158e-06, "loss": 0.31282934188842776, "step": 126800 }, { "epoch": 48.86319845857418, "eval_loss": 0.3751291334629059, "eval_runtime": 12.1745, "eval_samples_per_second": 1560.964, "eval_steps_per_second": 32.527, "step": 126800 }, { "epoch": 48.882466281310215, "grad_norm": 0.3416329324245453, "learning_rate": 4.471676300578035e-06, "loss": 0.31242416381835936, "step": 126850 }, { "epoch": 48.882466281310215, "eval_loss": 0.37883928418159485, "eval_runtime": 12.4791, "eval_samples_per_second": 1522.865, "eval_steps_per_second": 31.733, "step": 126850 }, { "epoch": 48.90173410404624, "grad_norm": 0.33391645550727844, "learning_rate": 4.394605009633911e-06, "loss": 0.31442167282104494, "step": 126900 }, { "epoch": 48.90173410404624, "eval_loss": 0.37575867772102356, "eval_runtime": 12.3584, "eval_samples_per_second": 1537.734, "eval_steps_per_second": 32.043, "step": 126900 }, { "epoch": 48.921001926782274, "grad_norm": 0.3005427420139313, "learning_rate": 4.317533718689788e-06, "loss": 0.311391658782959, "step": 126950 }, { "epoch": 48.921001926782274, "eval_loss": 0.38130486011505127, "eval_runtime": 12.3468, "eval_samples_per_second": 1539.187, "eval_steps_per_second": 32.073, "step": 126950 }, { "epoch": 48.94026974951831, "grad_norm": 0.33095765113830566, "learning_rate": 4.240462427745665e-06, "loss": 0.3083032989501953, "step": 127000 }, { "epoch": 48.94026974951831, "eval_loss": 0.36842289566993713, "eval_runtime": 12.3387, "eval_samples_per_second": 1540.196, "eval_steps_per_second": 32.094, "step": 127000 }, { "epoch": 48.959537572254334, "grad_norm": 0.33910655975341797, "learning_rate": 4.163391136801541e-06, "loss": 0.3121000289916992, "step": 127050 }, { "epoch": 48.959537572254334, "eval_loss": 0.371126264333725, "eval_runtime": 12.374, "eval_samples_per_second": 1535.798, "eval_steps_per_second": 32.003, "step": 127050 }, { "epoch": 48.97880539499037, "grad_norm": 0.3132646977901459, "learning_rate": 4.0863198458574185e-06, "loss": 0.3128261375427246, "step": 127100 }, { "epoch": 48.97880539499037, "eval_loss": 0.37540972232818604, "eval_runtime": 12.5108, "eval_samples_per_second": 1519.007, "eval_steps_per_second": 31.653, "step": 127100 }, { "epoch": 48.9980732177264, "grad_norm": 0.32786962389945984, "learning_rate": 4.009248554913295e-06, "loss": 0.3145319175720215, "step": 127150 }, { "epoch": 48.9980732177264, "eval_loss": 0.3735699951648712, "eval_runtime": 12.3304, "eval_samples_per_second": 1541.231, "eval_steps_per_second": 32.116, "step": 127150 }, { "epoch": 49.017341040462426, "grad_norm": 0.34330835938453674, "learning_rate": 3.932177263969171e-06, "loss": 0.3136917495727539, "step": 127200 }, { "epoch": 49.017341040462426, "eval_loss": 0.38108447194099426, "eval_runtime": 12.3956, "eval_samples_per_second": 1533.127, "eval_steps_per_second": 31.947, "step": 127200 }, { "epoch": 49.03660886319846, "grad_norm": 0.3595306873321533, "learning_rate": 3.8551059730250486e-06, "loss": 0.3069376754760742, "step": 127250 }, { "epoch": 49.03660886319846, "eval_loss": 0.3777084946632385, "eval_runtime": 12.3867, "eval_samples_per_second": 1534.226, "eval_steps_per_second": 31.97, "step": 127250 }, { "epoch": 49.05587668593449, "grad_norm": 0.32325655221939087, "learning_rate": 3.778034682080925e-06, "loss": 0.30812911987304686, "step": 127300 }, { "epoch": 49.05587668593449, "eval_loss": 0.379452645778656, "eval_runtime": 12.3479, "eval_samples_per_second": 1539.05, "eval_steps_per_second": 32.07, "step": 127300 }, { "epoch": 49.07514450867052, "grad_norm": 0.3118170499801636, "learning_rate": 3.700963391136802e-06, "loss": 0.3114314651489258, "step": 127350 }, { "epoch": 49.07514450867052, "eval_loss": 0.3757716417312622, "eval_runtime": 12.3382, "eval_samples_per_second": 1540.258, "eval_steps_per_second": 32.095, "step": 127350 }, { "epoch": 49.09441233140655, "grad_norm": 0.30816927552223206, "learning_rate": 3.6238921001926787e-06, "loss": 0.31267002105712893, "step": 127400 }, { "epoch": 49.09441233140655, "eval_loss": 0.3783451020717621, "eval_runtime": 12.6735, "eval_samples_per_second": 1499.508, "eval_steps_per_second": 31.246, "step": 127400 }, { "epoch": 49.113680154142585, "grad_norm": 0.30862778425216675, "learning_rate": 3.546820809248555e-06, "loss": 0.3128258895874023, "step": 127450 }, { "epoch": 49.113680154142585, "eval_loss": 0.37591612339019775, "eval_runtime": 12.4357, "eval_samples_per_second": 1528.184, "eval_steps_per_second": 31.844, "step": 127450 }, { "epoch": 49.13294797687861, "grad_norm": 0.332684189081192, "learning_rate": 3.469749518304432e-06, "loss": 0.3167525291442871, "step": 127500 }, { "epoch": 49.13294797687861, "eval_loss": 0.3715929388999939, "eval_runtime": 12.2005, "eval_samples_per_second": 1557.647, "eval_steps_per_second": 32.458, "step": 127500 }, { "epoch": 49.152215799614645, "grad_norm": 0.36266401410102844, "learning_rate": 3.3926782273603084e-06, "loss": 0.31350852966308596, "step": 127550 }, { "epoch": 49.152215799614645, "eval_loss": 0.37509670853614807, "eval_runtime": 12.5457, "eval_samples_per_second": 1514.779, "eval_steps_per_second": 31.565, "step": 127550 }, { "epoch": 49.17148362235068, "grad_norm": 0.3281245827674866, "learning_rate": 3.3156069364161852e-06, "loss": 0.3100265884399414, "step": 127600 }, { "epoch": 49.17148362235068, "eval_loss": 0.37851524353027344, "eval_runtime": 12.5346, "eval_samples_per_second": 1516.121, "eval_steps_per_second": 31.593, "step": 127600 }, { "epoch": 49.190751445086704, "grad_norm": 0.30824142694473267, "learning_rate": 3.238535645472062e-06, "loss": 0.312092170715332, "step": 127650 }, { "epoch": 49.190751445086704, "eval_loss": 0.37551149725914, "eval_runtime": 12.5337, "eval_samples_per_second": 1516.237, "eval_steps_per_second": 31.595, "step": 127650 }, { "epoch": 49.21001926782274, "grad_norm": 0.3404642641544342, "learning_rate": 3.1614643545279385e-06, "loss": 0.3092914962768555, "step": 127700 }, { "epoch": 49.21001926782274, "eval_loss": 0.37659066915512085, "eval_runtime": 12.5264, "eval_samples_per_second": 1517.112, "eval_steps_per_second": 31.613, "step": 127700 }, { "epoch": 49.22928709055876, "grad_norm": 0.3702749013900757, "learning_rate": 3.0843930635838153e-06, "loss": 0.3158684158325195, "step": 127750 }, { "epoch": 49.22928709055876, "eval_loss": 0.3714820146560669, "eval_runtime": 12.5092, "eval_samples_per_second": 1519.202, "eval_steps_per_second": 31.657, "step": 127750 }, { "epoch": 49.2485549132948, "grad_norm": 0.36979177594184875, "learning_rate": 3.0073217726396918e-06, "loss": 0.31175344467163085, "step": 127800 }, { "epoch": 49.2485549132948, "eval_loss": 0.374685138463974, "eval_runtime": 12.4929, "eval_samples_per_second": 1521.188, "eval_steps_per_second": 31.698, "step": 127800 }, { "epoch": 49.26782273603083, "grad_norm": 0.36520740389823914, "learning_rate": 2.9302504816955686e-06, "loss": 0.31313255310058596, "step": 127850 }, { "epoch": 49.26782273603083, "eval_loss": 0.3790181577205658, "eval_runtime": 12.189, "eval_samples_per_second": 1559.113, "eval_steps_per_second": 32.488, "step": 127850 }, { "epoch": 49.287090558766856, "grad_norm": 0.3170088231563568, "learning_rate": 2.853179190751445e-06, "loss": 0.31387088775634764, "step": 127900 }, { "epoch": 49.287090558766856, "eval_loss": 0.38057762384414673, "eval_runtime": 12.4064, "eval_samples_per_second": 1531.793, "eval_steps_per_second": 31.919, "step": 127900 }, { "epoch": 49.30635838150289, "grad_norm": 0.3228326141834259, "learning_rate": 2.776107899807322e-06, "loss": 0.31400955200195313, "step": 127950 }, { "epoch": 49.30635838150289, "eval_loss": 0.37723463773727417, "eval_runtime": 12.3372, "eval_samples_per_second": 1540.387, "eval_steps_per_second": 32.098, "step": 127950 }, { "epoch": 49.32562620423892, "grad_norm": 0.32092100381851196, "learning_rate": 2.6990366088631987e-06, "loss": 0.31626760482788085, "step": 128000 }, { "epoch": 49.32562620423892, "eval_loss": 0.37647855281829834, "eval_runtime": 12.4385, "eval_samples_per_second": 1527.835, "eval_steps_per_second": 31.837, "step": 128000 }, { "epoch": 49.34489402697495, "grad_norm": 0.33620157837867737, "learning_rate": 2.621965317919075e-06, "loss": 0.3122221374511719, "step": 128050 }, { "epoch": 49.34489402697495, "eval_loss": 0.38114631175994873, "eval_runtime": 12.3592, "eval_samples_per_second": 1537.641, "eval_steps_per_second": 32.041, "step": 128050 }, { "epoch": 49.36416184971098, "grad_norm": 0.3432556092739105, "learning_rate": 2.544894026974952e-06, "loss": 0.3141707611083984, "step": 128100 }, { "epoch": 49.36416184971098, "eval_loss": 0.3766125440597534, "eval_runtime": 12.3523, "eval_samples_per_second": 1538.494, "eval_steps_per_second": 32.059, "step": 128100 }, { "epoch": 49.383429672447015, "grad_norm": 0.3112456500530243, "learning_rate": 2.4678227360308284e-06, "loss": 0.314884147644043, "step": 128150 }, { "epoch": 49.383429672447015, "eval_loss": 0.37907496094703674, "eval_runtime": 12.3572, "eval_samples_per_second": 1537.891, "eval_steps_per_second": 32.046, "step": 128150 }, { "epoch": 49.40269749518304, "grad_norm": 0.3016362488269806, "learning_rate": 2.3907514450867052e-06, "loss": 0.3145466423034668, "step": 128200 }, { "epoch": 49.40269749518304, "eval_loss": 0.3739704191684723, "eval_runtime": 12.2784, "eval_samples_per_second": 1547.761, "eval_steps_per_second": 32.252, "step": 128200 }, { "epoch": 49.421965317919074, "grad_norm": 0.2896656394004822, "learning_rate": 2.313680154142582e-06, "loss": 0.3107080078125, "step": 128250 }, { "epoch": 49.421965317919074, "eval_loss": 0.37697139382362366, "eval_runtime": 12.5988, "eval_samples_per_second": 1508.393, "eval_steps_per_second": 31.431, "step": 128250 }, { "epoch": 49.44123314065511, "grad_norm": 0.3054453134536743, "learning_rate": 2.2366088631984585e-06, "loss": 0.31587413787841795, "step": 128300 }, { "epoch": 49.44123314065511, "eval_loss": 0.3727811574935913, "eval_runtime": 12.5174, "eval_samples_per_second": 1518.209, "eval_steps_per_second": 31.636, "step": 128300 }, { "epoch": 49.460500963391134, "grad_norm": 0.34844058752059937, "learning_rate": 2.1595375722543354e-06, "loss": 0.31065170288085936, "step": 128350 }, { "epoch": 49.460500963391134, "eval_loss": 0.370384156703949, "eval_runtime": 12.5733, "eval_samples_per_second": 1511.454, "eval_steps_per_second": 31.495, "step": 128350 }, { "epoch": 49.47976878612717, "grad_norm": 0.33221831917762756, "learning_rate": 2.082466281310212e-06, "loss": 0.3147000503540039, "step": 128400 }, { "epoch": 49.47976878612717, "eval_loss": 0.37621158361434937, "eval_runtime": 12.5404, "eval_samples_per_second": 1515.417, "eval_steps_per_second": 31.578, "step": 128400 }, { "epoch": 49.4990366088632, "grad_norm": 0.333281010389328, "learning_rate": 2.0053949903660886e-06, "loss": 0.3152140426635742, "step": 128450 }, { "epoch": 49.4990366088632, "eval_loss": 0.37135961651802063, "eval_runtime": 12.3332, "eval_samples_per_second": 1540.88, "eval_steps_per_second": 32.108, "step": 128450 }, { "epoch": 49.518304431599226, "grad_norm": 0.34447622299194336, "learning_rate": 1.9283236994219655e-06, "loss": 0.31316892623901366, "step": 128500 }, { "epoch": 49.518304431599226, "eval_loss": 0.37598878145217896, "eval_runtime": 12.3353, "eval_samples_per_second": 1540.613, "eval_steps_per_second": 32.103, "step": 128500 }, { "epoch": 49.53757225433526, "grad_norm": 0.31129783391952515, "learning_rate": 1.8512524084778421e-06, "loss": 0.31180267333984374, "step": 128550 }, { "epoch": 49.53757225433526, "eval_loss": 0.3795592784881592, "eval_runtime": 12.197, "eval_samples_per_second": 1558.086, "eval_steps_per_second": 32.467, "step": 128550 }, { "epoch": 49.55684007707129, "grad_norm": 0.3488447368144989, "learning_rate": 1.7741811175337185e-06, "loss": 0.31554647445678713, "step": 128600 }, { "epoch": 49.55684007707129, "eval_loss": 0.3731045126914978, "eval_runtime": 12.5879, "eval_samples_per_second": 1509.7, "eval_steps_per_second": 31.459, "step": 128600 }, { "epoch": 49.57610789980732, "grad_norm": 0.3336491584777832, "learning_rate": 1.6971098265895956e-06, "loss": 0.3118484115600586, "step": 128650 }, { "epoch": 49.57610789980732, "eval_loss": 0.37234219908714294, "eval_runtime": 12.5693, "eval_samples_per_second": 1511.933, "eval_steps_per_second": 31.505, "step": 128650 }, { "epoch": 49.59537572254335, "grad_norm": 0.33818238973617554, "learning_rate": 1.6200385356454722e-06, "loss": 0.31385269165039065, "step": 128700 }, { "epoch": 49.59537572254335, "eval_loss": 0.3730485439300537, "eval_runtime": 12.5613, "eval_samples_per_second": 1512.901, "eval_steps_per_second": 31.525, "step": 128700 }, { "epoch": 49.614643545279385, "grad_norm": 0.3071143627166748, "learning_rate": 1.5429672447013489e-06, "loss": 0.3129909896850586, "step": 128750 }, { "epoch": 49.614643545279385, "eval_loss": 0.3714693784713745, "eval_runtime": 12.3291, "eval_samples_per_second": 1541.395, "eval_steps_per_second": 32.119, "step": 128750 }, { "epoch": 49.63391136801541, "grad_norm": 0.39351969957351685, "learning_rate": 1.4658959537572255e-06, "loss": 0.3115042114257813, "step": 128800 }, { "epoch": 49.63391136801541, "eval_loss": 0.3793357312679291, "eval_runtime": 12.4263, "eval_samples_per_second": 1529.342, "eval_steps_per_second": 31.868, "step": 128800 }, { "epoch": 49.653179190751445, "grad_norm": 0.3372744619846344, "learning_rate": 1.3888246628131021e-06, "loss": 0.3151411056518555, "step": 128850 }, { "epoch": 49.653179190751445, "eval_loss": 0.37741291522979736, "eval_runtime": 12.4755, "eval_samples_per_second": 1523.304, "eval_steps_per_second": 31.742, "step": 128850 }, { "epoch": 49.67244701348748, "grad_norm": 0.3770137131214142, "learning_rate": 1.311753371868979e-06, "loss": 0.31294944763183596, "step": 128900 }, { "epoch": 49.67244701348748, "eval_loss": 0.3769587278366089, "eval_runtime": 12.1937, "eval_samples_per_second": 1558.515, "eval_steps_per_second": 32.476, "step": 128900 }, { "epoch": 49.691714836223504, "grad_norm": 0.37299880385398865, "learning_rate": 1.2346820809248556e-06, "loss": 0.3116561698913574, "step": 128950 }, { "epoch": 49.691714836223504, "eval_loss": 0.37794843316078186, "eval_runtime": 12.6365, "eval_samples_per_second": 1503.898, "eval_steps_per_second": 31.338, "step": 128950 }, { "epoch": 49.71098265895954, "grad_norm": 0.3475261330604553, "learning_rate": 1.1576107899807322e-06, "loss": 0.31387279510498045, "step": 129000 }, { "epoch": 49.71098265895954, "eval_loss": 0.37873902916908264, "eval_runtime": 12.4016, "eval_samples_per_second": 1532.385, "eval_steps_per_second": 31.931, "step": 129000 }, { "epoch": 49.73025048169557, "grad_norm": 0.31200867891311646, "learning_rate": 1.0805394990366089e-06, "loss": 0.31226469039916993, "step": 129050 }, { "epoch": 49.73025048169557, "eval_loss": 0.3737594485282898, "eval_runtime": 12.3435, "eval_samples_per_second": 1539.601, "eval_steps_per_second": 32.082, "step": 129050 }, { "epoch": 49.7495183044316, "grad_norm": 0.3299984335899353, "learning_rate": 1.0034682080924855e-06, "loss": 0.30944189071655276, "step": 129100 }, { "epoch": 49.7495183044316, "eval_loss": 0.3802502453327179, "eval_runtime": 12.3548, "eval_samples_per_second": 1538.193, "eval_steps_per_second": 32.052, "step": 129100 }, { "epoch": 49.76878612716763, "grad_norm": 0.29350775480270386, "learning_rate": 9.263969171483623e-07, "loss": 0.3128667449951172, "step": 129150 }, { "epoch": 49.76878612716763, "eval_loss": 0.3822285830974579, "eval_runtime": 12.6315, "eval_samples_per_second": 1504.487, "eval_steps_per_second": 31.35, "step": 129150 }, { "epoch": 49.78805394990366, "grad_norm": 0.34126511216163635, "learning_rate": 8.49325626204239e-07, "loss": 0.3171816635131836, "step": 129200 }, { "epoch": 49.78805394990366, "eval_loss": 0.3725033104419708, "eval_runtime": 12.5226, "eval_samples_per_second": 1517.575, "eval_steps_per_second": 31.623, "step": 129200 }, { "epoch": 49.80732177263969, "grad_norm": 0.30340707302093506, "learning_rate": 7.722543352601156e-07, "loss": 0.31133209228515624, "step": 129250 }, { "epoch": 49.80732177263969, "eval_loss": 0.36940038204193115, "eval_runtime": 12.2089, "eval_samples_per_second": 1556.571, "eval_steps_per_second": 32.435, "step": 129250 }, { "epoch": 49.82658959537572, "grad_norm": 0.34533703327178955, "learning_rate": 6.951830443159924e-07, "loss": 0.31458875656127927, "step": 129300 }, { "epoch": 49.82658959537572, "eval_loss": 0.3746315836906433, "eval_runtime": 12.6898, "eval_samples_per_second": 1497.581, "eval_steps_per_second": 31.206, "step": 129300 }, { "epoch": 49.845857418111756, "grad_norm": 0.33609938621520996, "learning_rate": 6.18111753371869e-07, "loss": 0.3112278366088867, "step": 129350 }, { "epoch": 49.845857418111756, "eval_loss": 0.37403663992881775, "eval_runtime": 12.8805, "eval_samples_per_second": 1475.41, "eval_steps_per_second": 30.744, "step": 129350 }, { "epoch": 49.86512524084778, "grad_norm": 0.3107355237007141, "learning_rate": 5.410404624277457e-07, "loss": 0.3109264373779297, "step": 129400 }, { "epoch": 49.86512524084778, "eval_loss": 0.37232062220573425, "eval_runtime": 12.5685, "eval_samples_per_second": 1512.033, "eval_steps_per_second": 31.507, "step": 129400 }, { "epoch": 49.884393063583815, "grad_norm": 0.32119622826576233, "learning_rate": 4.639691714836224e-07, "loss": 0.31297595977783205, "step": 129450 }, { "epoch": 49.884393063583815, "eval_loss": 0.37324583530426025, "eval_runtime": 12.5971, "eval_samples_per_second": 1508.603, "eval_steps_per_second": 31.436, "step": 129450 }, { "epoch": 49.90366088631985, "grad_norm": 0.32297369837760925, "learning_rate": 3.8689788053949904e-07, "loss": 0.317218017578125, "step": 129500 }, { "epoch": 49.90366088631985, "eval_loss": 0.37822598218917847, "eval_runtime": 12.5888, "eval_samples_per_second": 1509.598, "eval_steps_per_second": 31.457, "step": 129500 }, { "epoch": 49.922928709055874, "grad_norm": 0.3293539583683014, "learning_rate": 3.0982658959537573e-07, "loss": 0.3134767150878906, "step": 129550 }, { "epoch": 49.922928709055874, "eval_loss": 0.3847479224205017, "eval_runtime": 12.5736, "eval_samples_per_second": 1511.417, "eval_steps_per_second": 31.494, "step": 129550 }, { "epoch": 49.94219653179191, "grad_norm": 0.31514301896095276, "learning_rate": 2.3275529865125242e-07, "loss": 0.3057246971130371, "step": 129600 }, { "epoch": 49.94219653179191, "eval_loss": 0.3657572269439697, "eval_runtime": 12.5202, "eval_samples_per_second": 1517.864, "eval_steps_per_second": 31.629, "step": 129600 }, { "epoch": 49.96146435452794, "grad_norm": 0.3282798230648041, "learning_rate": 1.556840077071291e-07, "loss": 0.31438514709472654, "step": 129650 }, { "epoch": 49.96146435452794, "eval_loss": 0.3816390037536621, "eval_runtime": 12.5235, "eval_samples_per_second": 1517.47, "eval_steps_per_second": 31.621, "step": 129650 }, { "epoch": 49.98073217726397, "grad_norm": 0.32588261365890503, "learning_rate": 7.861271676300579e-08, "loss": 0.31282987594604494, "step": 129700 }, { "epoch": 49.98073217726397, "eval_loss": 0.374738872051239, "eval_runtime": 12.7432, "eval_samples_per_second": 1491.302, "eval_steps_per_second": 31.075, "step": 129700 }, { "epoch": 50.0, "grad_norm": 0.3201296031475067, "learning_rate": 1.5414258188824663e-09, "loss": 0.31281429290771484, "step": 129750 }, { "epoch": 50.0, "eval_loss": 0.37653329968452454, "eval_runtime": 12.517, "eval_samples_per_second": 1518.255, "eval_steps_per_second": 31.637, "step": 129750 } ], "logging_steps": 50, "max_steps": 129750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0471193945906872e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }