diff --git "a/checkpoint-1976/trainer_state.json" "b/checkpoint-1976/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1976/trainer_state.json" @@ -0,0 +1,13866 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 500, + "global_step": 1976, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004048582995951417, + "grad_norm": 0.46473725687854356, + "learning_rate": 0.0, + "loss": 2.5926, + "step": 1 + }, + { + "epoch": 0.008097165991902834, + "grad_norm": 0.7862315968268553, + "learning_rate": 4.0485829959514176e-08, + "loss": 2.9114, + "step": 2 + }, + { + "epoch": 0.012145748987854251, + "grad_norm": 0.6677933506680473, + "learning_rate": 8.097165991902835e-08, + "loss": 2.7471, + "step": 3 + }, + { + "epoch": 0.016194331983805668, + "grad_norm": 0.8630518959378011, + "learning_rate": 1.2145748987854252e-07, + "loss": 2.8706, + "step": 4 + }, + { + "epoch": 0.020242914979757085, + "grad_norm": 0.5173190139924537, + "learning_rate": 1.619433198380567e-07, + "loss": 2.9912, + "step": 5 + }, + { + "epoch": 0.024291497975708502, + "grad_norm": 0.7759993718339214, + "learning_rate": 2.0242914979757086e-07, + "loss": 3.0072, + "step": 6 + }, + { + "epoch": 0.02834008097165992, + "grad_norm": 1.3755130452390263, + "learning_rate": 2.4291497975708504e-07, + "loss": 2.4721, + "step": 7 + }, + { + "epoch": 0.032388663967611336, + "grad_norm": 0.44121276912866286, + "learning_rate": 2.834008097165992e-07, + "loss": 2.843, + "step": 8 + }, + { + "epoch": 0.03643724696356275, + "grad_norm": 0.5559835506705462, + "learning_rate": 3.238866396761134e-07, + "loss": 2.9053, + "step": 9 + }, + { + "epoch": 0.04048582995951417, + "grad_norm": 0.6731704914870359, + "learning_rate": 3.6437246963562754e-07, + "loss": 2.7608, + "step": 10 + }, + { + "epoch": 0.044534412955465584, + "grad_norm": 0.43190024730085624, + "learning_rate": 4.048582995951417e-07, + "loss": 2.7074, + "step": 11 + }, + { + "epoch": 0.048582995951417005, + "grad_norm": 0.7594718614486027, + "learning_rate": 4.453441295546559e-07, + "loss": 2.7846, + "step": 12 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.4278958670654092, + "learning_rate": 4.858299595141701e-07, + "loss": 3.018, + "step": 13 + }, + { + "epoch": 0.05668016194331984, + "grad_norm": 0.48698492939265825, + "learning_rate": 5.263157894736843e-07, + "loss": 2.8131, + "step": 14 + }, + { + "epoch": 0.06072874493927125, + "grad_norm": 0.405274105300616, + "learning_rate": 5.668016194331984e-07, + "loss": 2.8777, + "step": 15 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 0.5554327831452092, + "learning_rate": 6.072874493927125e-07, + "loss": 2.9472, + "step": 16 + }, + { + "epoch": 0.06882591093117409, + "grad_norm": 0.44756530277540646, + "learning_rate": 6.477732793522268e-07, + "loss": 3.0157, + "step": 17 + }, + { + "epoch": 0.0728744939271255, + "grad_norm": 0.8072585997136504, + "learning_rate": 6.882591093117409e-07, + "loss": 2.7773, + "step": 18 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.5635933276885046, + "learning_rate": 7.287449392712551e-07, + "loss": 2.7169, + "step": 19 + }, + { + "epoch": 0.08097165991902834, + "grad_norm": 0.4673928500608582, + "learning_rate": 7.692307692307694e-07, + "loss": 2.7934, + "step": 20 + }, + { + "epoch": 0.08502024291497975, + "grad_norm": 1.3664880257539318, + "learning_rate": 8.097165991902834e-07, + "loss": 2.713, + "step": 21 + }, + { + "epoch": 0.08906882591093117, + "grad_norm": 0.6438340318121762, + "learning_rate": 8.502024291497976e-07, + "loss": 2.8722, + "step": 22 + }, + { + "epoch": 0.0931174089068826, + "grad_norm": 0.512121787489251, + "learning_rate": 8.906882591093118e-07, + "loss": 2.722, + "step": 23 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 1.023552604444706, + "learning_rate": 9.31174089068826e-07, + "loss": 2.5291, + "step": 24 + }, + { + "epoch": 0.10121457489878542, + "grad_norm": 0.556430330792241, + "learning_rate": 9.716599190283402e-07, + "loss": 2.7028, + "step": 25 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 1.0165779263195185, + "learning_rate": 1.0121457489878542e-06, + "loss": 2.7946, + "step": 26 + }, + { + "epoch": 0.10931174089068826, + "grad_norm": 0.8434539164732048, + "learning_rate": 1.0526315789473685e-06, + "loss": 2.6139, + "step": 27 + }, + { + "epoch": 0.11336032388663968, + "grad_norm": 0.6252954896694622, + "learning_rate": 1.0931174089068828e-06, + "loss": 2.469, + "step": 28 + }, + { + "epoch": 0.11740890688259109, + "grad_norm": 0.8618444900481227, + "learning_rate": 1.133603238866397e-06, + "loss": 2.6452, + "step": 29 + }, + { + "epoch": 0.1214574898785425, + "grad_norm": 0.9066908581713439, + "learning_rate": 1.174089068825911e-06, + "loss": 2.4396, + "step": 30 + }, + { + "epoch": 0.12550607287449392, + "grad_norm": 0.528141325017682, + "learning_rate": 1.214574898785425e-06, + "loss": 2.469, + "step": 31 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 0.6378156052352336, + "learning_rate": 1.2550607287449393e-06, + "loss": 2.5795, + "step": 32 + }, + { + "epoch": 0.13360323886639677, + "grad_norm": 0.5624703100477139, + "learning_rate": 1.2955465587044536e-06, + "loss": 2.6768, + "step": 33 + }, + { + "epoch": 0.13765182186234817, + "grad_norm": 0.5821134471598685, + "learning_rate": 1.336032388663968e-06, + "loss": 2.8086, + "step": 34 + }, + { + "epoch": 0.1417004048582996, + "grad_norm": 0.6258194867082703, + "learning_rate": 1.3765182186234818e-06, + "loss": 2.3603, + "step": 35 + }, + { + "epoch": 0.145748987854251, + "grad_norm": 0.5477831289461287, + "learning_rate": 1.417004048582996e-06, + "loss": 2.7758, + "step": 36 + }, + { + "epoch": 0.14979757085020243, + "grad_norm": 0.5008051448479439, + "learning_rate": 1.4574898785425101e-06, + "loss": 2.7543, + "step": 37 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.5096264603702895, + "learning_rate": 1.4979757085020244e-06, + "loss": 2.7356, + "step": 38 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.6456644864025523, + "learning_rate": 1.5384615384615387e-06, + "loss": 3.0218, + "step": 39 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 0.5888424191973028, + "learning_rate": 1.5789473684210526e-06, + "loss": 2.6165, + "step": 40 + }, + { + "epoch": 0.1659919028340081, + "grad_norm": 0.7898553504446816, + "learning_rate": 1.6194331983805669e-06, + "loss": 2.6223, + "step": 41 + }, + { + "epoch": 0.1700404858299595, + "grad_norm": 0.6232472926548593, + "learning_rate": 1.6599190283400812e-06, + "loss": 2.7768, + "step": 42 + }, + { + "epoch": 0.17408906882591094, + "grad_norm": 0.6922764219271268, + "learning_rate": 1.7004048582995952e-06, + "loss": 2.479, + "step": 43 + }, + { + "epoch": 0.17813765182186234, + "grad_norm": 0.6679665416214551, + "learning_rate": 1.7408906882591095e-06, + "loss": 2.6842, + "step": 44 + }, + { + "epoch": 0.18218623481781376, + "grad_norm": 0.48868645690455986, + "learning_rate": 1.7813765182186236e-06, + "loss": 2.3611, + "step": 45 + }, + { + "epoch": 0.1862348178137652, + "grad_norm": 1.0959755351532565, + "learning_rate": 1.8218623481781379e-06, + "loss": 2.6644, + "step": 46 + }, + { + "epoch": 0.1902834008097166, + "grad_norm": 0.7403727047924632, + "learning_rate": 1.862348178137652e-06, + "loss": 2.7313, + "step": 47 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 0.5355809576361324, + "learning_rate": 1.902834008097166e-06, + "loss": 2.976, + "step": 48 + }, + { + "epoch": 0.19838056680161945, + "grad_norm": 0.6203117033335515, + "learning_rate": 1.9433198380566803e-06, + "loss": 2.8615, + "step": 49 + }, + { + "epoch": 0.20242914979757085, + "grad_norm": 0.6748602332749001, + "learning_rate": 1.9838056680161946e-06, + "loss": 2.7385, + "step": 50 + }, + { + "epoch": 0.20647773279352227, + "grad_norm": 0.6061522444778688, + "learning_rate": 2.0242914979757085e-06, + "loss": 2.7926, + "step": 51 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.5677094053210018, + "learning_rate": 2.0647773279352228e-06, + "loss": 2.8905, + "step": 52 + }, + { + "epoch": 0.2145748987854251, + "grad_norm": 0.7539663022721307, + "learning_rate": 2.105263157894737e-06, + "loss": 2.7044, + "step": 53 + }, + { + "epoch": 0.21862348178137653, + "grad_norm": 0.5511775427996539, + "learning_rate": 2.1457489878542513e-06, + "loss": 2.6044, + "step": 54 + }, + { + "epoch": 0.22267206477732793, + "grad_norm": 0.5001055873779205, + "learning_rate": 2.1862348178137656e-06, + "loss": 2.7154, + "step": 55 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 5.059433496293122, + "learning_rate": 2.2267206477732795e-06, + "loss": 2.6151, + "step": 56 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.5976992576491789, + "learning_rate": 2.267206477732794e-06, + "loss": 2.8561, + "step": 57 + }, + { + "epoch": 0.23481781376518218, + "grad_norm": 0.5650795458768608, + "learning_rate": 2.307692307692308e-06, + "loss": 2.994, + "step": 58 + }, + { + "epoch": 0.2388663967611336, + "grad_norm": 1.110043039226332, + "learning_rate": 2.348178137651822e-06, + "loss": 2.9581, + "step": 59 + }, + { + "epoch": 0.242914979757085, + "grad_norm": 0.8353821859752748, + "learning_rate": 2.3886639676113362e-06, + "loss": 2.9613, + "step": 60 + }, + { + "epoch": 0.24696356275303644, + "grad_norm": 0.7575324618871198, + "learning_rate": 2.42914979757085e-06, + "loss": 2.7295, + "step": 61 + }, + { + "epoch": 0.25101214574898784, + "grad_norm": 0.7791476828146748, + "learning_rate": 2.4696356275303644e-06, + "loss": 2.7126, + "step": 62 + }, + { + "epoch": 0.2550607287449393, + "grad_norm": 0.4809737260566304, + "learning_rate": 2.5101214574898787e-06, + "loss": 2.8892, + "step": 63 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 0.5968909877448142, + "learning_rate": 2.550607287449393e-06, + "loss": 2.6468, + "step": 64 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.7701935599652083, + "learning_rate": 2.5910931174089072e-06, + "loss": 2.5171, + "step": 65 + }, + { + "epoch": 0.26720647773279355, + "grad_norm": 0.49540617385936636, + "learning_rate": 2.631578947368421e-06, + "loss": 2.5617, + "step": 66 + }, + { + "epoch": 0.27125506072874495, + "grad_norm": 0.5880768265382437, + "learning_rate": 2.672064777327936e-06, + "loss": 2.6525, + "step": 67 + }, + { + "epoch": 0.27530364372469635, + "grad_norm": 0.8719044761766179, + "learning_rate": 2.7125506072874497e-06, + "loss": 2.5136, + "step": 68 + }, + { + "epoch": 0.2793522267206478, + "grad_norm": 0.7508384152907464, + "learning_rate": 2.7530364372469636e-06, + "loss": 2.7136, + "step": 69 + }, + { + "epoch": 0.2834008097165992, + "grad_norm": 0.7593508374848729, + "learning_rate": 2.7935222672064783e-06, + "loss": 2.5836, + "step": 70 + }, + { + "epoch": 0.2874493927125506, + "grad_norm": 0.6236865711432193, + "learning_rate": 2.834008097165992e-06, + "loss": 2.6042, + "step": 71 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 0.9207439340534006, + "learning_rate": 2.8744939271255064e-06, + "loss": 2.4534, + "step": 72 + }, + { + "epoch": 0.29554655870445345, + "grad_norm": 0.9048216657065745, + "learning_rate": 2.9149797570850203e-06, + "loss": 2.7732, + "step": 73 + }, + { + "epoch": 0.29959514170040485, + "grad_norm": 1.0531213295224573, + "learning_rate": 2.955465587044535e-06, + "loss": 2.6927, + "step": 74 + }, + { + "epoch": 0.30364372469635625, + "grad_norm": 0.8889664393499657, + "learning_rate": 2.995951417004049e-06, + "loss": 2.7532, + "step": 75 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.678148296266936, + "learning_rate": 3.0364372469635627e-06, + "loss": 2.4982, + "step": 76 + }, + { + "epoch": 0.3117408906882591, + "grad_norm": 0.9143989903488097, + "learning_rate": 3.0769230769230774e-06, + "loss": 2.4821, + "step": 77 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.7430526887934812, + "learning_rate": 3.1174089068825913e-06, + "loss": 2.8892, + "step": 78 + }, + { + "epoch": 0.31983805668016196, + "grad_norm": 1.0967354490931058, + "learning_rate": 3.157894736842105e-06, + "loss": 2.5355, + "step": 79 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 0.6474936013842225, + "learning_rate": 3.19838056680162e-06, + "loss": 2.4627, + "step": 80 + }, + { + "epoch": 0.32793522267206476, + "grad_norm": 0.8223317792104156, + "learning_rate": 3.2388663967611337e-06, + "loss": 2.5097, + "step": 81 + }, + { + "epoch": 0.3319838056680162, + "grad_norm": 0.8471027758590536, + "learning_rate": 3.279352226720648e-06, + "loss": 2.5888, + "step": 82 + }, + { + "epoch": 0.3360323886639676, + "grad_norm": 0.4892443825365843, + "learning_rate": 3.3198380566801623e-06, + "loss": 2.4857, + "step": 83 + }, + { + "epoch": 0.340080971659919, + "grad_norm": 0.6329419393193343, + "learning_rate": 3.3603238866396766e-06, + "loss": 2.3704, + "step": 84 + }, + { + "epoch": 0.3441295546558704, + "grad_norm": 0.7450745621264726, + "learning_rate": 3.4008097165991905e-06, + "loss": 2.4814, + "step": 85 + }, + { + "epoch": 0.3481781376518219, + "grad_norm": 0.7915890438013479, + "learning_rate": 3.4412955465587043e-06, + "loss": 2.7336, + "step": 86 + }, + { + "epoch": 0.3522267206477733, + "grad_norm": 0.8224002727747803, + "learning_rate": 3.481781376518219e-06, + "loss": 2.6197, + "step": 87 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 0.7379097347027997, + "learning_rate": 3.522267206477733e-06, + "loss": 2.3123, + "step": 88 + }, + { + "epoch": 0.3603238866396761, + "grad_norm": 0.63590140796502, + "learning_rate": 3.562753036437247e-06, + "loss": 2.659, + "step": 89 + }, + { + "epoch": 0.3643724696356275, + "grad_norm": 0.9402424866754966, + "learning_rate": 3.6032388663967615e-06, + "loss": 2.6324, + "step": 90 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.7757246306456501, + "learning_rate": 3.6437246963562758e-06, + "loss": 2.5935, + "step": 91 + }, + { + "epoch": 0.3724696356275304, + "grad_norm": 0.7001956828085119, + "learning_rate": 3.6842105263157896e-06, + "loss": 2.8634, + "step": 92 + }, + { + "epoch": 0.3765182186234818, + "grad_norm": 0.6770880287428972, + "learning_rate": 3.724696356275304e-06, + "loss": 2.3526, + "step": 93 + }, + { + "epoch": 0.3805668016194332, + "grad_norm": 0.7469924696350099, + "learning_rate": 3.7651821862348182e-06, + "loss": 2.4551, + "step": 94 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.6156146016330529, + "learning_rate": 3.805668016194332e-06, + "loss": 2.441, + "step": 95 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 0.7142333380873401, + "learning_rate": 3.846153846153847e-06, + "loss": 2.5222, + "step": 96 + }, + { + "epoch": 0.39271255060728744, + "grad_norm": 0.6126483934481857, + "learning_rate": 3.886639676113361e-06, + "loss": 2.6018, + "step": 97 + }, + { + "epoch": 0.3967611336032389, + "grad_norm": 0.7531177478658849, + "learning_rate": 3.9271255060728745e-06, + "loss": 2.4227, + "step": 98 + }, + { + "epoch": 0.4008097165991903, + "grad_norm": 0.7172471080034739, + "learning_rate": 3.967611336032389e-06, + "loss": 2.4637, + "step": 99 + }, + { + "epoch": 0.4048582995951417, + "grad_norm": 0.7800438096349082, + "learning_rate": 4.008097165991903e-06, + "loss": 2.5228, + "step": 100 + }, + { + "epoch": 0.4089068825910931, + "grad_norm": 0.8009705607457139, + "learning_rate": 4.048582995951417e-06, + "loss": 2.6356, + "step": 101 + }, + { + "epoch": 0.41295546558704455, + "grad_norm": 0.9574889353775141, + "learning_rate": 4.089068825910931e-06, + "loss": 2.3874, + "step": 102 + }, + { + "epoch": 0.41700404858299595, + "grad_norm": 0.7824043116812712, + "learning_rate": 4.1295546558704455e-06, + "loss": 2.6671, + "step": 103 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.7116660818199502, + "learning_rate": 4.170040485829959e-06, + "loss": 2.6795, + "step": 104 + }, + { + "epoch": 0.4251012145748988, + "grad_norm": 0.6234909516086495, + "learning_rate": 4.210526315789474e-06, + "loss": 2.4891, + "step": 105 + }, + { + "epoch": 0.4291497975708502, + "grad_norm": 0.7507042701110958, + "learning_rate": 4.251012145748988e-06, + "loss": 2.5374, + "step": 106 + }, + { + "epoch": 0.4331983805668016, + "grad_norm": 0.5830775553501698, + "learning_rate": 4.291497975708503e-06, + "loss": 2.4393, + "step": 107 + }, + { + "epoch": 0.43724696356275305, + "grad_norm": 0.8561666711107475, + "learning_rate": 4.3319838056680166e-06, + "loss": 2.3122, + "step": 108 + }, + { + "epoch": 0.44129554655870445, + "grad_norm": 0.914997362840242, + "learning_rate": 4.372469635627531e-06, + "loss": 2.5436, + "step": 109 + }, + { + "epoch": 0.44534412955465585, + "grad_norm": 0.6732155905531092, + "learning_rate": 4.412955465587045e-06, + "loss": 2.5005, + "step": 110 + }, + { + "epoch": 0.4493927125506073, + "grad_norm": 0.7462341368666683, + "learning_rate": 4.453441295546559e-06, + "loss": 2.4483, + "step": 111 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 0.8245738963488927, + "learning_rate": 4.493927125506074e-06, + "loss": 2.5333, + "step": 112 + }, + { + "epoch": 0.4574898785425101, + "grad_norm": 0.7702932505386926, + "learning_rate": 4.534412955465588e-06, + "loss": 2.5613, + "step": 113 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.3101615300934801, + "learning_rate": 4.5748987854251014e-06, + "loss": 2.973, + "step": 114 + }, + { + "epoch": 0.46558704453441296, + "grad_norm": 0.7651586289456958, + "learning_rate": 4.615384615384616e-06, + "loss": 2.5947, + "step": 115 + }, + { + "epoch": 0.46963562753036436, + "grad_norm": 0.8222224925704688, + "learning_rate": 4.65587044534413e-06, + "loss": 2.4581, + "step": 116 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.6556587501075568, + "learning_rate": 4.696356275303644e-06, + "loss": 2.4571, + "step": 117 + }, + { + "epoch": 0.4777327935222672, + "grad_norm": 0.821438637414972, + "learning_rate": 4.736842105263158e-06, + "loss": 2.6622, + "step": 118 + }, + { + "epoch": 0.4817813765182186, + "grad_norm": 0.6254867878515806, + "learning_rate": 4.7773279352226725e-06, + "loss": 2.3622, + "step": 119 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 0.6606998242945233, + "learning_rate": 4.817813765182186e-06, + "loss": 2.4812, + "step": 120 + }, + { + "epoch": 0.4898785425101215, + "grad_norm": 0.9140647082414407, + "learning_rate": 4.8582995951417e-06, + "loss": 2.5297, + "step": 121 + }, + { + "epoch": 0.4939271255060729, + "grad_norm": 0.8543729933153993, + "learning_rate": 4.898785425101215e-06, + "loss": 2.5534, + "step": 122 + }, + { + "epoch": 0.4979757085020243, + "grad_norm": 0.9641287101724041, + "learning_rate": 4.939271255060729e-06, + "loss": 2.3909, + "step": 123 + }, + { + "epoch": 0.5020242914979757, + "grad_norm": 0.7562747998003689, + "learning_rate": 4.9797570850202435e-06, + "loss": 2.3104, + "step": 124 + }, + { + "epoch": 0.5060728744939271, + "grad_norm": 0.9684058066200523, + "learning_rate": 5.020242914979757e-06, + "loss": 2.5894, + "step": 125 + }, + { + "epoch": 0.5101214574898786, + "grad_norm": 1.0833146453760147, + "learning_rate": 5.060728744939272e-06, + "loss": 2.686, + "step": 126 + }, + { + "epoch": 0.5141700404858299, + "grad_norm": 0.7212110120886743, + "learning_rate": 5.101214574898786e-06, + "loss": 2.5203, + "step": 127 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 0.9848467525032204, + "learning_rate": 5.1417004048583e-06, + "loss": 2.66, + "step": 128 + }, + { + "epoch": 0.5222672064777328, + "grad_norm": 0.78315965526943, + "learning_rate": 5.1821862348178145e-06, + "loss": 2.5008, + "step": 129 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.8583112834837245, + "learning_rate": 5.222672064777329e-06, + "loss": 2.3134, + "step": 130 + }, + { + "epoch": 0.5303643724696356, + "grad_norm": 0.7581206885647646, + "learning_rate": 5.263157894736842e-06, + "loss": 2.4191, + "step": 131 + }, + { + "epoch": 0.5344129554655871, + "grad_norm": 0.9695513408717512, + "learning_rate": 5.303643724696357e-06, + "loss": 2.5499, + "step": 132 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 6.764939321667699, + "learning_rate": 5.344129554655872e-06, + "loss": 2.4736, + "step": 133 + }, + { + "epoch": 0.5425101214574899, + "grad_norm": 1.0247610500949114, + "learning_rate": 5.384615384615385e-06, + "loss": 2.3723, + "step": 134 + }, + { + "epoch": 0.5465587044534413, + "grad_norm": 15.672428379790873, + "learning_rate": 5.425101214574899e-06, + "loss": 3.4815, + "step": 135 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 2.249245731133667, + "learning_rate": 5.465587044534414e-06, + "loss": 3.4231, + "step": 136 + }, + { + "epoch": 0.5546558704453441, + "grad_norm": 3.797144058522148, + "learning_rate": 5.506072874493927e-06, + "loss": 4.4025, + "step": 137 + }, + { + "epoch": 0.5587044534412956, + "grad_norm": 0.8114215476851966, + "learning_rate": 5.546558704453442e-06, + "loss": 2.3958, + "step": 138 + }, + { + "epoch": 0.562753036437247, + "grad_norm": 0.7631595156767096, + "learning_rate": 5.5870445344129565e-06, + "loss": 2.1963, + "step": 139 + }, + { + "epoch": 0.5668016194331984, + "grad_norm": 0.8648024420211529, + "learning_rate": 5.6275303643724695e-06, + "loss": 2.4664, + "step": 140 + }, + { + "epoch": 0.5708502024291497, + "grad_norm": 1.1398946486999715, + "learning_rate": 5.668016194331984e-06, + "loss": 2.2672, + "step": 141 + }, + { + "epoch": 0.5748987854251012, + "grad_norm": 0.7035715089344788, + "learning_rate": 5.708502024291498e-06, + "loss": 2.4001, + "step": 142 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.7842465817250697, + "learning_rate": 5.748987854251013e-06, + "loss": 2.2186, + "step": 143 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 0.8358191441707306, + "learning_rate": 5.789473684210527e-06, + "loss": 2.5692, + "step": 144 + }, + { + "epoch": 0.5870445344129555, + "grad_norm": 0.7027969455146362, + "learning_rate": 5.8299595141700406e-06, + "loss": 2.3088, + "step": 145 + }, + { + "epoch": 0.5910931174089069, + "grad_norm": 0.7026752876788243, + "learning_rate": 5.870445344129555e-06, + "loss": 2.4148, + "step": 146 + }, + { + "epoch": 0.5951417004048583, + "grad_norm": 0.9049685837714232, + "learning_rate": 5.91093117408907e-06, + "loss": 2.146, + "step": 147 + }, + { + "epoch": 0.5991902834008097, + "grad_norm": 0.8388567349727308, + "learning_rate": 5.951417004048583e-06, + "loss": 2.0989, + "step": 148 + }, + { + "epoch": 0.6032388663967612, + "grad_norm": 0.773577497225349, + "learning_rate": 5.991902834008098e-06, + "loss": 2.2379, + "step": 149 + }, + { + "epoch": 0.6072874493927125, + "grad_norm": 0.7826979729986758, + "learning_rate": 6.0323886639676124e-06, + "loss": 2.18, + "step": 150 + }, + { + "epoch": 0.611336032388664, + "grad_norm": 0.8592925674032668, + "learning_rate": 6.0728744939271254e-06, + "loss": 2.4302, + "step": 151 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.6169427006453612, + "learning_rate": 6.11336032388664e-06, + "loss": 2.2208, + "step": 152 + }, + { + "epoch": 0.6194331983805668, + "grad_norm": 0.8979145279675816, + "learning_rate": 6.153846153846155e-06, + "loss": 2.3089, + "step": 153 + }, + { + "epoch": 0.6234817813765182, + "grad_norm": 0.8069478254920203, + "learning_rate": 6.194331983805668e-06, + "loss": 2.5248, + "step": 154 + }, + { + "epoch": 0.6275303643724697, + "grad_norm": 0.702872317531758, + "learning_rate": 6.234817813765183e-06, + "loss": 2.2786, + "step": 155 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 1.1902510486781737, + "learning_rate": 6.275303643724697e-06, + "loss": 2.564, + "step": 156 + }, + { + "epoch": 0.6356275303643725, + "grad_norm": 0.7322358696471963, + "learning_rate": 6.31578947368421e-06, + "loss": 2.2575, + "step": 157 + }, + { + "epoch": 0.6396761133603239, + "grad_norm": 0.827272619073328, + "learning_rate": 6.356275303643725e-06, + "loss": 2.4085, + "step": 158 + }, + { + "epoch": 0.6437246963562753, + "grad_norm": 0.844449245612401, + "learning_rate": 6.39676113360324e-06, + "loss": 2.3392, + "step": 159 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 0.6963954379010507, + "learning_rate": 6.437246963562754e-06, + "loss": 2.3474, + "step": 160 + }, + { + "epoch": 0.6518218623481782, + "grad_norm": 1.0062158283533227, + "learning_rate": 6.4777327935222675e-06, + "loss": 2.206, + "step": 161 + }, + { + "epoch": 0.6558704453441295, + "grad_norm": 0.7010434692271018, + "learning_rate": 6.518218623481782e-06, + "loss": 2.4407, + "step": 162 + }, + { + "epoch": 0.659919028340081, + "grad_norm": 0.8546299950775236, + "learning_rate": 6.558704453441296e-06, + "loss": 2.3308, + "step": 163 + }, + { + "epoch": 0.6639676113360324, + "grad_norm": 0.9160069550133176, + "learning_rate": 6.599190283400811e-06, + "loss": 2.2799, + "step": 164 + }, + { + "epoch": 0.6680161943319838, + "grad_norm": 0.6991934828570997, + "learning_rate": 6.639676113360325e-06, + "loss": 2.3277, + "step": 165 + }, + { + "epoch": 0.6720647773279352, + "grad_norm": 2.441952914795693, + "learning_rate": 6.6801619433198385e-06, + "loss": 2.2357, + "step": 166 + }, + { + "epoch": 0.6761133603238867, + "grad_norm": 0.7134946099061733, + "learning_rate": 6.720647773279353e-06, + "loss": 2.1807, + "step": 167 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 0.7920123504029117, + "learning_rate": 6.761133603238867e-06, + "loss": 2.4623, + "step": 168 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.7987244705898385, + "learning_rate": 6.801619433198381e-06, + "loss": 2.2289, + "step": 169 + }, + { + "epoch": 0.6882591093117408, + "grad_norm": 0.8092206406250949, + "learning_rate": 6.842105263157896e-06, + "loss": 2.3704, + "step": 170 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.7440145606342271, + "learning_rate": 6.882591093117409e-06, + "loss": 2.3322, + "step": 171 + }, + { + "epoch": 0.6963562753036437, + "grad_norm": 0.704685785309606, + "learning_rate": 6.923076923076923e-06, + "loss": 2.1067, + "step": 172 + }, + { + "epoch": 0.7004048582995951, + "grad_norm": 0.8716057180507851, + "learning_rate": 6.963562753036438e-06, + "loss": 2.6915, + "step": 173 + }, + { + "epoch": 0.7044534412955465, + "grad_norm": 0.8610302596466904, + "learning_rate": 7.004048582995951e-06, + "loss": 2.3607, + "step": 174 + }, + { + "epoch": 0.708502024291498, + "grad_norm": 0.7454341645101108, + "learning_rate": 7.044534412955466e-06, + "loss": 2.0946, + "step": 175 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 0.775526558923258, + "learning_rate": 7.0850202429149805e-06, + "loss": 2.2197, + "step": 176 + }, + { + "epoch": 0.7165991902834008, + "grad_norm": 0.7425363416700347, + "learning_rate": 7.125506072874494e-06, + "loss": 2.2515, + "step": 177 + }, + { + "epoch": 0.7206477732793523, + "grad_norm": 0.799480261879121, + "learning_rate": 7.165991902834008e-06, + "loss": 2.2984, + "step": 178 + }, + { + "epoch": 0.7246963562753036, + "grad_norm": 1.208911299168472, + "learning_rate": 7.206477732793523e-06, + "loss": 2.3498, + "step": 179 + }, + { + "epoch": 0.728744939271255, + "grad_norm": 0.8451843361875137, + "learning_rate": 7.246963562753037e-06, + "loss": 2.3922, + "step": 180 + }, + { + "epoch": 0.7327935222672065, + "grad_norm": 0.6688748588442022, + "learning_rate": 7.2874493927125516e-06, + "loss": 2.2572, + "step": 181 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 1.1693138233285796, + "learning_rate": 7.327935222672065e-06, + "loss": 2.327, + "step": 182 + }, + { + "epoch": 0.7408906882591093, + "grad_norm": 1.6904745941237547, + "learning_rate": 7.368421052631579e-06, + "loss": 2.8703, + "step": 183 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 0.8844949083017518, + "learning_rate": 7.408906882591094e-06, + "loss": 2.2888, + "step": 184 + }, + { + "epoch": 0.7489878542510121, + "grad_norm": 0.8858477106782153, + "learning_rate": 7.449392712550608e-06, + "loss": 2.2582, + "step": 185 + }, + { + "epoch": 0.7530364372469636, + "grad_norm": 0.7394352987608678, + "learning_rate": 7.489878542510122e-06, + "loss": 2.0775, + "step": 186 + }, + { + "epoch": 0.757085020242915, + "grad_norm": 0.8834206013583122, + "learning_rate": 7.5303643724696364e-06, + "loss": 2.2682, + "step": 187 + }, + { + "epoch": 0.7611336032388664, + "grad_norm": 6.250751086281045, + "learning_rate": 7.570850202429151e-06, + "loss": 3.2512, + "step": 188 + }, + { + "epoch": 0.7651821862348178, + "grad_norm": 35.543626516502854, + "learning_rate": 7.611336032388664e-06, + "loss": 3.2673, + "step": 189 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 4.671464673421441, + "learning_rate": 7.651821862348178e-06, + "loss": 3.288, + "step": 190 + }, + { + "epoch": 0.7732793522267206, + "grad_norm": 0.8467043403003462, + "learning_rate": 7.692307692307694e-06, + "loss": 2.3525, + "step": 191 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 0.7553553742503454, + "learning_rate": 7.732793522267207e-06, + "loss": 2.4147, + "step": 192 + }, + { + "epoch": 0.7813765182186235, + "grad_norm": 0.6722184689731728, + "learning_rate": 7.773279352226721e-06, + "loss": 2.4408, + "step": 193 + }, + { + "epoch": 0.7854251012145749, + "grad_norm": 0.8742278117345931, + "learning_rate": 7.813765182186235e-06, + "loss": 2.2427, + "step": 194 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.7018298382516639, + "learning_rate": 7.854251012145749e-06, + "loss": 2.1401, + "step": 195 + }, + { + "epoch": 0.7935222672064778, + "grad_norm": 0.8441291024867053, + "learning_rate": 7.894736842105265e-06, + "loss": 2.417, + "step": 196 + }, + { + "epoch": 0.7975708502024291, + "grad_norm": 0.8440780587728888, + "learning_rate": 7.935222672064778e-06, + "loss": 2.343, + "step": 197 + }, + { + "epoch": 0.8016194331983806, + "grad_norm": 0.7817852912155946, + "learning_rate": 7.975708502024292e-06, + "loss": 2.0718, + "step": 198 + }, + { + "epoch": 0.805668016194332, + "grad_norm": 0.8173811480736421, + "learning_rate": 8.016194331983806e-06, + "loss": 1.9574, + "step": 199 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 0.9130733429115842, + "learning_rate": 8.056680161943322e-06, + "loss": 2.1815, + "step": 200 + }, + { + "epoch": 0.8137651821862348, + "grad_norm": 0.9847086103025836, + "learning_rate": 8.097165991902834e-06, + "loss": 2.3515, + "step": 201 + }, + { + "epoch": 0.8178137651821862, + "grad_norm": 0.8676876881551969, + "learning_rate": 8.13765182186235e-06, + "loss": 2.0846, + "step": 202 + }, + { + "epoch": 0.8218623481781376, + "grad_norm": 13.90144045255743, + "learning_rate": 8.178137651821862e-06, + "loss": 2.901, + "step": 203 + }, + { + "epoch": 0.8259109311740891, + "grad_norm": 26.964637613541246, + "learning_rate": 8.218623481781377e-06, + "loss": 4.9217, + "step": 204 + }, + { + "epoch": 0.8299595141700404, + "grad_norm": 0.9450475296548486, + "learning_rate": 8.259109311740891e-06, + "loss": 2.213, + "step": 205 + }, + { + "epoch": 0.8340080971659919, + "grad_norm": 0.8251626027353501, + "learning_rate": 8.299595141700405e-06, + "loss": 2.1265, + "step": 206 + }, + { + "epoch": 0.8380566801619433, + "grad_norm": 1.5637444134794973, + "learning_rate": 8.340080971659919e-06, + "loss": 2.1168, + "step": 207 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.8572609413241875, + "learning_rate": 8.380566801619434e-06, + "loss": 2.2021, + "step": 208 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.8829526183041908, + "learning_rate": 8.421052631578948e-06, + "loss": 2.1197, + "step": 209 + }, + { + "epoch": 0.8502024291497976, + "grad_norm": 0.8230040936414714, + "learning_rate": 8.461538461538462e-06, + "loss": 2.1389, + "step": 210 + }, + { + "epoch": 0.854251012145749, + "grad_norm": 1.0630722291016348, + "learning_rate": 8.502024291497976e-06, + "loss": 2.2071, + "step": 211 + }, + { + "epoch": 0.8582995951417004, + "grad_norm": 0.8285650816893187, + "learning_rate": 8.54251012145749e-06, + "loss": 2.1278, + "step": 212 + }, + { + "epoch": 0.8623481781376519, + "grad_norm": 0.9374104368567024, + "learning_rate": 8.582995951417005e-06, + "loss": 2.2602, + "step": 213 + }, + { + "epoch": 0.8663967611336032, + "grad_norm": 0.9292432454800617, + "learning_rate": 8.62348178137652e-06, + "loss": 2.2139, + "step": 214 + }, + { + "epoch": 0.8704453441295547, + "grad_norm": 1.102816596900189, + "learning_rate": 8.663967611336033e-06, + "loss": 2.6954, + "step": 215 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 1.0693734533760941, + "learning_rate": 8.704453441295547e-06, + "loss": 2.6307, + "step": 216 + }, + { + "epoch": 0.8785425101214575, + "grad_norm": 0.9576307746487195, + "learning_rate": 8.744939271255063e-06, + "loss": 2.3637, + "step": 217 + }, + { + "epoch": 0.8825910931174089, + "grad_norm": 0.9705930148144204, + "learning_rate": 8.785425101214575e-06, + "loss": 2.2346, + "step": 218 + }, + { + "epoch": 0.8866396761133604, + "grad_norm": 1.0504776994181708, + "learning_rate": 8.82591093117409e-06, + "loss": 1.8973, + "step": 219 + }, + { + "epoch": 0.8906882591093117, + "grad_norm": 0.8931928814405187, + "learning_rate": 8.866396761133604e-06, + "loss": 2.2742, + "step": 220 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.9688347208506803, + "learning_rate": 8.906882591093118e-06, + "loss": 2.2952, + "step": 221 + }, + { + "epoch": 0.8987854251012146, + "grad_norm": 0.978996274596435, + "learning_rate": 8.947368421052632e-06, + "loss": 2.0332, + "step": 222 + }, + { + "epoch": 0.902834008097166, + "grad_norm": 0.9073798024023706, + "learning_rate": 8.987854251012147e-06, + "loss": 2.0714, + "step": 223 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 1.1581613082581128, + "learning_rate": 9.02834008097166e-06, + "loss": 2.2157, + "step": 224 + }, + { + "epoch": 0.9109311740890689, + "grad_norm": 1.0884120135655109, + "learning_rate": 9.068825910931175e-06, + "loss": 1.7915, + "step": 225 + }, + { + "epoch": 0.9149797570850202, + "grad_norm": 0.9581672716343882, + "learning_rate": 9.109311740890689e-06, + "loss": 2.0722, + "step": 226 + }, + { + "epoch": 0.9190283400809717, + "grad_norm": 0.9523432975820123, + "learning_rate": 9.149797570850203e-06, + "loss": 2.0351, + "step": 227 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.9395262500295037, + "learning_rate": 9.190283400809717e-06, + "loss": 2.1823, + "step": 228 + }, + { + "epoch": 0.9271255060728745, + "grad_norm": 1.0734663585541728, + "learning_rate": 9.230769230769232e-06, + "loss": 2.2329, + "step": 229 + }, + { + "epoch": 0.9311740890688259, + "grad_norm": 5.915661456573777, + "learning_rate": 9.271255060728746e-06, + "loss": 2.142, + "step": 230 + }, + { + "epoch": 0.9352226720647774, + "grad_norm": 0.943964635554494, + "learning_rate": 9.31174089068826e-06, + "loss": 2.0151, + "step": 231 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 0.9400321772267921, + "learning_rate": 9.352226720647774e-06, + "loss": 1.9453, + "step": 232 + }, + { + "epoch": 0.9433198380566802, + "grad_norm": 1.0803744575815664, + "learning_rate": 9.392712550607288e-06, + "loss": 2.2879, + "step": 233 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 1.1375116889631114, + "learning_rate": 9.433198380566803e-06, + "loss": 1.997, + "step": 234 + }, + { + "epoch": 0.951417004048583, + "grad_norm": 1.0484948139162147, + "learning_rate": 9.473684210526315e-06, + "loss": 2.0557, + "step": 235 + }, + { + "epoch": 0.9554655870445344, + "grad_norm": 1.9953282124950078, + "learning_rate": 9.514170040485831e-06, + "loss": 2.2939, + "step": 236 + }, + { + "epoch": 0.9595141700404858, + "grad_norm": 0.976191957030197, + "learning_rate": 9.554655870445345e-06, + "loss": 2.0733, + "step": 237 + }, + { + "epoch": 0.9635627530364372, + "grad_norm": 1.2563869839657487, + "learning_rate": 9.595141700404859e-06, + "loss": 2.0464, + "step": 238 + }, + { + "epoch": 0.9676113360323887, + "grad_norm": 1.5608940397030466, + "learning_rate": 9.635627530364373e-06, + "loss": 2.336, + "step": 239 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 1.3591514491532213, + "learning_rate": 9.676113360323888e-06, + "loss": 2.3022, + "step": 240 + }, + { + "epoch": 0.9757085020242915, + "grad_norm": 0.9384697642414853, + "learning_rate": 9.7165991902834e-06, + "loss": 2.0917, + "step": 241 + }, + { + "epoch": 0.979757085020243, + "grad_norm": 1.0921517070072044, + "learning_rate": 9.757085020242916e-06, + "loss": 2.2454, + "step": 242 + }, + { + "epoch": 0.9838056680161943, + "grad_norm": 1.0952417249590038, + "learning_rate": 9.79757085020243e-06, + "loss": 2.2731, + "step": 243 + }, + { + "epoch": 0.9878542510121457, + "grad_norm": 1.004948368911197, + "learning_rate": 9.838056680161944e-06, + "loss": 2.0318, + "step": 244 + }, + { + "epoch": 0.9919028340080972, + "grad_norm": 0.9149897248279167, + "learning_rate": 9.878542510121458e-06, + "loss": 2.0005, + "step": 245 + }, + { + "epoch": 0.9959514170040485, + "grad_norm": 0.8508821706595309, + "learning_rate": 9.919028340080973e-06, + "loss": 2.2101, + "step": 246 + }, + { + "epoch": 1.0, + "grad_norm": 1.0244113302231659, + "learning_rate": 9.959514170040487e-06, + "loss": 2.0861, + "step": 247 + }, + { + "epoch": 1.0040485829959513, + "grad_norm": 0.9985250389875123, + "learning_rate": 1e-05, + "loss": 2.1654, + "step": 248 + }, + { + "epoch": 1.008097165991903, + "grad_norm": 1.5212147724237604, + "learning_rate": 9.999995007009308e-06, + "loss": 2.3841, + "step": 249 + }, + { + "epoch": 1.0121457489878543, + "grad_norm": 1.5612489351031709, + "learning_rate": 9.999980028047207e-06, + "loss": 2.2013, + "step": 250 + }, + { + "epoch": 1.0161943319838056, + "grad_norm": 1.3355032190827423, + "learning_rate": 9.99995506314361e-06, + "loss": 2.3109, + "step": 251 + }, + { + "epoch": 1.0202429149797572, + "grad_norm": 1.309995468445311, + "learning_rate": 9.999920112348379e-06, + "loss": 2.5018, + "step": 252 + }, + { + "epoch": 1.0242914979757085, + "grad_norm": 1.4582415698006528, + "learning_rate": 9.999875175731316e-06, + "loss": 2.4387, + "step": 253 + }, + { + "epoch": 1.0283400809716599, + "grad_norm": 1.2959671971401512, + "learning_rate": 9.99982025338217e-06, + "loss": 2.0271, + "step": 254 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 1.3702661061884107, + "learning_rate": 9.999755345410628e-06, + "loss": 2.1942, + "step": 255 + }, + { + "epoch": 1.0364372469635628, + "grad_norm": 1.2343807344186972, + "learning_rate": 9.999680451946327e-06, + "loss": 2.3802, + "step": 256 + }, + { + "epoch": 1.040485829959514, + "grad_norm": 1.2422842542141688, + "learning_rate": 9.999595573138845e-06, + "loss": 2.1737, + "step": 257 + }, + { + "epoch": 1.0445344129554657, + "grad_norm": 1.0535455017417064, + "learning_rate": 9.9995007091577e-06, + "loss": 2.1892, + "step": 258 + }, + { + "epoch": 1.048582995951417, + "grad_norm": 1.1326643708775719, + "learning_rate": 9.999395860192354e-06, + "loss": 2.165, + "step": 259 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 1.1512147523566951, + "learning_rate": 9.99928102645221e-06, + "loss": 2.4136, + "step": 260 + }, + { + "epoch": 1.05668016194332, + "grad_norm": 1.161431041066393, + "learning_rate": 9.999156208166614e-06, + "loss": 2.2649, + "step": 261 + }, + { + "epoch": 1.0607287449392713, + "grad_norm": 1.0550067630684001, + "learning_rate": 9.999021405584855e-06, + "loss": 2.2776, + "step": 262 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 1.2456078968374804, + "learning_rate": 9.99887661897616e-06, + "loss": 2.2937, + "step": 263 + }, + { + "epoch": 1.0688259109311742, + "grad_norm": 2.6565909174287934, + "learning_rate": 9.998721848629691e-06, + "loss": 2.3373, + "step": 264 + }, + { + "epoch": 1.0728744939271255, + "grad_norm": 1.2585354952683687, + "learning_rate": 9.99855709485456e-06, + "loss": 2.1755, + "step": 265 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 1.0397633573741487, + "learning_rate": 9.99838235797981e-06, + "loss": 2.1224, + "step": 266 + }, + { + "epoch": 1.0809716599190284, + "grad_norm": 1.3490485543349722, + "learning_rate": 9.998197638354428e-06, + "loss": 2.162, + "step": 267 + }, + { + "epoch": 1.0850202429149798, + "grad_norm": 0.9779246835555004, + "learning_rate": 9.998002936347334e-06, + "loss": 2.0674, + "step": 268 + }, + { + "epoch": 1.0890688259109311, + "grad_norm": 1.326338728002689, + "learning_rate": 9.997798252347382e-06, + "loss": 2.1639, + "step": 269 + }, + { + "epoch": 1.0931174089068827, + "grad_norm": 1.0363012993300713, + "learning_rate": 9.99758358676337e-06, + "loss": 2.2088, + "step": 270 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 1.0931184449284037, + "learning_rate": 9.99735894002403e-06, + "loss": 1.9417, + "step": 271 + }, + { + "epoch": 1.1012145748987854, + "grad_norm": 1.1142050270090365, + "learning_rate": 9.99712431257802e-06, + "loss": 2.1229, + "step": 272 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 1.1058458560003002, + "learning_rate": 9.99687970489394e-06, + "loss": 2.147, + "step": 273 + }, + { + "epoch": 1.1093117408906883, + "grad_norm": 1.1507827310584715, + "learning_rate": 9.996625117460319e-06, + "loss": 2.0305, + "step": 274 + }, + { + "epoch": 1.1133603238866396, + "grad_norm": 1.4399534822311415, + "learning_rate": 9.996360550785619e-06, + "loss": 1.993, + "step": 275 + }, + { + "epoch": 1.117408906882591, + "grad_norm": 1.3360646827911495, + "learning_rate": 9.996086005398228e-06, + "loss": 1.9789, + "step": 276 + }, + { + "epoch": 1.1214574898785425, + "grad_norm": 1.1287606232609018, + "learning_rate": 9.995801481846474e-06, + "loss": 1.9362, + "step": 277 + }, + { + "epoch": 1.125506072874494, + "grad_norm": 1.0926872380366626, + "learning_rate": 9.9955069806986e-06, + "loss": 1.8981, + "step": 278 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 1.225113996229143, + "learning_rate": 9.995202502542785e-06, + "loss": 1.877, + "step": 279 + }, + { + "epoch": 1.1336032388663968, + "grad_norm": 1.350566519940966, + "learning_rate": 9.99488804798713e-06, + "loss": 2.1812, + "step": 280 + }, + { + "epoch": 1.1376518218623481, + "grad_norm": 1.3946048118439773, + "learning_rate": 9.994563617659665e-06, + "loss": 2.0952, + "step": 281 + }, + { + "epoch": 1.1417004048582995, + "grad_norm": 1.016854167145539, + "learning_rate": 9.99422921220834e-06, + "loss": 1.7897, + "step": 282 + }, + { + "epoch": 1.145748987854251, + "grad_norm": 1.1675202565627227, + "learning_rate": 9.993884832301029e-06, + "loss": 2.1832, + "step": 283 + }, + { + "epoch": 1.1497975708502024, + "grad_norm": 1.1052537876752062, + "learning_rate": 9.993530478625524e-06, + "loss": 2.0419, + "step": 284 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0339091939503424, + "learning_rate": 9.99316615188954e-06, + "loss": 2.1765, + "step": 285 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 1.224235640342616, + "learning_rate": 9.992791852820709e-06, + "loss": 2.414, + "step": 286 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 1.1077938277922803, + "learning_rate": 9.992407582166582e-06, + "loss": 2.0729, + "step": 287 + }, + { + "epoch": 1.165991902834008, + "grad_norm": 1.1047832453065312, + "learning_rate": 9.99201334069462e-06, + "loss": 2.0816, + "step": 288 + }, + { + "epoch": 1.1700404858299596, + "grad_norm": 1.020340791924455, + "learning_rate": 9.991609129192202e-06, + "loss": 2.4242, + "step": 289 + }, + { + "epoch": 1.174089068825911, + "grad_norm": 1.0597565636193305, + "learning_rate": 9.991194948466615e-06, + "loss": 1.9546, + "step": 290 + }, + { + "epoch": 1.1781376518218623, + "grad_norm": 2.733652108939615, + "learning_rate": 9.990770799345064e-06, + "loss": 2.0891, + "step": 291 + }, + { + "epoch": 1.1821862348178138, + "grad_norm": 1.06820787268932, + "learning_rate": 9.990336682674656e-06, + "loss": 1.8523, + "step": 292 + }, + { + "epoch": 1.1862348178137652, + "grad_norm": 2.087421429190754, + "learning_rate": 9.989892599322404e-06, + "loss": 2.0252, + "step": 293 + }, + { + "epoch": 1.1902834008097165, + "grad_norm": 1.0884298591172652, + "learning_rate": 9.989438550175235e-06, + "loss": 2.094, + "step": 294 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 1.4465924376774404, + "learning_rate": 9.98897453613997e-06, + "loss": 2.2522, + "step": 295 + }, + { + "epoch": 1.1983805668016194, + "grad_norm": 1.2561153181877684, + "learning_rate": 9.988500558143337e-06, + "loss": 2.3174, + "step": 296 + }, + { + "epoch": 1.2024291497975708, + "grad_norm": 1.299592783957394, + "learning_rate": 9.988016617131966e-06, + "loss": 2.0626, + "step": 297 + }, + { + "epoch": 1.2064777327935223, + "grad_norm": 1.616312765069768, + "learning_rate": 9.987522714072377e-06, + "loss": 2.332, + "step": 298 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 1.1673730449379247, + "learning_rate": 9.987018849950996e-06, + "loss": 2.3944, + "step": 299 + }, + { + "epoch": 1.214574898785425, + "grad_norm": 1.143398053052611, + "learning_rate": 9.986505025774137e-06, + "loss": 2.1948, + "step": 300 + }, + { + "epoch": 1.2186234817813766, + "grad_norm": 1.097402992490867, + "learning_rate": 9.985981242568009e-06, + "loss": 2.0261, + "step": 301 + }, + { + "epoch": 1.222672064777328, + "grad_norm": 1.1862462194607237, + "learning_rate": 9.985447501378706e-06, + "loss": 2.0268, + "step": 302 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 1.1867953576661743, + "learning_rate": 9.984903803272216e-06, + "loss": 2.0609, + "step": 303 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 1.160233224133256, + "learning_rate": 9.984350149334415e-06, + "loss": 2.118, + "step": 304 + }, + { + "epoch": 1.2348178137651822, + "grad_norm": 1.1580496833430431, + "learning_rate": 9.983786540671052e-06, + "loss": 2.2939, + "step": 305 + }, + { + "epoch": 1.2388663967611335, + "grad_norm": 1.1904466983631679, + "learning_rate": 9.983212978407767e-06, + "loss": 2.2554, + "step": 306 + }, + { + "epoch": 1.242914979757085, + "grad_norm": 1.191066075711238, + "learning_rate": 9.982629463690075e-06, + "loss": 2.2252, + "step": 307 + }, + { + "epoch": 1.2469635627530364, + "grad_norm": 0.9748723838702108, + "learning_rate": 9.982035997683372e-06, + "loss": 2.0288, + "step": 308 + }, + { + "epoch": 1.2510121457489878, + "grad_norm": 1.0421752021046666, + "learning_rate": 9.981432581572925e-06, + "loss": 2.0528, + "step": 309 + }, + { + "epoch": 1.2550607287449393, + "grad_norm": 1.1354302953976132, + "learning_rate": 9.980819216563875e-06, + "loss": 2.1848, + "step": 310 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 1.1565556608606453, + "learning_rate": 9.980195903881231e-06, + "loss": 1.9964, + "step": 311 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 1.0637756069428104, + "learning_rate": 9.979562644769871e-06, + "loss": 1.8735, + "step": 312 + }, + { + "epoch": 1.2672064777327936, + "grad_norm": 1.0699259387542537, + "learning_rate": 9.978919440494538e-06, + "loss": 2.0595, + "step": 313 + }, + { + "epoch": 1.271255060728745, + "grad_norm": 1.1179452169818913, + "learning_rate": 9.978266292339838e-06, + "loss": 2.1342, + "step": 314 + }, + { + "epoch": 1.2753036437246963, + "grad_norm": 0.9851906694579183, + "learning_rate": 9.977603201610236e-06, + "loss": 2.0658, + "step": 315 + }, + { + "epoch": 1.2793522267206479, + "grad_norm": 1.664317835506444, + "learning_rate": 9.976930169630052e-06, + "loss": 2.1478, + "step": 316 + }, + { + "epoch": 1.2834008097165992, + "grad_norm": 2.1052363417173012, + "learning_rate": 9.976247197743465e-06, + "loss": 1.8522, + "step": 317 + }, + { + "epoch": 1.2874493927125505, + "grad_norm": 1.1846256759923113, + "learning_rate": 9.975554287314505e-06, + "loss": 1.9432, + "step": 318 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 1.138896431387234, + "learning_rate": 9.974851439727045e-06, + "loss": 1.8181, + "step": 319 + }, + { + "epoch": 1.2955465587044535, + "grad_norm": 1.153796269934686, + "learning_rate": 9.974138656384815e-06, + "loss": 2.1573, + "step": 320 + }, + { + "epoch": 1.2995951417004048, + "grad_norm": 1.703181471948063, + "learning_rate": 9.973415938711383e-06, + "loss": 2.1787, + "step": 321 + }, + { + "epoch": 1.3036437246963564, + "grad_norm": 1.7096036636558702, + "learning_rate": 9.972683288150155e-06, + "loss": 1.9479, + "step": 322 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.1866073546875906, + "learning_rate": 9.97194070616438e-06, + "loss": 1.9284, + "step": 323 + }, + { + "epoch": 1.311740890688259, + "grad_norm": 1.0952591943942271, + "learning_rate": 9.971188194237141e-06, + "loss": 1.9908, + "step": 324 + }, + { + "epoch": 1.3157894736842106, + "grad_norm": 1.5313235105110092, + "learning_rate": 9.97042575387135e-06, + "loss": 2.0365, + "step": 325 + }, + { + "epoch": 1.319838056680162, + "grad_norm": 1.2326037015549494, + "learning_rate": 9.969653386589749e-06, + "loss": 1.9016, + "step": 326 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 1.08612437072456, + "learning_rate": 9.968871093934908e-06, + "loss": 1.9295, + "step": 327 + }, + { + "epoch": 1.3279352226720649, + "grad_norm": 1.1765201682452633, + "learning_rate": 9.968078877469221e-06, + "loss": 1.9057, + "step": 328 + }, + { + "epoch": 1.3319838056680162, + "grad_norm": 1.1266840563836074, + "learning_rate": 9.967276738774897e-06, + "loss": 1.7933, + "step": 329 + }, + { + "epoch": 1.3360323886639676, + "grad_norm": 1.096241365913634, + "learning_rate": 9.966464679453969e-06, + "loss": 1.8225, + "step": 330 + }, + { + "epoch": 1.3400809716599191, + "grad_norm": 1.0190613068454424, + "learning_rate": 9.965642701128273e-06, + "loss": 1.7548, + "step": 331 + }, + { + "epoch": 1.3441295546558705, + "grad_norm": 1.045370042720153, + "learning_rate": 9.964810805439464e-06, + "loss": 1.8602, + "step": 332 + }, + { + "epoch": 1.3481781376518218, + "grad_norm": 1.2609434903119947, + "learning_rate": 9.963968994049e-06, + "loss": 2.0594, + "step": 333 + }, + { + "epoch": 1.3522267206477734, + "grad_norm": 2.6150970483606812, + "learning_rate": 9.963117268638147e-06, + "loss": 1.8496, + "step": 334 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 1.2099371136718209, + "learning_rate": 9.962255630907964e-06, + "loss": 1.6494, + "step": 335 + }, + { + "epoch": 1.360323886639676, + "grad_norm": 1.313765722576788, + "learning_rate": 9.961384082579311e-06, + "loss": 1.9562, + "step": 336 + }, + { + "epoch": 1.3643724696356276, + "grad_norm": 1.2172159882432991, + "learning_rate": 9.96050262539284e-06, + "loss": 2.0155, + "step": 337 + }, + { + "epoch": 1.368421052631579, + "grad_norm": 1.2586156100651915, + "learning_rate": 9.959611261108999e-06, + "loss": 1.9085, + "step": 338 + }, + { + "epoch": 1.3724696356275303, + "grad_norm": 1.5183212778349207, + "learning_rate": 9.958709991508013e-06, + "loss": 2.0875, + "step": 339 + }, + { + "epoch": 1.376518218623482, + "grad_norm": 1.1522560111562028, + "learning_rate": 9.957798818389894e-06, + "loss": 1.619, + "step": 340 + }, + { + "epoch": 1.3805668016194332, + "grad_norm": 1.1594845675041106, + "learning_rate": 9.956877743574437e-06, + "loss": 1.809, + "step": 341 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 1.1122066306670175, + "learning_rate": 9.955946768901207e-06, + "loss": 1.7047, + "step": 342 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 1.330314253280862, + "learning_rate": 9.955005896229543e-06, + "loss": 1.7574, + "step": 343 + }, + { + "epoch": 1.3927125506072875, + "grad_norm": 1.1715493987473338, + "learning_rate": 9.954055127438554e-06, + "loss": 1.903, + "step": 344 + }, + { + "epoch": 1.3967611336032388, + "grad_norm": 1.3791674988449036, + "learning_rate": 9.95309446442711e-06, + "loss": 1.7259, + "step": 345 + }, + { + "epoch": 1.4008097165991904, + "grad_norm": 1.1049829081327143, + "learning_rate": 9.952123909113842e-06, + "loss": 1.7903, + "step": 346 + }, + { + "epoch": 1.4048582995951417, + "grad_norm": 1.2032214776472194, + "learning_rate": 9.951143463437145e-06, + "loss": 1.8805, + "step": 347 + }, + { + "epoch": 1.408906882591093, + "grad_norm": 1.4430732870842997, + "learning_rate": 9.950153129355156e-06, + "loss": 1.963, + "step": 348 + }, + { + "epoch": 1.4129554655870447, + "grad_norm": 1.1510222292519288, + "learning_rate": 9.949152908845771e-06, + "loss": 1.8567, + "step": 349 + }, + { + "epoch": 1.417004048582996, + "grad_norm": 1.195578264117532, + "learning_rate": 9.948142803906623e-06, + "loss": 2.0649, + "step": 350 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 1.233691487377917, + "learning_rate": 9.947122816555091e-06, + "loss": 2.1272, + "step": 351 + }, + { + "epoch": 1.425101214574899, + "grad_norm": 1.1086448213277071, + "learning_rate": 9.94609294882829e-06, + "loss": 1.9559, + "step": 352 + }, + { + "epoch": 1.4291497975708503, + "grad_norm": 1.095236792272251, + "learning_rate": 9.94505320278307e-06, + "loss": 2.0925, + "step": 353 + }, + { + "epoch": 1.4331983805668016, + "grad_norm": 1.5358655904235856, + "learning_rate": 9.944003580496004e-06, + "loss": 2.1299, + "step": 354 + }, + { + "epoch": 1.4372469635627532, + "grad_norm": 4.618210545500014, + "learning_rate": 9.942944084063397e-06, + "loss": 1.906, + "step": 355 + }, + { + "epoch": 1.4412955465587045, + "grad_norm": 1.2771853507714968, + "learning_rate": 9.94187471560127e-06, + "loss": 1.8895, + "step": 356 + }, + { + "epoch": 1.4453441295546559, + "grad_norm": 1.503260525653169, + "learning_rate": 9.940795477245362e-06, + "loss": 2.123, + "step": 357 + }, + { + "epoch": 1.4493927125506074, + "grad_norm": 1.1357577615662766, + "learning_rate": 9.939706371151124e-06, + "loss": 1.9087, + "step": 358 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 1.3448821103990194, + "learning_rate": 9.938607399493714e-06, + "loss": 1.8989, + "step": 359 + }, + { + "epoch": 1.45748987854251, + "grad_norm": 1.3913310219583304, + "learning_rate": 9.937498564467993e-06, + "loss": 2.2799, + "step": 360 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 1.9605641433764716, + "learning_rate": 9.936379868288525e-06, + "loss": 2.5915, + "step": 361 + }, + { + "epoch": 1.465587044534413, + "grad_norm": 1.2844543412275256, + "learning_rate": 9.935251313189564e-06, + "loss": 2.1301, + "step": 362 + }, + { + "epoch": 1.4696356275303644, + "grad_norm": 1.034982029315575, + "learning_rate": 9.934112901425058e-06, + "loss": 2.0549, + "step": 363 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 1.204999735063322, + "learning_rate": 9.932964635268637e-06, + "loss": 1.9596, + "step": 364 + }, + { + "epoch": 1.4777327935222673, + "grad_norm": 1.286601988495976, + "learning_rate": 9.931806517013612e-06, + "loss": 2.0348, + "step": 365 + }, + { + "epoch": 1.4817813765182186, + "grad_norm": 0.9482600934112612, + "learning_rate": 9.930638548972976e-06, + "loss": 1.9226, + "step": 366 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 1.2527379198286719, + "learning_rate": 9.92946073347939e-06, + "loss": 1.9363, + "step": 367 + }, + { + "epoch": 1.4898785425101215, + "grad_norm": 1.416748811839403, + "learning_rate": 9.92827307288518e-06, + "loss": 1.8743, + "step": 368 + }, + { + "epoch": 1.4939271255060729, + "grad_norm": 1.4807677636442649, + "learning_rate": 9.927075569562342e-06, + "loss": 1.9204, + "step": 369 + }, + { + "epoch": 1.4979757085020242, + "grad_norm": 1.3869419977919077, + "learning_rate": 9.925868225902518e-06, + "loss": 1.8206, + "step": 370 + }, + { + "epoch": 1.5020242914979756, + "grad_norm": 1.1484019096824427, + "learning_rate": 9.924651044317017e-06, + "loss": 1.741, + "step": 371 + }, + { + "epoch": 1.5060728744939271, + "grad_norm": 1.33557569757452, + "learning_rate": 9.923424027236786e-06, + "loss": 2.0195, + "step": 372 + }, + { + "epoch": 1.5101214574898787, + "grad_norm": 1.3948710108814935, + "learning_rate": 9.922187177112422e-06, + "loss": 2.0682, + "step": 373 + }, + { + "epoch": 1.5141700404858298, + "grad_norm": 0.9670281862333157, + "learning_rate": 9.920940496414153e-06, + "loss": 2.0098, + "step": 374 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 1.1816940948972323, + "learning_rate": 9.919683987631849e-06, + "loss": 2.041, + "step": 375 + }, + { + "epoch": 1.522267206477733, + "grad_norm": 1.1912191018269882, + "learning_rate": 9.918417653275004e-06, + "loss": 1.9668, + "step": 376 + }, + { + "epoch": 1.526315789473684, + "grad_norm": 2.3568912806934783, + "learning_rate": 9.917141495872733e-06, + "loss": 1.737, + "step": 377 + }, + { + "epoch": 1.5303643724696356, + "grad_norm": 1.4730591126031292, + "learning_rate": 9.915855517973776e-06, + "loss": 1.8672, + "step": 378 + }, + { + "epoch": 1.5344129554655872, + "grad_norm": 1.5631199604094446, + "learning_rate": 9.914559722146483e-06, + "loss": 2.0038, + "step": 379 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 2.5148949693335014, + "learning_rate": 9.913254110978812e-06, + "loss": 2.0916, + "step": 380 + }, + { + "epoch": 1.54251012145749, + "grad_norm": 1.0936340454215232, + "learning_rate": 9.911938687078324e-06, + "loss": 1.9959, + "step": 381 + }, + { + "epoch": 1.5465587044534415, + "grad_norm": 9.59805118170954, + "learning_rate": 9.91061345307218e-06, + "loss": 2.6669, + "step": 382 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 5.341110768663029, + "learning_rate": 9.909278411607134e-06, + "loss": 2.7524, + "step": 383 + }, + { + "epoch": 1.5546558704453441, + "grad_norm": 6.319523626825805, + "learning_rate": 9.90793356534952e-06, + "loss": 3.2784, + "step": 384 + }, + { + "epoch": 1.5587044534412957, + "grad_norm": 1.1632747156326964, + "learning_rate": 9.906578916985267e-06, + "loss": 1.9441, + "step": 385 + }, + { + "epoch": 1.5627530364372468, + "grad_norm": 1.129320861281679, + "learning_rate": 9.90521446921987e-06, + "loss": 1.84, + "step": 386 + }, + { + "epoch": 1.5668016194331984, + "grad_norm": 1.0396625767769134, + "learning_rate": 9.9038402247784e-06, + "loss": 2.0999, + "step": 387 + }, + { + "epoch": 1.5708502024291497, + "grad_norm": 1.1109350507878293, + "learning_rate": 9.90245618640549e-06, + "loss": 1.7455, + "step": 388 + }, + { + "epoch": 1.574898785425101, + "grad_norm": 1.1573410708340344, + "learning_rate": 9.90106235686534e-06, + "loss": 2.1349, + "step": 389 + }, + { + "epoch": 1.5789473684210527, + "grad_norm": 1.0084157125260218, + "learning_rate": 9.8996587389417e-06, + "loss": 1.8406, + "step": 390 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 1.1571333441837306, + "learning_rate": 9.89824533543787e-06, + "loss": 2.1231, + "step": 391 + }, + { + "epoch": 1.5870445344129553, + "grad_norm": 1.0697948256338023, + "learning_rate": 9.896822149176695e-06, + "loss": 1.9727, + "step": 392 + }, + { + "epoch": 1.591093117408907, + "grad_norm": 1.1795302734430202, + "learning_rate": 9.895389183000557e-06, + "loss": 1.9829, + "step": 393 + }, + { + "epoch": 1.5951417004048583, + "grad_norm": 1.3378200533531102, + "learning_rate": 9.893946439771369e-06, + "loss": 1.648, + "step": 394 + }, + { + "epoch": 1.5991902834008096, + "grad_norm": 1.190232768067943, + "learning_rate": 9.892493922370575e-06, + "loss": 1.6858, + "step": 395 + }, + { + "epoch": 1.6032388663967612, + "grad_norm": 1.1458315074040415, + "learning_rate": 9.891031633699135e-06, + "loss": 1.8744, + "step": 396 + }, + { + "epoch": 1.6072874493927125, + "grad_norm": 1.1819017581575564, + "learning_rate": 9.88955957667753e-06, + "loss": 1.7732, + "step": 397 + }, + { + "epoch": 1.6113360323886639, + "grad_norm": 1.8565903989047288, + "learning_rate": 9.888077754245741e-06, + "loss": 2.0753, + "step": 398 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 1.0244971639990994, + "learning_rate": 9.886586169363267e-06, + "loss": 1.9333, + "step": 399 + }, + { + "epoch": 1.6194331983805668, + "grad_norm": 1.249918723327364, + "learning_rate": 9.885084825009085e-06, + "loss": 1.8167, + "step": 400 + }, + { + "epoch": 1.623481781376518, + "grad_norm": 1.379879581099796, + "learning_rate": 9.883573724181683e-06, + "loss": 2.1783, + "step": 401 + }, + { + "epoch": 1.6275303643724697, + "grad_norm": 1.0714251364756116, + "learning_rate": 9.882052869899024e-06, + "loss": 1.9676, + "step": 402 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 1.2237579067545878, + "learning_rate": 9.880522265198548e-06, + "loss": 2.154, + "step": 403 + }, + { + "epoch": 1.6356275303643724, + "grad_norm": 1.0681493200255976, + "learning_rate": 9.878981913137178e-06, + "loss": 1.8629, + "step": 404 + }, + { + "epoch": 1.639676113360324, + "grad_norm": 1.213978261543208, + "learning_rate": 9.877431816791299e-06, + "loss": 2.0544, + "step": 405 + }, + { + "epoch": 1.6437246963562753, + "grad_norm": 1.0906406926843764, + "learning_rate": 9.875871979256754e-06, + "loss": 2.0126, + "step": 406 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 1.1548847276751324, + "learning_rate": 9.87430240364885e-06, + "loss": 1.9896, + "step": 407 + }, + { + "epoch": 1.6518218623481782, + "grad_norm": 1.1007484969249457, + "learning_rate": 9.872723093102332e-06, + "loss": 1.8537, + "step": 408 + }, + { + "epoch": 1.6558704453441295, + "grad_norm": 1.4626798707839297, + "learning_rate": 9.871134050771398e-06, + "loss": 2.0636, + "step": 409 + }, + { + "epoch": 1.6599190283400809, + "grad_norm": 1.4362925135326843, + "learning_rate": 9.869535279829674e-06, + "loss": 1.892, + "step": 410 + }, + { + "epoch": 1.6639676113360324, + "grad_norm": 1.1158035130218342, + "learning_rate": 9.867926783470221e-06, + "loss": 2.0106, + "step": 411 + }, + { + "epoch": 1.6680161943319838, + "grad_norm": 1.094342494438384, + "learning_rate": 9.866308564905523e-06, + "loss": 2.0453, + "step": 412 + }, + { + "epoch": 1.6720647773279351, + "grad_norm": 1.0432966613184569, + "learning_rate": 9.864680627367476e-06, + "loss": 1.9541, + "step": 413 + }, + { + "epoch": 1.6761133603238867, + "grad_norm": 1.2646590113938572, + "learning_rate": 9.863042974107395e-06, + "loss": 1.9078, + "step": 414 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 1.4143613333940679, + "learning_rate": 9.861395608395993e-06, + "loss": 2.0498, + "step": 415 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 1.1227780591009553, + "learning_rate": 9.859738533523384e-06, + "loss": 1.8425, + "step": 416 + }, + { + "epoch": 1.688259109311741, + "grad_norm": 1.1478310296573677, + "learning_rate": 9.85807175279907e-06, + "loss": 1.9961, + "step": 417 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 1.1555612172711482, + "learning_rate": 9.856395269551941e-06, + "loss": 1.9982, + "step": 418 + }, + { + "epoch": 1.6963562753036436, + "grad_norm": 1.2453555718552303, + "learning_rate": 9.854709087130261e-06, + "loss": 1.8074, + "step": 419 + }, + { + "epoch": 1.7004048582995952, + "grad_norm": 1.3445248996792332, + "learning_rate": 9.85301320890167e-06, + "loss": 2.315, + "step": 420 + }, + { + "epoch": 1.7044534412955465, + "grad_norm": 1.37583724829167, + "learning_rate": 9.851307638253167e-06, + "loss": 2.0698, + "step": 421 + }, + { + "epoch": 1.708502024291498, + "grad_norm": 1.4100704184587762, + "learning_rate": 9.849592378591113e-06, + "loss": 1.7238, + "step": 422 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 1.2265807736330994, + "learning_rate": 9.847867433341218e-06, + "loss": 1.881, + "step": 423 + }, + { + "epoch": 1.7165991902834008, + "grad_norm": 1.192372006539784, + "learning_rate": 9.846132805948534e-06, + "loss": 1.9658, + "step": 424 + }, + { + "epoch": 1.7206477732793521, + "grad_norm": 1.307546713268623, + "learning_rate": 9.844388499877457e-06, + "loss": 1.873, + "step": 425 + }, + { + "epoch": 1.7246963562753037, + "grad_norm": 1.382722813051471, + "learning_rate": 9.842634518611705e-06, + "loss": 1.9664, + "step": 426 + }, + { + "epoch": 1.728744939271255, + "grad_norm": 1.4179302059943903, + "learning_rate": 9.840870865654323e-06, + "loss": 2.1073, + "step": 427 + }, + { + "epoch": 1.7327935222672064, + "grad_norm": 1.0508460965436048, + "learning_rate": 9.839097544527674e-06, + "loss": 1.9957, + "step": 428 + }, + { + "epoch": 1.736842105263158, + "grad_norm": 1.239601761065164, + "learning_rate": 9.837314558773427e-06, + "loss": 2.0381, + "step": 429 + }, + { + "epoch": 1.7408906882591093, + "grad_norm": 2.1485433652175137, + "learning_rate": 9.835521911952554e-06, + "loss": 2.6976, + "step": 430 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 1.2416619753926275, + "learning_rate": 9.833719607645325e-06, + "loss": 2.0715, + "step": 431 + }, + { + "epoch": 1.7489878542510122, + "grad_norm": 1.2591779562696075, + "learning_rate": 9.831907649451291e-06, + "loss": 1.9002, + "step": 432 + }, + { + "epoch": 1.7530364372469636, + "grad_norm": 1.1535891547143164, + "learning_rate": 9.830086040989294e-06, + "loss": 1.7871, + "step": 433 + }, + { + "epoch": 1.757085020242915, + "grad_norm": 1.1923358702044, + "learning_rate": 9.82825478589744e-06, + "loss": 1.9962, + "step": 434 + }, + { + "epoch": 1.7611336032388665, + "grad_norm": 4.275347299758622, + "learning_rate": 9.826413887833103e-06, + "loss": 2.9222, + "step": 435 + }, + { + "epoch": 1.7651821862348178, + "grad_norm": 4.287598045967039, + "learning_rate": 9.824563350472922e-06, + "loss": 2.8461, + "step": 436 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 10.935868536450831, + "learning_rate": 9.822703177512783e-06, + "loss": 2.7384, + "step": 437 + }, + { + "epoch": 1.7732793522267207, + "grad_norm": 1.3409883266265459, + "learning_rate": 9.820833372667813e-06, + "loss": 1.9939, + "step": 438 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 1.3613081112789813, + "learning_rate": 9.818953939672382e-06, + "loss": 2.1821, + "step": 439 + }, + { + "epoch": 1.7813765182186234, + "grad_norm": 1.2675875076339627, + "learning_rate": 9.817064882280085e-06, + "loss": 2.2096, + "step": 440 + }, + { + "epoch": 1.785425101214575, + "grad_norm": 1.1133761183439654, + "learning_rate": 9.815166204263743e-06, + "loss": 2.0038, + "step": 441 + }, + { + "epoch": 1.7894736842105263, + "grad_norm": 1.0606754044873359, + "learning_rate": 9.813257909415384e-06, + "loss": 1.887, + "step": 442 + }, + { + "epoch": 1.7935222672064777, + "grad_norm": 1.2526447757224037, + "learning_rate": 9.811340001546252e-06, + "loss": 2.0549, + "step": 443 + }, + { + "epoch": 1.7975708502024292, + "grad_norm": 1.1262042906691425, + "learning_rate": 9.809412484486785e-06, + "loss": 2.077, + "step": 444 + }, + { + "epoch": 1.8016194331983806, + "grad_norm": 1.155022921046038, + "learning_rate": 9.80747536208661e-06, + "loss": 1.8171, + "step": 445 + }, + { + "epoch": 1.805668016194332, + "grad_norm": 1.1470501457250857, + "learning_rate": 9.805528638214543e-06, + "loss": 1.709, + "step": 446 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 1.254871859778204, + "learning_rate": 9.803572316758573e-06, + "loss": 2.005, + "step": 447 + }, + { + "epoch": 1.8137651821862348, + "grad_norm": 1.4428684006978485, + "learning_rate": 9.801606401625857e-06, + "loss": 2.0437, + "step": 448 + }, + { + "epoch": 1.8178137651821862, + "grad_norm": 1.1372709832560302, + "learning_rate": 9.799630896742716e-06, + "loss": 1.8053, + "step": 449 + }, + { + "epoch": 1.8218623481781377, + "grad_norm": 7.867540851479705, + "learning_rate": 9.797645806054617e-06, + "loss": 2.6057, + "step": 450 + }, + { + "epoch": 1.825910931174089, + "grad_norm": 17.828898730946783, + "learning_rate": 9.79565113352618e-06, + "loss": 4.1742, + "step": 451 + }, + { + "epoch": 1.8299595141700404, + "grad_norm": 1.3323533085958537, + "learning_rate": 9.793646883141155e-06, + "loss": 1.9001, + "step": 452 + }, + { + "epoch": 1.834008097165992, + "grad_norm": 1.2550944955882024, + "learning_rate": 9.791633058902424e-06, + "loss": 1.7789, + "step": 453 + }, + { + "epoch": 1.8380566801619433, + "grad_norm": 1.2515953723091495, + "learning_rate": 9.789609664831988e-06, + "loss": 1.8425, + "step": 454 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 1.1650016476570495, + "learning_rate": 9.787576704970965e-06, + "loss": 1.8701, + "step": 455 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 1.1568290050770706, + "learning_rate": 9.785534183379571e-06, + "loss": 1.8468, + "step": 456 + }, + { + "epoch": 1.8502024291497976, + "grad_norm": 1.1529373182216824, + "learning_rate": 9.783482104137127e-06, + "loss": 1.8772, + "step": 457 + }, + { + "epoch": 1.854251012145749, + "grad_norm": 1.3000516637827273, + "learning_rate": 9.781420471342035e-06, + "loss": 1.9477, + "step": 458 + }, + { + "epoch": 1.8582995951417005, + "grad_norm": 1.0258650008659411, + "learning_rate": 9.779349289111781e-06, + "loss": 1.8995, + "step": 459 + }, + { + "epoch": 1.8623481781376519, + "grad_norm": 1.2394575763975424, + "learning_rate": 9.777268561582921e-06, + "loss": 1.9406, + "step": 460 + }, + { + "epoch": 1.8663967611336032, + "grad_norm": 1.2541685708518606, + "learning_rate": 9.77517829291108e-06, + "loss": 1.9325, + "step": 461 + }, + { + "epoch": 1.8704453441295548, + "grad_norm": 1.5330647366042962, + "learning_rate": 9.773078487270932e-06, + "loss": 2.4038, + "step": 462 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 1.5015880335176561, + "learning_rate": 9.770969148856202e-06, + "loss": 2.3187, + "step": 463 + }, + { + "epoch": 1.8785425101214575, + "grad_norm": 1.4834304636666527, + "learning_rate": 9.768850281879651e-06, + "loss": 2.1105, + "step": 464 + }, + { + "epoch": 1.882591093117409, + "grad_norm": 1.2140714457469706, + "learning_rate": 9.766721890573075e-06, + "loss": 1.9824, + "step": 465 + }, + { + "epoch": 1.8866396761133604, + "grad_norm": 1.3661085878272685, + "learning_rate": 9.764583979187288e-06, + "loss": 1.5205, + "step": 466 + }, + { + "epoch": 1.8906882591093117, + "grad_norm": 1.2317311840953222, + "learning_rate": 9.762436551992117e-06, + "loss": 1.9872, + "step": 467 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 1.3883104250103875, + "learning_rate": 9.760279613276397e-06, + "loss": 2.0814, + "step": 468 + }, + { + "epoch": 1.8987854251012146, + "grad_norm": 1.1681713845582538, + "learning_rate": 9.75811316734796e-06, + "loss": 1.7849, + "step": 469 + }, + { + "epoch": 1.902834008097166, + "grad_norm": 1.15545443174025, + "learning_rate": 9.755937218533622e-06, + "loss": 1.8179, + "step": 470 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 1.5408624758508003, + "learning_rate": 9.753751771179177e-06, + "loss": 2.0286, + "step": 471 + }, + { + "epoch": 1.9109311740890689, + "grad_norm": 1.3817398480348058, + "learning_rate": 9.751556829649398e-06, + "loss": 1.5547, + "step": 472 + }, + { + "epoch": 1.9149797570850202, + "grad_norm": 1.3351696061966247, + "learning_rate": 9.74935239832801e-06, + "loss": 1.733, + "step": 473 + }, + { + "epoch": 1.9190283400809718, + "grad_norm": 1.264760117783077, + "learning_rate": 9.747138481617695e-06, + "loss": 1.767, + "step": 474 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.2863761477462097, + "learning_rate": 9.74491508394008e-06, + "loss": 2.0018, + "step": 475 + }, + { + "epoch": 1.9271255060728745, + "grad_norm": 1.5310497493928237, + "learning_rate": 9.742682209735727e-06, + "loss": 1.8865, + "step": 476 + }, + { + "epoch": 1.931174089068826, + "grad_norm": 1.711973469366144, + "learning_rate": 9.740439863464127e-06, + "loss": 1.9105, + "step": 477 + }, + { + "epoch": 1.9352226720647774, + "grad_norm": 1.249933707627717, + "learning_rate": 9.738188049603679e-06, + "loss": 1.7676, + "step": 478 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 1.2902981801333298, + "learning_rate": 9.735926772651703e-06, + "loss": 1.6493, + "step": 479 + }, + { + "epoch": 1.9433198380566803, + "grad_norm": 1.4792877192638219, + "learning_rate": 9.73365603712441e-06, + "loss": 1.9464, + "step": 480 + }, + { + "epoch": 1.9473684210526314, + "grad_norm": 1.3282266924987296, + "learning_rate": 9.731375847556905e-06, + "loss": 1.6826, + "step": 481 + }, + { + "epoch": 1.951417004048583, + "grad_norm": 1.4677668638223476, + "learning_rate": 9.729086208503174e-06, + "loss": 1.7014, + "step": 482 + }, + { + "epoch": 1.9554655870445345, + "grad_norm": 2.3808599607342855, + "learning_rate": 9.726787124536077e-06, + "loss": 1.9583, + "step": 483 + }, + { + "epoch": 1.9595141700404857, + "grad_norm": 1.3600754750050374, + "learning_rate": 9.724478600247333e-06, + "loss": 1.7925, + "step": 484 + }, + { + "epoch": 1.9635627530364372, + "grad_norm": 1.1666914976637783, + "learning_rate": 9.722160640247523e-06, + "loss": 1.8932, + "step": 485 + }, + { + "epoch": 1.9676113360323888, + "grad_norm": 1.3451750453053897, + "learning_rate": 9.719833249166061e-06, + "loss": 2.1332, + "step": 486 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 1.9010105722641066, + "learning_rate": 9.717496431651212e-06, + "loss": 2.0526, + "step": 487 + }, + { + "epoch": 1.9757085020242915, + "grad_norm": 1.1672390815512188, + "learning_rate": 9.715150192370054e-06, + "loss": 1.8783, + "step": 488 + }, + { + "epoch": 1.979757085020243, + "grad_norm": 1.384114220461852, + "learning_rate": 9.712794536008488e-06, + "loss": 1.9859, + "step": 489 + }, + { + "epoch": 1.9838056680161942, + "grad_norm": 1.2933526518975824, + "learning_rate": 9.710429467271221e-06, + "loss": 2.0382, + "step": 490 + }, + { + "epoch": 1.9878542510121457, + "grad_norm": 1.423570288241044, + "learning_rate": 9.708054990881763e-06, + "loss": 1.8377, + "step": 491 + }, + { + "epoch": 1.9919028340080973, + "grad_norm": 1.2866158830707874, + "learning_rate": 9.705671111582406e-06, + "loss": 1.7694, + "step": 492 + }, + { + "epoch": 1.9959514170040484, + "grad_norm": 1.0521519412024614, + "learning_rate": 9.703277834134227e-06, + "loss": 2.0757, + "step": 493 + }, + { + "epoch": 2.0, + "grad_norm": 1.2995506674782646, + "learning_rate": 9.700875163317072e-06, + "loss": 1.8875, + "step": 494 + }, + { + "epoch": 2.0040485829959516, + "grad_norm": 1.1352855274001465, + "learning_rate": 9.698463103929542e-06, + "loss": 1.9618, + "step": 495 + }, + { + "epoch": 2.0080971659919027, + "grad_norm": 1.542269208448278, + "learning_rate": 9.696041660788997e-06, + "loss": 2.0888, + "step": 496 + }, + { + "epoch": 2.0121457489878543, + "grad_norm": 1.6780350902786914, + "learning_rate": 9.693610838731532e-06, + "loss": 1.9408, + "step": 497 + }, + { + "epoch": 2.016194331983806, + "grad_norm": 1.6035230575875041, + "learning_rate": 9.691170642611975e-06, + "loss": 2.0771, + "step": 498 + }, + { + "epoch": 2.020242914979757, + "grad_norm": 1.4671035377471024, + "learning_rate": 9.68872107730388e-06, + "loss": 2.3311, + "step": 499 + }, + { + "epoch": 2.0242914979757085, + "grad_norm": 1.5075955512152057, + "learning_rate": 9.686262147699507e-06, + "loss": 2.2077, + "step": 500 + }, + { + "epoch": 2.02834008097166, + "grad_norm": 1.5639916261560791, + "learning_rate": 9.683793858709821e-06, + "loss": 1.8546, + "step": 501 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 1.5331421353363675, + "learning_rate": 9.681316215264481e-06, + "loss": 1.9004, + "step": 502 + }, + { + "epoch": 2.0364372469635628, + "grad_norm": 1.4656462364511347, + "learning_rate": 9.678829222311827e-06, + "loss": 2.1369, + "step": 503 + }, + { + "epoch": 2.0404858299595143, + "grad_norm": 1.7055289856989309, + "learning_rate": 9.67633288481887e-06, + "loss": 1.9294, + "step": 504 + }, + { + "epoch": 2.0445344129554655, + "grad_norm": 1.3320529357395552, + "learning_rate": 9.67382720777129e-06, + "loss": 1.9228, + "step": 505 + }, + { + "epoch": 2.048582995951417, + "grad_norm": 1.378485994628673, + "learning_rate": 9.671312196173413e-06, + "loss": 1.9005, + "step": 506 + }, + { + "epoch": 2.0526315789473686, + "grad_norm": 1.4519006220083899, + "learning_rate": 9.668787855048209e-06, + "loss": 2.0772, + "step": 507 + }, + { + "epoch": 2.0566801619433197, + "grad_norm": 1.46960243337033, + "learning_rate": 9.666254189437286e-06, + "loss": 1.9259, + "step": 508 + }, + { + "epoch": 2.0607287449392713, + "grad_norm": 1.3018755932293484, + "learning_rate": 9.663711204400872e-06, + "loss": 2.0637, + "step": 509 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 1.4438151108336905, + "learning_rate": 9.661158905017804e-06, + "loss": 1.9998, + "step": 510 + }, + { + "epoch": 2.068825910931174, + "grad_norm": 1.5146888645164116, + "learning_rate": 9.658597296385527e-06, + "loss": 2.1032, + "step": 511 + }, + { + "epoch": 2.0728744939271255, + "grad_norm": 1.4173605487062464, + "learning_rate": 9.656026383620076e-06, + "loss": 1.9957, + "step": 512 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 1.3186505882274318, + "learning_rate": 9.653446171856069e-06, + "loss": 1.9291, + "step": 513 + }, + { + "epoch": 2.080971659919028, + "grad_norm": 1.2929004725593367, + "learning_rate": 9.650856666246693e-06, + "loss": 1.9435, + "step": 514 + }, + { + "epoch": 2.08502024291498, + "grad_norm": 1.2511951269635655, + "learning_rate": 9.6482578719637e-06, + "loss": 1.9267, + "step": 515 + }, + { + "epoch": 2.0890688259109313, + "grad_norm": 1.9429673192553882, + "learning_rate": 9.645649794197394e-06, + "loss": 1.9435, + "step": 516 + }, + { + "epoch": 2.0931174089068825, + "grad_norm": 1.315419932054697, + "learning_rate": 9.643032438156616e-06, + "loss": 2.0396, + "step": 517 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 1.3284199817957691, + "learning_rate": 9.640405809068743e-06, + "loss": 1.765, + "step": 518 + }, + { + "epoch": 2.1012145748987856, + "grad_norm": 1.4032585852247357, + "learning_rate": 9.637769912179664e-06, + "loss": 1.9292, + "step": 519 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 1.4202061741742247, + "learning_rate": 9.635124752753787e-06, + "loss": 1.9832, + "step": 520 + }, + { + "epoch": 2.1093117408906883, + "grad_norm": 1.4962037346644237, + "learning_rate": 9.632470336074009e-06, + "loss": 1.8461, + "step": 521 + }, + { + "epoch": 2.11336032388664, + "grad_norm": 1.829451958189404, + "learning_rate": 9.629806667441727e-06, + "loss": 1.7856, + "step": 522 + }, + { + "epoch": 2.117408906882591, + "grad_norm": 1.6374878381545, + "learning_rate": 9.627133752176809e-06, + "loss": 1.7441, + "step": 523 + }, + { + "epoch": 2.1214574898785425, + "grad_norm": 1.4010819404830996, + "learning_rate": 9.624451595617588e-06, + "loss": 1.7615, + "step": 524 + }, + { + "epoch": 2.125506072874494, + "grad_norm": 1.441999234959946, + "learning_rate": 9.62176020312086e-06, + "loss": 1.7378, + "step": 525 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 1.5770630911097265, + "learning_rate": 9.619059580061862e-06, + "loss": 1.7039, + "step": 526 + }, + { + "epoch": 2.133603238866397, + "grad_norm": 1.4591597594445938, + "learning_rate": 9.616349731834271e-06, + "loss": 2.0009, + "step": 527 + }, + { + "epoch": 2.1376518218623484, + "grad_norm": 1.6179185626843804, + "learning_rate": 9.613630663850184e-06, + "loss": 1.872, + "step": 528 + }, + { + "epoch": 2.1417004048582995, + "grad_norm": 1.3086175576058332, + "learning_rate": 9.610902381540115e-06, + "loss": 1.5977, + "step": 529 + }, + { + "epoch": 2.145748987854251, + "grad_norm": 1.444761778117532, + "learning_rate": 9.608164890352977e-06, + "loss": 2.0221, + "step": 530 + }, + { + "epoch": 2.1497975708502026, + "grad_norm": 1.4113693951603745, + "learning_rate": 9.605418195756077e-06, + "loss": 1.8497, + "step": 531 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 1.2987083078720463, + "learning_rate": 9.602662303235106e-06, + "loss": 1.9881, + "step": 532 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 1.5356679778352307, + "learning_rate": 9.599897218294122e-06, + "loss": 2.2169, + "step": 533 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 1.2586253730389827, + "learning_rate": 9.597122946455539e-06, + "loss": 1.8884, + "step": 534 + }, + { + "epoch": 2.165991902834008, + "grad_norm": 1.3241548388576752, + "learning_rate": 9.594339493260127e-06, + "loss": 1.9169, + "step": 535 + }, + { + "epoch": 2.1700404858299596, + "grad_norm": 3.3161848122832627, + "learning_rate": 9.591546864266983e-06, + "loss": 2.3116, + "step": 536 + }, + { + "epoch": 2.174089068825911, + "grad_norm": 1.2785252284615238, + "learning_rate": 9.58874506505354e-06, + "loss": 1.7854, + "step": 537 + }, + { + "epoch": 2.1781376518218623, + "grad_norm": 1.4062987764786141, + "learning_rate": 9.58593410121554e-06, + "loss": 1.9564, + "step": 538 + }, + { + "epoch": 2.182186234817814, + "grad_norm": 1.1858759757574733, + "learning_rate": 9.583113978367026e-06, + "loss": 1.7449, + "step": 539 + }, + { + "epoch": 2.1862348178137654, + "grad_norm": 1.4958289357631562, + "learning_rate": 9.580284702140342e-06, + "loss": 1.8748, + "step": 540 + }, + { + "epoch": 2.1902834008097165, + "grad_norm": 1.271888181605562, + "learning_rate": 9.577446278186103e-06, + "loss": 1.944, + "step": 541 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 1.6297569109832326, + "learning_rate": 9.574598712173202e-06, + "loss": 2.1136, + "step": 542 + }, + { + "epoch": 2.1983805668016196, + "grad_norm": 1.7294919253670684, + "learning_rate": 9.571742009788787e-06, + "loss": 2.1866, + "step": 543 + }, + { + "epoch": 2.2024291497975708, + "grad_norm": 1.5317790321439353, + "learning_rate": 9.568876176738251e-06, + "loss": 1.8859, + "step": 544 + }, + { + "epoch": 2.2064777327935223, + "grad_norm": 1.711554028884214, + "learning_rate": 9.56600121874523e-06, + "loss": 2.1936, + "step": 545 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 1.4435460877228636, + "learning_rate": 9.563117141551574e-06, + "loss": 2.2517, + "step": 546 + }, + { + "epoch": 2.214574898785425, + "grad_norm": 1.4961050962412457, + "learning_rate": 9.560223950917354e-06, + "loss": 2.041, + "step": 547 + }, + { + "epoch": 2.2186234817813766, + "grad_norm": 1.3247670963766616, + "learning_rate": 9.557321652620839e-06, + "loss": 1.8986, + "step": 548 + }, + { + "epoch": 2.2226720647773277, + "grad_norm": 1.4724998096864195, + "learning_rate": 9.554410252458489e-06, + "loss": 1.8568, + "step": 549 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 3.7991275518186196, + "learning_rate": 9.551489756244939e-06, + "loss": 1.9347, + "step": 550 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 1.4010848185779328, + "learning_rate": 9.548560169812997e-06, + "loss": 1.8809, + "step": 551 + }, + { + "epoch": 2.234817813765182, + "grad_norm": 1.6221348693259423, + "learning_rate": 9.54562149901362e-06, + "loss": 2.0865, + "step": 552 + }, + { + "epoch": 2.2388663967611335, + "grad_norm": 1.4196865192753882, + "learning_rate": 9.54267374971591e-06, + "loss": 2.0449, + "step": 553 + }, + { + "epoch": 2.242914979757085, + "grad_norm": 1.4599787722592332, + "learning_rate": 9.539716927807102e-06, + "loss": 2.0083, + "step": 554 + }, + { + "epoch": 2.246963562753036, + "grad_norm": 1.251605201082177, + "learning_rate": 9.536751039192549e-06, + "loss": 1.8576, + "step": 555 + }, + { + "epoch": 2.251012145748988, + "grad_norm": 1.30407928376828, + "learning_rate": 9.533776089795712e-06, + "loss": 1.8923, + "step": 556 + }, + { + "epoch": 2.2550607287449393, + "grad_norm": 1.4348421622864604, + "learning_rate": 9.530792085558151e-06, + "loss": 1.9873, + "step": 557 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 1.4429474918555736, + "learning_rate": 9.527799032439506e-06, + "loss": 1.8211, + "step": 558 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 1.338584745094179, + "learning_rate": 9.524796936417495e-06, + "loss": 1.7082, + "step": 559 + }, + { + "epoch": 2.2672064777327936, + "grad_norm": 1.329824996124572, + "learning_rate": 9.521785803487888e-06, + "loss": 1.9216, + "step": 560 + }, + { + "epoch": 2.2712550607287447, + "grad_norm": 1.3374675078915148, + "learning_rate": 9.518765639664512e-06, + "loss": 1.9723, + "step": 561 + }, + { + "epoch": 2.2753036437246963, + "grad_norm": 1.4689345418902104, + "learning_rate": 9.515736450979224e-06, + "loss": 1.953, + "step": 562 + }, + { + "epoch": 2.279352226720648, + "grad_norm": 1.6439512327159642, + "learning_rate": 9.512698243481914e-06, + "loss": 1.991, + "step": 563 + }, + { + "epoch": 2.283400809716599, + "grad_norm": 1.5280266119657933, + "learning_rate": 9.509651023240472e-06, + "loss": 1.7088, + "step": 564 + }, + { + "epoch": 2.2874493927125505, + "grad_norm": 1.5234607385845351, + "learning_rate": 9.5065947963408e-06, + "loss": 1.7975, + "step": 565 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 1.4898313464385229, + "learning_rate": 9.50352956888678e-06, + "loss": 1.6643, + "step": 566 + }, + { + "epoch": 2.2955465587044532, + "grad_norm": 1.5049004900957001, + "learning_rate": 9.500455347000273e-06, + "loss": 2.0078, + "step": 567 + }, + { + "epoch": 2.299595141700405, + "grad_norm": 1.5268023276941818, + "learning_rate": 9.497372136821103e-06, + "loss": 2.0653, + "step": 568 + }, + { + "epoch": 2.3036437246963564, + "grad_norm": 1.5293343920918272, + "learning_rate": 9.49427994450705e-06, + "loss": 1.8078, + "step": 569 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 1.504441993367853, + "learning_rate": 9.491178776233825e-06, + "loss": 1.8219, + "step": 570 + }, + { + "epoch": 2.311740890688259, + "grad_norm": 1.3604060927952581, + "learning_rate": 9.488068638195072e-06, + "loss": 1.8582, + "step": 571 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 1.7336288728624165, + "learning_rate": 9.484949536602343e-06, + "loss": 1.8562, + "step": 572 + }, + { + "epoch": 2.3198380566801617, + "grad_norm": 1.536212130823414, + "learning_rate": 9.481821477685102e-06, + "loss": 1.7431, + "step": 573 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 1.4120913757834546, + "learning_rate": 9.478684467690693e-06, + "loss": 1.7586, + "step": 574 + }, + { + "epoch": 2.327935222672065, + "grad_norm": 1.453958520209467, + "learning_rate": 9.47553851288434e-06, + "loss": 1.7694, + "step": 575 + }, + { + "epoch": 2.331983805668016, + "grad_norm": 1.3935000424019952, + "learning_rate": 9.472383619549133e-06, + "loss": 1.6545, + "step": 576 + }, + { + "epoch": 2.3360323886639676, + "grad_norm": 1.3589610652505588, + "learning_rate": 9.469219793986016e-06, + "loss": 1.6896, + "step": 577 + }, + { + "epoch": 2.340080971659919, + "grad_norm": 1.7566987829139051, + "learning_rate": 9.466047042513767e-06, + "loss": 1.6272, + "step": 578 + }, + { + "epoch": 2.3441295546558703, + "grad_norm": 1.3287178155779462, + "learning_rate": 9.462865371468994e-06, + "loss": 1.7176, + "step": 579 + }, + { + "epoch": 2.348178137651822, + "grad_norm": 1.8490808825118674, + "learning_rate": 9.459674787206117e-06, + "loss": 1.9005, + "step": 580 + }, + { + "epoch": 2.3522267206477734, + "grad_norm": 1.8200114285326863, + "learning_rate": 9.45647529609736e-06, + "loss": 1.7493, + "step": 581 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 1.7944997812037724, + "learning_rate": 9.453266904532737e-06, + "loss": 1.4856, + "step": 582 + }, + { + "epoch": 2.360323886639676, + "grad_norm": 1.6449884777915886, + "learning_rate": 9.450049618920034e-06, + "loss": 1.8312, + "step": 583 + }, + { + "epoch": 2.3643724696356276, + "grad_norm": 1.6009358010430617, + "learning_rate": 9.4468234456848e-06, + "loss": 1.9048, + "step": 584 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 1.519230320705593, + "learning_rate": 9.44358839127034e-06, + "loss": 1.8077, + "step": 585 + }, + { + "epoch": 2.3724696356275303, + "grad_norm": 1.8694258750708748, + "learning_rate": 9.44034446213769e-06, + "loss": 1.9556, + "step": 586 + }, + { + "epoch": 2.376518218623482, + "grad_norm": 1.4302907644008036, + "learning_rate": 9.437091664765611e-06, + "loss": 1.5064, + "step": 587 + }, + { + "epoch": 2.380566801619433, + "grad_norm": 1.5423881317930213, + "learning_rate": 9.433830005650582e-06, + "loss": 1.69, + "step": 588 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 1.4747017336722326, + "learning_rate": 9.430559491306777e-06, + "loss": 1.5552, + "step": 589 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 1.600482934018078, + "learning_rate": 9.427280128266049e-06, + "loss": 1.6106, + "step": 590 + }, + { + "epoch": 2.3927125506072873, + "grad_norm": 1.5014148151060753, + "learning_rate": 9.423991923077938e-06, + "loss": 1.7636, + "step": 591 + }, + { + "epoch": 2.396761133603239, + "grad_norm": 1.7672182274084831, + "learning_rate": 9.420694882309628e-06, + "loss": 1.5786, + "step": 592 + }, + { + "epoch": 2.4008097165991904, + "grad_norm": 1.440572594457583, + "learning_rate": 9.41738901254596e-06, + "loss": 1.6426, + "step": 593 + }, + { + "epoch": 2.4048582995951415, + "grad_norm": 1.5625132261883155, + "learning_rate": 9.414074320389403e-06, + "loss": 1.7306, + "step": 594 + }, + { + "epoch": 2.408906882591093, + "grad_norm": 1.683823244071828, + "learning_rate": 9.41075081246005e-06, + "loss": 1.821, + "step": 595 + }, + { + "epoch": 2.4129554655870447, + "grad_norm": 1.4314599370281114, + "learning_rate": 9.4074184953956e-06, + "loss": 1.6872, + "step": 596 + }, + { + "epoch": 2.417004048582996, + "grad_norm": 1.5657957134872598, + "learning_rate": 9.404077375851338e-06, + "loss": 1.9362, + "step": 597 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 1.6198467768431548, + "learning_rate": 9.400727460500141e-06, + "loss": 2.0139, + "step": 598 + }, + { + "epoch": 2.425101214574899, + "grad_norm": 1.4103077055466628, + "learning_rate": 9.397368756032445e-06, + "loss": 1.8485, + "step": 599 + }, + { + "epoch": 2.42914979757085, + "grad_norm": 1.3471173889103276, + "learning_rate": 9.394001269156245e-06, + "loss": 1.9812, + "step": 600 + }, + { + "epoch": 2.4331983805668016, + "grad_norm": 1.4234064588511484, + "learning_rate": 9.39062500659707e-06, + "loss": 2.0496, + "step": 601 + }, + { + "epoch": 2.437246963562753, + "grad_norm": 1.4784926767119206, + "learning_rate": 9.38723997509798e-06, + "loss": 1.837, + "step": 602 + }, + { + "epoch": 2.4412955465587043, + "grad_norm": 1.5518065193263646, + "learning_rate": 9.383846181419547e-06, + "loss": 1.765, + "step": 603 + }, + { + "epoch": 2.445344129554656, + "grad_norm": 1.3196666479973478, + "learning_rate": 9.380443632339845e-06, + "loss": 2.0255, + "step": 604 + }, + { + "epoch": 2.4493927125506074, + "grad_norm": 1.440061740597458, + "learning_rate": 9.37703233465443e-06, + "loss": 1.7942, + "step": 605 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 1.5327759577164166, + "learning_rate": 9.373612295176333e-06, + "loss": 1.777, + "step": 606 + }, + { + "epoch": 2.45748987854251, + "grad_norm": 1.6814358499503075, + "learning_rate": 9.370183520736045e-06, + "loss": 2.185, + "step": 607 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 2.30393335895373, + "learning_rate": 9.366746018181503e-06, + "loss": 2.4563, + "step": 608 + }, + { + "epoch": 2.465587044534413, + "grad_norm": 1.8584859443814368, + "learning_rate": 9.363299794378072e-06, + "loss": 2.0155, + "step": 609 + }, + { + "epoch": 2.4696356275303644, + "grad_norm": 1.2803493212403667, + "learning_rate": 9.359844856208538e-06, + "loss": 1.9623, + "step": 610 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 1.323092986933548, + "learning_rate": 9.356381210573092e-06, + "loss": 1.8725, + "step": 611 + }, + { + "epoch": 2.477732793522267, + "grad_norm": 1.716120944564361, + "learning_rate": 9.352908864389313e-06, + "loss": 1.9058, + "step": 612 + }, + { + "epoch": 2.4817813765182186, + "grad_norm": 1.1767574227433577, + "learning_rate": 9.349427824592157e-06, + "loss": 1.818, + "step": 613 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 1.8646580879242294, + "learning_rate": 9.345938098133946e-06, + "loss": 1.8001, + "step": 614 + }, + { + "epoch": 2.4898785425101213, + "grad_norm": 1.7755724904128214, + "learning_rate": 9.342439691984346e-06, + "loss": 1.7282, + "step": 615 + }, + { + "epoch": 2.493927125506073, + "grad_norm": 1.7352293901651843, + "learning_rate": 9.338932613130363e-06, + "loss": 1.7961, + "step": 616 + }, + { + "epoch": 2.4979757085020244, + "grad_norm": 1.6153408388514847, + "learning_rate": 9.33541686857632e-06, + "loss": 1.662, + "step": 617 + }, + { + "epoch": 2.5020242914979756, + "grad_norm": 1.5099283023047843, + "learning_rate": 9.331892465343851e-06, + "loss": 1.588, + "step": 618 + }, + { + "epoch": 2.506072874493927, + "grad_norm": 1.730183741035281, + "learning_rate": 9.328359410471878e-06, + "loss": 1.8722, + "step": 619 + }, + { + "epoch": 2.5101214574898787, + "grad_norm": 1.7321761047223487, + "learning_rate": 9.324817711016609e-06, + "loss": 1.9167, + "step": 620 + }, + { + "epoch": 2.51417004048583, + "grad_norm": 1.2095836589724516, + "learning_rate": 9.32126737405151e-06, + "loss": 1.8743, + "step": 621 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 1.5485434750214813, + "learning_rate": 9.3177084066673e-06, + "loss": 1.89, + "step": 622 + }, + { + "epoch": 2.522267206477733, + "grad_norm": 1.5145693598054688, + "learning_rate": 9.31414081597194e-06, + "loss": 1.8321, + "step": 623 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 1.8660173525702701, + "learning_rate": 9.310564609090605e-06, + "loss": 1.6178, + "step": 624 + }, + { + "epoch": 2.5303643724696356, + "grad_norm": 1.9092894315915314, + "learning_rate": 9.306979793165682e-06, + "loss": 1.718, + "step": 625 + }, + { + "epoch": 2.534412955465587, + "grad_norm": 2.1574694273419817, + "learning_rate": 9.303386375356752e-06, + "loss": 1.8536, + "step": 626 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 1.5187220263251169, + "learning_rate": 9.299784362840578e-06, + "loss": 2.0088, + "step": 627 + }, + { + "epoch": 2.54251012145749, + "grad_norm": 1.3524410374053388, + "learning_rate": 9.296173762811084e-06, + "loss": 1.8993, + "step": 628 + }, + { + "epoch": 2.5465587044534415, + "grad_norm": 3.8294272400161993, + "learning_rate": 9.292554582479349e-06, + "loss": 2.3583, + "step": 629 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 6.070012543144345, + "learning_rate": 9.288926829073583e-06, + "loss": 2.4906, + "step": 630 + }, + { + "epoch": 2.554655870445344, + "grad_norm": 5.603752988478888, + "learning_rate": 9.285290509839126e-06, + "loss": 2.7822, + "step": 631 + }, + { + "epoch": 2.5587044534412957, + "grad_norm": 1.4481838054717586, + "learning_rate": 9.281645632038417e-06, + "loss": 1.8168, + "step": 632 + }, + { + "epoch": 2.562753036437247, + "grad_norm": 1.414449313894791, + "learning_rate": 9.277992202950996e-06, + "loss": 1.7136, + "step": 633 + }, + { + "epoch": 2.5668016194331984, + "grad_norm": 1.4634757861687506, + "learning_rate": 9.274330229873474e-06, + "loss": 2.0032, + "step": 634 + }, + { + "epoch": 2.57085020242915, + "grad_norm": 1.484422105707642, + "learning_rate": 9.270659720119533e-06, + "loss": 1.6359, + "step": 635 + }, + { + "epoch": 2.574898785425101, + "grad_norm": 1.4574650651898802, + "learning_rate": 9.266980681019902e-06, + "loss": 1.9962, + "step": 636 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 1.2408661225828688, + "learning_rate": 9.263293119922341e-06, + "loss": 1.7137, + "step": 637 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 1.4397062187160998, + "learning_rate": 9.259597044191635e-06, + "loss": 1.9567, + "step": 638 + }, + { + "epoch": 2.5870445344129553, + "grad_norm": 1.3678454147168124, + "learning_rate": 9.255892461209574e-06, + "loss": 1.8607, + "step": 639 + }, + { + "epoch": 2.591093117408907, + "grad_norm": 1.51295578810032, + "learning_rate": 9.252179378374937e-06, + "loss": 1.8423, + "step": 640 + }, + { + "epoch": 2.5951417004048585, + "grad_norm": 1.493191888596024, + "learning_rate": 9.248457803103476e-06, + "loss": 1.5365, + "step": 641 + }, + { + "epoch": 2.5991902834008096, + "grad_norm": 1.4402174802959915, + "learning_rate": 9.24472774282791e-06, + "loss": 1.5837, + "step": 642 + }, + { + "epoch": 2.603238866396761, + "grad_norm": 1.3814570168249611, + "learning_rate": 9.240989204997903e-06, + "loss": 1.7433, + "step": 643 + }, + { + "epoch": 2.6072874493927127, + "grad_norm": 1.4229224856881553, + "learning_rate": 9.237242197080045e-06, + "loss": 1.6373, + "step": 644 + }, + { + "epoch": 2.611336032388664, + "grad_norm": 1.529255344732051, + "learning_rate": 9.23348672655785e-06, + "loss": 1.9638, + "step": 645 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.2990811736528833, + "learning_rate": 9.229722800931727e-06, + "loss": 1.8372, + "step": 646 + }, + { + "epoch": 2.619433198380567, + "grad_norm": 1.7287958707975635, + "learning_rate": 9.225950427718974e-06, + "loss": 1.665, + "step": 647 + }, + { + "epoch": 2.623481781376518, + "grad_norm": 1.631936855970988, + "learning_rate": 9.222169614453765e-06, + "loss": 2.052, + "step": 648 + }, + { + "epoch": 2.6275303643724697, + "grad_norm": 1.384358037456477, + "learning_rate": 9.21838036868712e-06, + "loss": 1.8437, + "step": 649 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 1.57010881393224, + "learning_rate": 9.21458269798691e-06, + "loss": 2.0542, + "step": 650 + }, + { + "epoch": 2.6356275303643724, + "grad_norm": 1.4074541953077098, + "learning_rate": 9.21077660993783e-06, + "loss": 1.7342, + "step": 651 + }, + { + "epoch": 2.639676113360324, + "grad_norm": 1.6189308816605772, + "learning_rate": 9.206962112141382e-06, + "loss": 1.9321, + "step": 652 + }, + { + "epoch": 2.6437246963562755, + "grad_norm": 1.4090618348929758, + "learning_rate": 9.203139212215868e-06, + "loss": 1.871, + "step": 653 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 1.9494105407548425, + "learning_rate": 9.199307917796371e-06, + "loss": 1.8667, + "step": 654 + }, + { + "epoch": 2.651821862348178, + "grad_norm": 1.4331583331274316, + "learning_rate": 9.195468236534734e-06, + "loss": 1.7255, + "step": 655 + }, + { + "epoch": 2.6558704453441297, + "grad_norm": 1.5909315996217737, + "learning_rate": 9.191620176099559e-06, + "loss": 1.9444, + "step": 656 + }, + { + "epoch": 2.659919028340081, + "grad_norm": 1.7461445494408216, + "learning_rate": 9.187763744176175e-06, + "loss": 1.7728, + "step": 657 + }, + { + "epoch": 2.6639676113360324, + "grad_norm": 1.422126938114325, + "learning_rate": 9.183898948466633e-06, + "loss": 1.9077, + "step": 658 + }, + { + "epoch": 2.668016194331984, + "grad_norm": 1.4144043249974336, + "learning_rate": 9.180025796689692e-06, + "loss": 1.9331, + "step": 659 + }, + { + "epoch": 2.672064777327935, + "grad_norm": 2.7772861017132255, + "learning_rate": 9.176144296580794e-06, + "loss": 1.8667, + "step": 660 + }, + { + "epoch": 2.6761133603238867, + "grad_norm": 1.3064807850177453, + "learning_rate": 9.172254455892054e-06, + "loss": 1.8187, + "step": 661 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 1.7419083953095058, + "learning_rate": 9.168356282392253e-06, + "loss": 1.903, + "step": 662 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 1.4496863008780128, + "learning_rate": 9.164449783866802e-06, + "loss": 1.7048, + "step": 663 + }, + { + "epoch": 2.688259109311741, + "grad_norm": 1.491984655358695, + "learning_rate": 9.160534968117752e-06, + "loss": 1.8734, + "step": 664 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 1.5308194782439823, + "learning_rate": 9.156611842963753e-06, + "loss": 1.8788, + "step": 665 + }, + { + "epoch": 2.6963562753036436, + "grad_norm": 1.3476877228875297, + "learning_rate": 9.152680416240059e-06, + "loss": 1.7147, + "step": 666 + }, + { + "epoch": 2.700404858299595, + "grad_norm": 1.8151640153934792, + "learning_rate": 9.1487406957985e-06, + "loss": 2.2048, + "step": 667 + }, + { + "epoch": 2.7044534412955468, + "grad_norm": 1.7628995278188238, + "learning_rate": 9.144792689507471e-06, + "loss": 1.9635, + "step": 668 + }, + { + "epoch": 2.708502024291498, + "grad_norm": 1.602921120835359, + "learning_rate": 9.140836405251917e-06, + "loss": 1.5744, + "step": 669 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 1.490856129715411, + "learning_rate": 9.136871850933312e-06, + "loss": 1.7612, + "step": 670 + }, + { + "epoch": 2.716599190283401, + "grad_norm": 1.4382592619602368, + "learning_rate": 9.132899034469648e-06, + "loss": 1.8414, + "step": 671 + }, + { + "epoch": 2.720647773279352, + "grad_norm": 1.8014041637984994, + "learning_rate": 9.128917963795422e-06, + "loss": 1.7066, + "step": 672 + }, + { + "epoch": 2.7246963562753037, + "grad_norm": 1.7582254633750898, + "learning_rate": 9.124928646861613e-06, + "loss": 1.7925, + "step": 673 + }, + { + "epoch": 2.7287449392712553, + "grad_norm": 1.6343159265633571, + "learning_rate": 9.120931091635669e-06, + "loss": 1.9923, + "step": 674 + }, + { + "epoch": 2.7327935222672064, + "grad_norm": 1.3849537338720197, + "learning_rate": 9.116925306101494e-06, + "loss": 1.858, + "step": 675 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 1.5938145614524974, + "learning_rate": 9.112911298259426e-06, + "loss": 1.8935, + "step": 676 + }, + { + "epoch": 2.7408906882591095, + "grad_norm": 2.232137755564454, + "learning_rate": 9.108889076126226e-06, + "loss": 2.5611, + "step": 677 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 1.597451641610388, + "learning_rate": 9.104858647735065e-06, + "loss": 1.9346, + "step": 678 + }, + { + "epoch": 2.748987854251012, + "grad_norm": 1.734843462936045, + "learning_rate": 9.100820021135495e-06, + "loss": 1.7738, + "step": 679 + }, + { + "epoch": 2.753036437246964, + "grad_norm": 1.5432674907856907, + "learning_rate": 9.09677320439345e-06, + "loss": 1.6451, + "step": 680 + }, + { + "epoch": 2.757085020242915, + "grad_norm": 1.4375865005427824, + "learning_rate": 9.092718205591213e-06, + "loss": 1.8788, + "step": 681 + }, + { + "epoch": 2.7611336032388665, + "grad_norm": 3.7437865438416433, + "learning_rate": 9.088655032827418e-06, + "loss": 2.6938, + "step": 682 + }, + { + "epoch": 2.765182186234818, + "grad_norm": 6.350052687447943, + "learning_rate": 9.084583694217012e-06, + "loss": 2.5299, + "step": 683 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 4.945671727596882, + "learning_rate": 9.080504197891262e-06, + "loss": 2.4088, + "step": 684 + }, + { + "epoch": 2.7732793522267207, + "grad_norm": 1.6795835965091561, + "learning_rate": 9.076416551997721e-06, + "loss": 1.824, + "step": 685 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 1.5949270953831338, + "learning_rate": 9.072320764700223e-06, + "loss": 2.0511, + "step": 686 + }, + { + "epoch": 2.7813765182186234, + "grad_norm": 1.4556536124547252, + "learning_rate": 9.068216844178857e-06, + "loss": 2.0932, + "step": 687 + }, + { + "epoch": 2.785425101214575, + "grad_norm": 1.6439876597132232, + "learning_rate": 9.064104798629955e-06, + "loss": 1.8796, + "step": 688 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 1.4368651555210203, + "learning_rate": 9.059984636266082e-06, + "loss": 1.7757, + "step": 689 + }, + { + "epoch": 2.7935222672064777, + "grad_norm": 1.6510465877279545, + "learning_rate": 9.055856365316012e-06, + "loss": 1.9039, + "step": 690 + }, + { + "epoch": 2.7975708502024292, + "grad_norm": 1.5313446048549542, + "learning_rate": 9.051719994024711e-06, + "loss": 1.9171, + "step": 691 + }, + { + "epoch": 2.801619433198381, + "grad_norm": 1.5880262025571767, + "learning_rate": 9.047575530653324e-06, + "loss": 1.6852, + "step": 692 + }, + { + "epoch": 2.805668016194332, + "grad_norm": 1.4675446257129918, + "learning_rate": 9.043422983479158e-06, + "loss": 1.5727, + "step": 693 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 1.6282110219820332, + "learning_rate": 9.039262360795664e-06, + "loss": 1.9079, + "step": 694 + }, + { + "epoch": 2.813765182186235, + "grad_norm": 1.9452631088170542, + "learning_rate": 9.035093670912424e-06, + "loss": 1.9093, + "step": 695 + }, + { + "epoch": 2.817813765182186, + "grad_norm": 1.6299011643761043, + "learning_rate": 9.03091692215513e-06, + "loss": 1.6569, + "step": 696 + }, + { + "epoch": 2.8218623481781377, + "grad_norm": 7.734091901664539, + "learning_rate": 9.026732122865567e-06, + "loss": 2.4758, + "step": 697 + }, + { + "epoch": 2.8259109311740893, + "grad_norm": 18.1486281089367, + "learning_rate": 9.022539281401601e-06, + "loss": 3.9379, + "step": 698 + }, + { + "epoch": 2.8299595141700404, + "grad_norm": 1.7406474445735873, + "learning_rate": 9.01833840613716e-06, + "loss": 1.7599, + "step": 699 + }, + { + "epoch": 2.834008097165992, + "grad_norm": 1.7079549569427872, + "learning_rate": 9.014129505462217e-06, + "loss": 1.6112, + "step": 700 + }, + { + "epoch": 2.8380566801619436, + "grad_norm": 1.5492178198371753, + "learning_rate": 9.009912587782772e-06, + "loss": 1.719, + "step": 701 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 1.5966963855692302, + "learning_rate": 9.005687661520838e-06, + "loss": 1.7237, + "step": 702 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 1.5738987901659376, + "learning_rate": 9.00145473511442e-06, + "loss": 1.6892, + "step": 703 + }, + { + "epoch": 2.850202429149798, + "grad_norm": 1.6008695127081995, + "learning_rate": 8.997213817017508e-06, + "loss": 1.7534, + "step": 704 + }, + { + "epoch": 2.854251012145749, + "grad_norm": 1.8027657159531043, + "learning_rate": 8.99296491570004e-06, + "loss": 1.8313, + "step": 705 + }, + { + "epoch": 2.8582995951417005, + "grad_norm": 1.388477920242152, + "learning_rate": 8.98870803964791e-06, + "loss": 1.7662, + "step": 706 + }, + { + "epoch": 2.862348178137652, + "grad_norm": 1.697508321391829, + "learning_rate": 8.984443197362938e-06, + "loss": 1.7739, + "step": 707 + }, + { + "epoch": 2.866396761133603, + "grad_norm": 1.7051210953826448, + "learning_rate": 8.980170397362846e-06, + "loss": 1.7885, + "step": 708 + }, + { + "epoch": 2.8704453441295548, + "grad_norm": 2.112476620801928, + "learning_rate": 8.975889648181258e-06, + "loss": 2.2786, + "step": 709 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 1.9686852205718806, + "learning_rate": 8.971600958367668e-06, + "loss": 2.2033, + "step": 710 + }, + { + "epoch": 2.8785425101214575, + "grad_norm": 1.8858645037099275, + "learning_rate": 8.96730433648743e-06, + "loss": 1.9747, + "step": 711 + }, + { + "epoch": 2.882591093117409, + "grad_norm": 1.629389176480098, + "learning_rate": 8.962999791121745e-06, + "loss": 1.8561, + "step": 712 + }, + { + "epoch": 2.8866396761133606, + "grad_norm": 1.7283481294339973, + "learning_rate": 8.958687330867634e-06, + "loss": 1.3887, + "step": 713 + }, + { + "epoch": 2.8906882591093117, + "grad_norm": 1.5884187879059617, + "learning_rate": 8.954366964337926e-06, + "loss": 1.8757, + "step": 714 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 1.5310621607610841, + "learning_rate": 8.950038700161239e-06, + "loss": 1.9746, + "step": 715 + }, + { + "epoch": 2.898785425101215, + "grad_norm": 1.4608377788624507, + "learning_rate": 8.94570254698197e-06, + "loss": 1.6592, + "step": 716 + }, + { + "epoch": 2.902834008097166, + "grad_norm": 1.5297317667519899, + "learning_rate": 8.941358513460264e-06, + "loss": 1.722, + "step": 717 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 1.847621037937598, + "learning_rate": 8.937006608272009e-06, + "loss": 1.9182, + "step": 718 + }, + { + "epoch": 2.910931174089069, + "grad_norm": 1.6585955176413567, + "learning_rate": 8.932646840108818e-06, + "loss": 1.4523, + "step": 719 + }, + { + "epoch": 2.91497975708502, + "grad_norm": 1.807939122311604, + "learning_rate": 8.928279217677999e-06, + "loss": 1.5928, + "step": 720 + }, + { + "epoch": 2.919028340080972, + "grad_norm": 1.6812175947881611, + "learning_rate": 8.923903749702556e-06, + "loss": 1.6197, + "step": 721 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 1.5868810975571848, + "learning_rate": 8.919520444921153e-06, + "loss": 1.9066, + "step": 722 + }, + { + "epoch": 2.9271255060728745, + "grad_norm": 2.008002647816905, + "learning_rate": 8.915129312088112e-06, + "loss": 1.7547, + "step": 723 + }, + { + "epoch": 2.931174089068826, + "grad_norm": 2.2074435698181185, + "learning_rate": 8.910730359973386e-06, + "loss": 1.7851, + "step": 724 + }, + { + "epoch": 2.9352226720647776, + "grad_norm": 1.6720121053555042, + "learning_rate": 8.906323597362547e-06, + "loss": 1.6173, + "step": 725 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 1.7840437064722243, + "learning_rate": 8.901909033056763e-06, + "loss": 1.5244, + "step": 726 + }, + { + "epoch": 2.9433198380566803, + "grad_norm": 2.087404813784654, + "learning_rate": 8.89748667587279e-06, + "loss": 1.8108, + "step": 727 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 1.7622420447448541, + "learning_rate": 8.893056534642938e-06, + "loss": 1.5553, + "step": 728 + }, + { + "epoch": 2.951417004048583, + "grad_norm": 1.9454050876073625, + "learning_rate": 8.88861861821507e-06, + "loss": 1.5518, + "step": 729 + }, + { + "epoch": 2.9554655870445345, + "grad_norm": 3.180217232768608, + "learning_rate": 8.88417293545258e-06, + "loss": 1.7772, + "step": 730 + }, + { + "epoch": 2.9595141700404857, + "grad_norm": 3.564301283270782, + "learning_rate": 8.879719495234363e-06, + "loss": 1.6766, + "step": 731 + }, + { + "epoch": 2.9635627530364372, + "grad_norm": 1.5385071245811799, + "learning_rate": 8.875258306454814e-06, + "loss": 1.7823, + "step": 732 + }, + { + "epoch": 2.967611336032389, + "grad_norm": 1.8013008659956586, + "learning_rate": 8.87078937802381e-06, + "loss": 2.0096, + "step": 733 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 2.38933092267862, + "learning_rate": 8.866312718866669e-06, + "loss": 1.9226, + "step": 734 + }, + { + "epoch": 2.9757085020242915, + "grad_norm": 1.5349029688081202, + "learning_rate": 8.861828337924164e-06, + "loss": 1.7634, + "step": 735 + }, + { + "epoch": 2.979757085020243, + "grad_norm": 1.7807993217999074, + "learning_rate": 8.85733624415248e-06, + "loss": 1.862, + "step": 736 + }, + { + "epoch": 2.983805668016194, + "grad_norm": 1.6270967039867585, + "learning_rate": 8.852836446523213e-06, + "loss": 1.9281, + "step": 737 + }, + { + "epoch": 2.9878542510121457, + "grad_norm": 1.8692589473995715, + "learning_rate": 8.848328954023342e-06, + "loss": 1.7317, + "step": 738 + }, + { + "epoch": 2.9919028340080973, + "grad_norm": 1.5874083562158485, + "learning_rate": 8.843813775655211e-06, + "loss": 1.6635, + "step": 739 + }, + { + "epoch": 2.9959514170040484, + "grad_norm": 1.3707872942838146, + "learning_rate": 8.83929092043652e-06, + "loss": 1.9759, + "step": 740 + }, + { + "epoch": 3.0, + "grad_norm": 1.7529361765269527, + "learning_rate": 8.8347603974003e-06, + "loss": 1.7407, + "step": 741 + }, + { + "epoch": 3.0040485829959516, + "grad_norm": 1.4847998012230224, + "learning_rate": 8.83022221559489e-06, + "loss": 1.8183, + "step": 742 + }, + { + "epoch": 3.0080971659919027, + "grad_norm": 2.0727143325799453, + "learning_rate": 8.825676384083936e-06, + "loss": 1.9566, + "step": 743 + }, + { + "epoch": 3.0121457489878543, + "grad_norm": 2.1863226369459072, + "learning_rate": 8.82112291194635e-06, + "loss": 1.8211, + "step": 744 + }, + { + "epoch": 3.016194331983806, + "grad_norm": 2.194214751548881, + "learning_rate": 8.816561808276312e-06, + "loss": 1.9756, + "step": 745 + }, + { + "epoch": 3.020242914979757, + "grad_norm": 1.8746800584359844, + "learning_rate": 8.811993082183243e-06, + "loss": 2.2277, + "step": 746 + }, + { + "epoch": 3.0242914979757085, + "grad_norm": 2.0032700627210636, + "learning_rate": 8.807416742791784e-06, + "loss": 2.0822, + "step": 747 + }, + { + "epoch": 3.02834008097166, + "grad_norm": 1.6874624326476195, + "learning_rate": 8.80283279924178e-06, + "loss": 1.7544, + "step": 748 + }, + { + "epoch": 3.032388663967611, + "grad_norm": 1.981414959416955, + "learning_rate": 8.798241260688273e-06, + "loss": 1.7612, + "step": 749 + }, + { + "epoch": 3.0364372469635628, + "grad_norm": 1.85228853236934, + "learning_rate": 8.793642136301462e-06, + "loss": 2.0061, + "step": 750 + }, + { + "epoch": 3.0404858299595143, + "grad_norm": 1.839202167316395, + "learning_rate": 8.7890354352667e-06, + "loss": 1.8078, + "step": 751 + }, + { + "epoch": 3.0445344129554655, + "grad_norm": 1.664692242856933, + "learning_rate": 8.784421166784476e-06, + "loss": 1.7918, + "step": 752 + }, + { + "epoch": 3.048582995951417, + "grad_norm": 1.8125016947634567, + "learning_rate": 8.779799340070388e-06, + "loss": 1.7574, + "step": 753 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 1.922401307664431, + "learning_rate": 8.775169964355134e-06, + "loss": 1.8982, + "step": 754 + }, + { + "epoch": 3.0566801619433197, + "grad_norm": 1.893673085388173, + "learning_rate": 8.770533048884483e-06, + "loss": 1.7375, + "step": 755 + }, + { + "epoch": 3.0607287449392713, + "grad_norm": 1.7578051605078406, + "learning_rate": 8.765888602919266e-06, + "loss": 1.9075, + "step": 756 + }, + { + "epoch": 3.064777327935223, + "grad_norm": 1.8959640677324443, + "learning_rate": 8.761236635735353e-06, + "loss": 1.8378, + "step": 757 + }, + { + "epoch": 3.068825910931174, + "grad_norm": 1.9801599495189568, + "learning_rate": 8.756577156623636e-06, + "loss": 1.9702, + "step": 758 + }, + { + "epoch": 3.0728744939271255, + "grad_norm": 1.790845579793568, + "learning_rate": 8.751910174890009e-06, + "loss": 1.8932, + "step": 759 + }, + { + "epoch": 3.076923076923077, + "grad_norm": 1.8236903737287826, + "learning_rate": 8.74723569985535e-06, + "loss": 1.8215, + "step": 760 + }, + { + "epoch": 3.080971659919028, + "grad_norm": 1.7121510890543619, + "learning_rate": 8.742553740855507e-06, + "loss": 1.8237, + "step": 761 + }, + { + "epoch": 3.08502024291498, + "grad_norm": 1.6455567766467654, + "learning_rate": 8.737864307241266e-06, + "loss": 1.825, + "step": 762 + }, + { + "epoch": 3.0890688259109313, + "grad_norm": 2.004800789953328, + "learning_rate": 8.733167408378348e-06, + "loss": 1.83, + "step": 763 + }, + { + "epoch": 3.0931174089068825, + "grad_norm": 1.761656112643498, + "learning_rate": 8.728463053647382e-06, + "loss": 1.9209, + "step": 764 + }, + { + "epoch": 3.097165991902834, + "grad_norm": 1.7248736206433866, + "learning_rate": 8.723751252443891e-06, + "loss": 1.6591, + "step": 765 + }, + { + "epoch": 3.1012145748987856, + "grad_norm": 1.8246435273625035, + "learning_rate": 8.71903201417826e-06, + "loss": 1.8214, + "step": 766 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 1.8468962560997435, + "learning_rate": 8.71430534827574e-06, + "loss": 1.854, + "step": 767 + }, + { + "epoch": 3.1093117408906883, + "grad_norm": 1.9312402322655278, + "learning_rate": 8.709571264176408e-06, + "loss": 1.7321, + "step": 768 + }, + { + "epoch": 3.11336032388664, + "grad_norm": 2.316632605973664, + "learning_rate": 8.70482977133516e-06, + "loss": 1.6709, + "step": 769 + }, + { + "epoch": 3.117408906882591, + "grad_norm": 1.9879535887114659, + "learning_rate": 8.700080879221689e-06, + "loss": 1.6082, + "step": 770 + }, + { + "epoch": 3.1214574898785425, + "grad_norm": 1.8223147298487212, + "learning_rate": 8.69532459732046e-06, + "loss": 1.6324, + "step": 771 + }, + { + "epoch": 3.125506072874494, + "grad_norm": 1.9254678274105181, + "learning_rate": 8.690560935130708e-06, + "loss": 1.626, + "step": 772 + }, + { + "epoch": 3.1295546558704452, + "grad_norm": 2.1237007524174683, + "learning_rate": 8.685789902166395e-06, + "loss": 1.5525, + "step": 773 + }, + { + "epoch": 3.133603238866397, + "grad_norm": 1.7727476948432017, + "learning_rate": 8.681011507956215e-06, + "loss": 1.8873, + "step": 774 + }, + { + "epoch": 3.1376518218623484, + "grad_norm": 2.049295618159139, + "learning_rate": 8.676225762043555e-06, + "loss": 1.7496, + "step": 775 + }, + { + "epoch": 3.1417004048582995, + "grad_norm": 1.5682714669220028, + "learning_rate": 8.671432673986493e-06, + "loss": 1.4753, + "step": 776 + }, + { + "epoch": 3.145748987854251, + "grad_norm": 1.8938048440408406, + "learning_rate": 8.666632253357767e-06, + "loss": 1.8963, + "step": 777 + }, + { + "epoch": 3.1497975708502026, + "grad_norm": 1.8936062118104038, + "learning_rate": 8.661824509744754e-06, + "loss": 1.7098, + "step": 778 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 1.6774875162585348, + "learning_rate": 8.657009452749466e-06, + "loss": 1.8596, + "step": 779 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 2.015957389549595, + "learning_rate": 8.652187091988516e-06, + "loss": 2.061, + "step": 780 + }, + { + "epoch": 3.161943319838057, + "grad_norm": 1.7186637319125118, + "learning_rate": 8.647357437093104e-06, + "loss": 1.7589, + "step": 781 + }, + { + "epoch": 3.165991902834008, + "grad_norm": 1.7941883707597104, + "learning_rate": 8.642520497709001e-06, + "loss": 1.8086, + "step": 782 + }, + { + "epoch": 3.1700404858299596, + "grad_norm": 1.774631391234699, + "learning_rate": 8.637676283496521e-06, + "loss": 2.2517, + "step": 783 + }, + { + "epoch": 3.174089068825911, + "grad_norm": 1.7904179919335834, + "learning_rate": 8.632824804130514e-06, + "loss": 1.6679, + "step": 784 + }, + { + "epoch": 3.1781376518218623, + "grad_norm": 1.972746622761643, + "learning_rate": 8.627966069300332e-06, + "loss": 1.8345, + "step": 785 + }, + { + "epoch": 3.182186234817814, + "grad_norm": 1.5336336477310177, + "learning_rate": 8.623100088709829e-06, + "loss": 1.6473, + "step": 786 + }, + { + "epoch": 3.1862348178137654, + "grad_norm": 1.9951657707171577, + "learning_rate": 8.618226872077315e-06, + "loss": 1.7821, + "step": 787 + }, + { + "epoch": 3.1902834008097165, + "grad_norm": 1.7282375741642677, + "learning_rate": 8.613346429135567e-06, + "loss": 1.8289, + "step": 788 + }, + { + "epoch": 3.194331983805668, + "grad_norm": 2.1277631117336675, + "learning_rate": 8.608458769631785e-06, + "loss": 2.0076, + "step": 789 + }, + { + "epoch": 3.1983805668016196, + "grad_norm": 1.8372643674137712, + "learning_rate": 8.603563903327582e-06, + "loss": 2.0805, + "step": 790 + }, + { + "epoch": 3.2024291497975708, + "grad_norm": 1.8065321863693007, + "learning_rate": 8.598661839998972e-06, + "loss": 1.7669, + "step": 791 + }, + { + "epoch": 3.2064777327935223, + "grad_norm": 2.031336948957746, + "learning_rate": 8.593752589436334e-06, + "loss": 2.0858, + "step": 792 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 1.8889862063112353, + "learning_rate": 8.588836161444405e-06, + "loss": 2.1341, + "step": 793 + }, + { + "epoch": 3.214574898785425, + "grad_norm": 1.8426615628835388, + "learning_rate": 8.583912565842258e-06, + "loss": 1.9304, + "step": 794 + }, + { + "epoch": 3.2186234817813766, + "grad_norm": 1.7414893453963287, + "learning_rate": 8.578981812463278e-06, + "loss": 1.7942, + "step": 795 + }, + { + "epoch": 3.2226720647773277, + "grad_norm": 1.9096193735192637, + "learning_rate": 8.574043911155148e-06, + "loss": 1.72, + "step": 796 + }, + { + "epoch": 3.2267206477732793, + "grad_norm": 1.8025258377815987, + "learning_rate": 8.569098871779828e-06, + "loss": 1.8542, + "step": 797 + }, + { + "epoch": 3.230769230769231, + "grad_norm": 1.8460762696682704, + "learning_rate": 8.56414670421353e-06, + "loss": 1.7101, + "step": 798 + }, + { + "epoch": 3.234817813765182, + "grad_norm": 1.9398991434247146, + "learning_rate": 8.559187418346703e-06, + "loss": 1.95, + "step": 799 + }, + { + "epoch": 3.2388663967611335, + "grad_norm": 1.8632306612622278, + "learning_rate": 8.554221024084019e-06, + "loss": 1.8895, + "step": 800 + }, + { + "epoch": 3.242914979757085, + "grad_norm": 1.893700967064052, + "learning_rate": 8.54924753134434e-06, + "loss": 1.873, + "step": 801 + }, + { + "epoch": 3.246963562753036, + "grad_norm": 1.7151529599583697, + "learning_rate": 8.544266950060706e-06, + "loss": 1.7236, + "step": 802 + }, + { + "epoch": 3.251012145748988, + "grad_norm": 1.7251248112215953, + "learning_rate": 8.539279290180315e-06, + "loss": 1.7693, + "step": 803 + }, + { + "epoch": 3.2550607287449393, + "grad_norm": 1.9817743209184147, + "learning_rate": 8.534284561664508e-06, + "loss": 1.8365, + "step": 804 + }, + { + "epoch": 3.2591093117408905, + "grad_norm": 1.8362666024929137, + "learning_rate": 8.529282774488731e-06, + "loss": 1.6791, + "step": 805 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 1.9144972025615734, + "learning_rate": 8.524273938642539e-06, + "loss": 1.5622, + "step": 806 + }, + { + "epoch": 3.2672064777327936, + "grad_norm": 1.8150113569889472, + "learning_rate": 8.519258064129559e-06, + "loss": 1.8107, + "step": 807 + }, + { + "epoch": 3.2712550607287447, + "grad_norm": 1.8132774105922835, + "learning_rate": 8.514235160967476e-06, + "loss": 1.8382, + "step": 808 + }, + { + "epoch": 3.2753036437246963, + "grad_norm": 1.7178012200999808, + "learning_rate": 8.509205239188017e-06, + "loss": 1.8519, + "step": 809 + }, + { + "epoch": 3.279352226720648, + "grad_norm": 2.2519702448886845, + "learning_rate": 8.504168308836918e-06, + "loss": 1.8559, + "step": 810 + }, + { + "epoch": 3.283400809716599, + "grad_norm": 2.1015013370666513, + "learning_rate": 8.499124379973922e-06, + "loss": 1.5602, + "step": 811 + }, + { + "epoch": 3.2874493927125505, + "grad_norm": 2.1456515647605365, + "learning_rate": 8.494073462672743e-06, + "loss": 1.6597, + "step": 812 + }, + { + "epoch": 3.291497975708502, + "grad_norm": 2.1425091129883613, + "learning_rate": 8.489015567021054e-06, + "loss": 1.5311, + "step": 813 + }, + { + "epoch": 3.2955465587044532, + "grad_norm": 2.1055979919937693, + "learning_rate": 8.483950703120466e-06, + "loss": 1.8547, + "step": 814 + }, + { + "epoch": 3.299595141700405, + "grad_norm": 1.9678625432719996, + "learning_rate": 8.478878881086505e-06, + "loss": 1.9357, + "step": 815 + }, + { + "epoch": 3.3036437246963564, + "grad_norm": 2.0317817207691538, + "learning_rate": 8.473800111048598e-06, + "loss": 1.6684, + "step": 816 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 2.0379814335298843, + "learning_rate": 8.468714403150043e-06, + "loss": 1.6929, + "step": 817 + }, + { + "epoch": 3.311740890688259, + "grad_norm": 1.9848650286398888, + "learning_rate": 8.463621767547998e-06, + "loss": 1.7112, + "step": 818 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 2.274800378339576, + "learning_rate": 8.458522214413455e-06, + "loss": 1.7005, + "step": 819 + }, + { + "epoch": 3.3198380566801617, + "grad_norm": 2.170751690325617, + "learning_rate": 8.453415753931223e-06, + "loss": 1.5995, + "step": 820 + }, + { + "epoch": 3.3238866396761133, + "grad_norm": 1.9913626012571344, + "learning_rate": 8.448302396299906e-06, + "loss": 1.6057, + "step": 821 + }, + { + "epoch": 3.327935222672065, + "grad_norm": 1.9395230430651595, + "learning_rate": 8.443182151731883e-06, + "loss": 1.6349, + "step": 822 + }, + { + "epoch": 3.331983805668016, + "grad_norm": 1.9091197381555691, + "learning_rate": 8.438055030453287e-06, + "loss": 1.5595, + "step": 823 + }, + { + "epoch": 3.3360323886639676, + "grad_norm": 1.8562911407114664, + "learning_rate": 8.432921042703985e-06, + "loss": 1.6019, + "step": 824 + }, + { + "epoch": 3.340080971659919, + "grad_norm": 1.7832079833064884, + "learning_rate": 8.42778019873756e-06, + "loss": 1.552, + "step": 825 + }, + { + "epoch": 3.3441295546558703, + "grad_norm": 1.8542638409385725, + "learning_rate": 8.422632508821284e-06, + "loss": 1.5851, + "step": 826 + }, + { + "epoch": 3.348178137651822, + "grad_norm": 2.1436195397021436, + "learning_rate": 8.417477983236107e-06, + "loss": 1.7666, + "step": 827 + }, + { + "epoch": 3.3522267206477734, + "grad_norm": 2.33071372223659, + "learning_rate": 8.412316632276627e-06, + "loss": 1.6497, + "step": 828 + }, + { + "epoch": 3.3562753036437245, + "grad_norm": 2.205436986044382, + "learning_rate": 8.407148466251072e-06, + "loss": 1.3523, + "step": 829 + }, + { + "epoch": 3.360323886639676, + "grad_norm": 2.2620315487409877, + "learning_rate": 8.401973495481289e-06, + "loss": 1.723, + "step": 830 + }, + { + "epoch": 3.3643724696356276, + "grad_norm": 2.180101120238927, + "learning_rate": 8.396791730302708e-06, + "loss": 1.8056, + "step": 831 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 1.990085418505961, + "learning_rate": 8.39160318106433e-06, + "loss": 1.7166, + "step": 832 + }, + { + "epoch": 3.3724696356275303, + "grad_norm": 2.40657553356096, + "learning_rate": 8.386407858128707e-06, + "loss": 1.8193, + "step": 833 + }, + { + "epoch": 3.376518218623482, + "grad_norm": 1.94489059367538, + "learning_rate": 8.381205771871918e-06, + "loss": 1.4172, + "step": 834 + }, + { + "epoch": 3.380566801619433, + "grad_norm": 2.150391672244522, + "learning_rate": 8.375996932683553e-06, + "loss": 1.5949, + "step": 835 + }, + { + "epoch": 3.3846153846153846, + "grad_norm": 2.0030590669894903, + "learning_rate": 8.370781350966683e-06, + "loss": 1.4156, + "step": 836 + }, + { + "epoch": 3.388663967611336, + "grad_norm": 2.197019034882382, + "learning_rate": 8.36555903713785e-06, + "loss": 1.4714, + "step": 837 + }, + { + "epoch": 3.3927125506072873, + "grad_norm": 2.078166195454461, + "learning_rate": 8.360330001627043e-06, + "loss": 1.6429, + "step": 838 + }, + { + "epoch": 3.396761133603239, + "grad_norm": 2.40629641977567, + "learning_rate": 8.355094254877665e-06, + "loss": 1.4713, + "step": 839 + }, + { + "epoch": 3.4008097165991904, + "grad_norm": 1.9645801904393803, + "learning_rate": 8.349851807346535e-06, + "loss": 1.5146, + "step": 840 + }, + { + "epoch": 3.4048582995951415, + "grad_norm": 1.9534289124567972, + "learning_rate": 8.344602669503849e-06, + "loss": 1.5871, + "step": 841 + }, + { + "epoch": 3.408906882591093, + "grad_norm": 2.3102884897188534, + "learning_rate": 8.339346851833163e-06, + "loss": 1.6862, + "step": 842 + }, + { + "epoch": 3.4129554655870447, + "grad_norm": 2.0401234182707406, + "learning_rate": 8.334084364831381e-06, + "loss": 1.5214, + "step": 843 + }, + { + "epoch": 3.417004048582996, + "grad_norm": 2.159768925630674, + "learning_rate": 8.328815219008719e-06, + "loss": 1.8219, + "step": 844 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 2.2204972461802757, + "learning_rate": 8.323539424888695e-06, + "loss": 1.8941, + "step": 845 + }, + { + "epoch": 3.425101214574899, + "grad_norm": 1.9873340221710971, + "learning_rate": 8.318256993008108e-06, + "loss": 1.7539, + "step": 846 + }, + { + "epoch": 3.42914979757085, + "grad_norm": 1.975202455896719, + "learning_rate": 8.31296793391701e-06, + "loss": 1.8598, + "step": 847 + }, + { + "epoch": 3.4331983805668016, + "grad_norm": 1.8415081642607933, + "learning_rate": 8.30767225817869e-06, + "loss": 1.9574, + "step": 848 + }, + { + "epoch": 3.437246963562753, + "grad_norm": 2.047274050267817, + "learning_rate": 8.302369976369651e-06, + "loss": 1.736, + "step": 849 + }, + { + "epoch": 3.4412955465587043, + "grad_norm": 2.1457366433830454, + "learning_rate": 8.297061099079592e-06, + "loss": 1.6581, + "step": 850 + }, + { + "epoch": 3.445344129554656, + "grad_norm": 1.8891113266245207, + "learning_rate": 8.291745636911382e-06, + "loss": 1.9183, + "step": 851 + }, + { + "epoch": 3.4493927125506074, + "grad_norm": 2.05347009046486, + "learning_rate": 8.286423600481044e-06, + "loss": 1.6869, + "step": 852 + }, + { + "epoch": 3.4534412955465585, + "grad_norm": 2.1578470259791795, + "learning_rate": 8.281095000417725e-06, + "loss": 1.6709, + "step": 853 + }, + { + "epoch": 3.45748987854251, + "grad_norm": 2.2158190833608606, + "learning_rate": 8.27575984736369e-06, + "loss": 2.079, + "step": 854 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 2.9226191862145265, + "learning_rate": 8.270418151974286e-06, + "loss": 2.3146, + "step": 855 + }, + { + "epoch": 3.465587044534413, + "grad_norm": 2.1657050143675205, + "learning_rate": 8.265069924917925e-06, + "loss": 1.9175, + "step": 856 + }, + { + "epoch": 3.4696356275303644, + "grad_norm": 1.7932680376129573, + "learning_rate": 8.259715176876069e-06, + "loss": 1.8725, + "step": 857 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 1.8709685644083165, + "learning_rate": 8.254353918543199e-06, + "loss": 1.7809, + "step": 858 + }, + { + "epoch": 3.477732793522267, + "grad_norm": 2.4167400582718694, + "learning_rate": 8.2489861606268e-06, + "loss": 1.8016, + "step": 859 + }, + { + "epoch": 3.4817813765182186, + "grad_norm": 1.659768741074137, + "learning_rate": 8.243611913847337e-06, + "loss": 1.7188, + "step": 860 + }, + { + "epoch": 3.48582995951417, + "grad_norm": 2.1480568234600668, + "learning_rate": 8.238231188938237e-06, + "loss": 1.6913, + "step": 861 + }, + { + "epoch": 3.4898785425101213, + "grad_norm": 2.461283879827119, + "learning_rate": 8.232843996645865e-06, + "loss": 1.6242, + "step": 862 + }, + { + "epoch": 3.493927125506073, + "grad_norm": 2.3643514071925056, + "learning_rate": 8.2274503477295e-06, + "loss": 1.6881, + "step": 863 + }, + { + "epoch": 3.4979757085020244, + "grad_norm": 3.087293785042021, + "learning_rate": 8.222050252961318e-06, + "loss": 1.5087, + "step": 864 + }, + { + "epoch": 3.5020242914979756, + "grad_norm": 2.105684160210004, + "learning_rate": 8.216643723126367e-06, + "loss": 1.4331, + "step": 865 + }, + { + "epoch": 3.506072874493927, + "grad_norm": 2.420952436641065, + "learning_rate": 8.211230769022552e-06, + "loss": 1.7553, + "step": 866 + }, + { + "epoch": 3.5101214574898787, + "grad_norm": 2.2746665377354116, + "learning_rate": 8.2058114014606e-06, + "loss": 1.782, + "step": 867 + }, + { + "epoch": 3.51417004048583, + "grad_norm": 1.6776374980476494, + "learning_rate": 8.200385631264051e-06, + "loss": 1.7357, + "step": 868 + }, + { + "epoch": 3.5182186234817814, + "grad_norm": 2.130957958265717, + "learning_rate": 8.19495346926924e-06, + "loss": 1.7569, + "step": 869 + }, + { + "epoch": 3.522267206477733, + "grad_norm": 2.1241420175580386, + "learning_rate": 8.189514926325255e-06, + "loss": 1.7036, + "step": 870 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 2.397883462392177, + "learning_rate": 8.184070013293936e-06, + "loss": 1.4984, + "step": 871 + }, + { + "epoch": 3.5303643724696356, + "grad_norm": 2.676554915114245, + "learning_rate": 8.178618741049841e-06, + "loss": 1.5719, + "step": 872 + }, + { + "epoch": 3.534412955465587, + "grad_norm": 2.641036334787177, + "learning_rate": 8.173161120480232e-06, + "loss": 1.7235, + "step": 873 + }, + { + "epoch": 3.5384615384615383, + "grad_norm": 2.4283908813712127, + "learning_rate": 8.16769716248505e-06, + "loss": 1.8976, + "step": 874 + }, + { + "epoch": 3.54251012145749, + "grad_norm": 1.9109389793413394, + "learning_rate": 8.162226877976886e-06, + "loss": 1.797, + "step": 875 + }, + { + "epoch": 3.5465587044534415, + "grad_norm": 3.1765952449893073, + "learning_rate": 8.156750277880979e-06, + "loss": 2.2212, + "step": 876 + }, + { + "epoch": 3.5506072874493926, + "grad_norm": 6.740978753214387, + "learning_rate": 8.15126737313517e-06, + "loss": 2.2759, + "step": 877 + }, + { + "epoch": 3.554655870445344, + "grad_norm": 6.646199027432937, + "learning_rate": 8.145778174689897e-06, + "loss": 2.5045, + "step": 878 + }, + { + "epoch": 3.5587044534412957, + "grad_norm": 1.9732928727215509, + "learning_rate": 8.140282693508168e-06, + "loss": 1.702, + "step": 879 + }, + { + "epoch": 3.562753036437247, + "grad_norm": 1.923113895215325, + "learning_rate": 8.134780940565535e-06, + "loss": 1.5859, + "step": 880 + }, + { + "epoch": 3.5668016194331984, + "grad_norm": 1.888490124882663, + "learning_rate": 8.129272926850079e-06, + "loss": 1.9019, + "step": 881 + }, + { + "epoch": 3.57085020242915, + "grad_norm": 2.0879599313529247, + "learning_rate": 8.123758663362386e-06, + "loss": 1.5424, + "step": 882 + }, + { + "epoch": 3.574898785425101, + "grad_norm": 2.1113301524020778, + "learning_rate": 8.118238161115523e-06, + "loss": 1.8581, + "step": 883 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 1.7105898329062328, + "learning_rate": 8.112711431135014e-06, + "loss": 1.5914, + "step": 884 + }, + { + "epoch": 3.582995951417004, + "grad_norm": 1.9358089378047225, + "learning_rate": 8.107178484458825e-06, + "loss": 1.7957, + "step": 885 + }, + { + "epoch": 3.5870445344129553, + "grad_norm": 1.9092777164097747, + "learning_rate": 8.101639332137337e-06, + "loss": 1.7404, + "step": 886 + }, + { + "epoch": 3.591093117408907, + "grad_norm": 2.098080272876577, + "learning_rate": 8.096093985233323e-06, + "loss": 1.7127, + "step": 887 + }, + { + "epoch": 3.5951417004048585, + "grad_norm": 2.4907144738421065, + "learning_rate": 8.090542454821929e-06, + "loss": 1.4308, + "step": 888 + }, + { + "epoch": 3.5991902834008096, + "grad_norm": 1.8678109793168913, + "learning_rate": 8.084984751990652e-06, + "loss": 1.4797, + "step": 889 + }, + { + "epoch": 3.603238866396761, + "grad_norm": 1.8961480105884363, + "learning_rate": 8.079420887839316e-06, + "loss": 1.6173, + "step": 890 + }, + { + "epoch": 3.6072874493927127, + "grad_norm": 1.9539785870788862, + "learning_rate": 8.073850873480047e-06, + "loss": 1.4952, + "step": 891 + }, + { + "epoch": 3.611336032388664, + "grad_norm": 2.31450202449626, + "learning_rate": 8.068274720037261e-06, + "loss": 1.813, + "step": 892 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 1.8087093273790038, + "learning_rate": 8.062692438647628e-06, + "loss": 1.7376, + "step": 893 + }, + { + "epoch": 3.619433198380567, + "grad_norm": 2.408589476299181, + "learning_rate": 8.057104040460062e-06, + "loss": 1.505, + "step": 894 + }, + { + "epoch": 3.623481781376518, + "grad_norm": 2.3231639351842035, + "learning_rate": 8.051509536635686e-06, + "loss": 1.9039, + "step": 895 + }, + { + "epoch": 3.6275303643724697, + "grad_norm": 1.9849491847712974, + "learning_rate": 8.045908938347828e-06, + "loss": 1.7125, + "step": 896 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 2.2249483352664026, + "learning_rate": 8.04030225678198e-06, + "loss": 1.9514, + "step": 897 + }, + { + "epoch": 3.6356275303643724, + "grad_norm": 2.005047614111562, + "learning_rate": 8.034689503135785e-06, + "loss": 1.597, + "step": 898 + }, + { + "epoch": 3.639676113360324, + "grad_norm": 2.2925145752574854, + "learning_rate": 8.029070688619013e-06, + "loss": 1.8072, + "step": 899 + }, + { + "epoch": 3.6437246963562755, + "grad_norm": 1.9475842419850795, + "learning_rate": 8.023445824453539e-06, + "loss": 1.7289, + "step": 900 + }, + { + "epoch": 3.6477732793522266, + "grad_norm": 2.071154449190338, + "learning_rate": 8.017814921873326e-06, + "loss": 1.7658, + "step": 901 + }, + { + "epoch": 3.651821862348178, + "grad_norm": 1.9935193669759015, + "learning_rate": 8.012177992124385e-06, + "loss": 1.6002, + "step": 902 + }, + { + "epoch": 3.6558704453441297, + "grad_norm": 2.2483209235168737, + "learning_rate": 8.006535046464774e-06, + "loss": 1.8275, + "step": 903 + }, + { + "epoch": 3.659919028340081, + "grad_norm": 2.5274264683222425, + "learning_rate": 8.000886096164564e-06, + "loss": 1.6502, + "step": 904 + }, + { + "epoch": 3.6639676113360324, + "grad_norm": 2.0119741262052195, + "learning_rate": 7.995231152505815e-06, + "loss": 1.8017, + "step": 905 + }, + { + "epoch": 3.668016194331984, + "grad_norm": 2.1027093845450233, + "learning_rate": 7.989570226782562e-06, + "loss": 1.8138, + "step": 906 + }, + { + "epoch": 3.672064777327935, + "grad_norm": 3.056649771146675, + "learning_rate": 7.983903330300782e-06, + "loss": 1.8128, + "step": 907 + }, + { + "epoch": 3.6761133603238867, + "grad_norm": 1.9139807090551522, + "learning_rate": 7.978230474378383e-06, + "loss": 1.7148, + "step": 908 + }, + { + "epoch": 3.6801619433198383, + "grad_norm": 2.416490627923619, + "learning_rate": 7.97255167034517e-06, + "loss": 1.7726, + "step": 909 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 2.053612332583323, + "learning_rate": 7.966866929542827e-06, + "loss": 1.5779, + "step": 910 + }, + { + "epoch": 3.688259109311741, + "grad_norm": 2.0666037215601505, + "learning_rate": 7.961176263324902e-06, + "loss": 1.7465, + "step": 911 + }, + { + "epoch": 3.6923076923076925, + "grad_norm": 2.1463137742100327, + "learning_rate": 7.955479683056767e-06, + "loss": 1.7608, + "step": 912 + }, + { + "epoch": 3.6963562753036436, + "grad_norm": 1.9232481327470194, + "learning_rate": 7.949777200115617e-06, + "loss": 1.5992, + "step": 913 + }, + { + "epoch": 3.700404858299595, + "grad_norm": 2.5029604743639515, + "learning_rate": 7.944068825890424e-06, + "loss": 2.089, + "step": 914 + }, + { + "epoch": 3.7044534412955468, + "grad_norm": 2.425403056999352, + "learning_rate": 7.938354571781933e-06, + "loss": 1.8514, + "step": 915 + }, + { + "epoch": 3.708502024291498, + "grad_norm": 2.2889869162476315, + "learning_rate": 7.932634449202635e-06, + "loss": 1.4493, + "step": 916 + }, + { + "epoch": 3.7125506072874495, + "grad_norm": 2.0245599708625988, + "learning_rate": 7.92690846957673e-06, + "loss": 1.6351, + "step": 917 + }, + { + "epoch": 3.716599190283401, + "grad_norm": 1.997997696536965, + "learning_rate": 7.921176644340132e-06, + "loss": 1.7253, + "step": 918 + }, + { + "epoch": 3.720647773279352, + "grad_norm": 2.344635708570945, + "learning_rate": 7.915438984940415e-06, + "loss": 1.5384, + "step": 919 + }, + { + "epoch": 3.7246963562753037, + "grad_norm": 2.399788568220564, + "learning_rate": 7.909695502836814e-06, + "loss": 1.6518, + "step": 920 + }, + { + "epoch": 3.7287449392712553, + "grad_norm": 2.258204100694036, + "learning_rate": 7.903946209500189e-06, + "loss": 1.8741, + "step": 921 + }, + { + "epoch": 3.7327935222672064, + "grad_norm": 1.9355255173187593, + "learning_rate": 7.898191116413007e-06, + "loss": 1.6996, + "step": 922 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 2.1474241115417425, + "learning_rate": 7.892430235069317e-06, + "loss": 1.7427, + "step": 923 + }, + { + "epoch": 3.7408906882591095, + "grad_norm": 3.071687208613463, + "learning_rate": 7.886663576974733e-06, + "loss": 2.4106, + "step": 924 + }, + { + "epoch": 3.7449392712550607, + "grad_norm": 2.0799708188253465, + "learning_rate": 7.880891153646401e-06, + "loss": 1.808, + "step": 925 + }, + { + "epoch": 3.748987854251012, + "grad_norm": 2.4353787137639453, + "learning_rate": 7.875112976612984e-06, + "loss": 1.6368, + "step": 926 + }, + { + "epoch": 3.753036437246964, + "grad_norm": 2.159792334487355, + "learning_rate": 7.869329057414635e-06, + "loss": 1.5175, + "step": 927 + }, + { + "epoch": 3.757085020242915, + "grad_norm": 2.0548605804443274, + "learning_rate": 7.863539407602976e-06, + "loss": 1.7423, + "step": 928 + }, + { + "epoch": 3.7611336032388665, + "grad_norm": 3.9628857560933324, + "learning_rate": 7.857744038741076e-06, + "loss": 2.5332, + "step": 929 + }, + { + "epoch": 3.765182186234818, + "grad_norm": 4.514218437938051, + "learning_rate": 7.85194296240342e-06, + "loss": 2.3287, + "step": 930 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 5.356074790215057, + "learning_rate": 7.846136190175901e-06, + "loss": 2.1714, + "step": 931 + }, + { + "epoch": 3.7732793522267207, + "grad_norm": 2.238703863406207, + "learning_rate": 7.84032373365578e-06, + "loss": 1.671, + "step": 932 + }, + { + "epoch": 3.7773279352226723, + "grad_norm": 2.194562792441507, + "learning_rate": 7.834505604451672e-06, + "loss": 1.9108, + "step": 933 + }, + { + "epoch": 3.7813765182186234, + "grad_norm": 2.085928113902739, + "learning_rate": 7.828681814183527e-06, + "loss": 1.9396, + "step": 934 + }, + { + "epoch": 3.785425101214575, + "grad_norm": 2.215253557008417, + "learning_rate": 7.822852374482597e-06, + "loss": 1.7587, + "step": 935 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 3.010107826761077, + "learning_rate": 7.817017296991411e-06, + "loss": 1.6507, + "step": 936 + }, + { + "epoch": 3.7935222672064777, + "grad_norm": 2.25886892537205, + "learning_rate": 7.811176593363771e-06, + "loss": 1.7372, + "step": 937 + }, + { + "epoch": 3.7975708502024292, + "grad_norm": 2.2130344020805297, + "learning_rate": 7.805330275264707e-06, + "loss": 1.7485, + "step": 938 + }, + { + "epoch": 3.801619433198381, + "grad_norm": 2.0367189537336907, + "learning_rate": 7.79947835437046e-06, + "loss": 1.5515, + "step": 939 + }, + { + "epoch": 3.805668016194332, + "grad_norm": 2.070856690389127, + "learning_rate": 7.79362084236847e-06, + "loss": 1.4447, + "step": 940 + }, + { + "epoch": 3.8097165991902835, + "grad_norm": 2.1857926637124794, + "learning_rate": 7.787757750957335e-06, + "loss": 1.8015, + "step": 941 + }, + { + "epoch": 3.813765182186235, + "grad_norm": 2.6872149719652305, + "learning_rate": 7.781889091846799e-06, + "loss": 1.7528, + "step": 942 + }, + { + "epoch": 3.817813765182186, + "grad_norm": 2.3048135110635264, + "learning_rate": 7.776014876757727e-06, + "loss": 1.5226, + "step": 943 + }, + { + "epoch": 3.8218623481781377, + "grad_norm": 8.991127581731243, + "learning_rate": 7.77013511742208e-06, + "loss": 2.3966, + "step": 944 + }, + { + "epoch": 3.8259109311740893, + "grad_norm": 19.276037930316928, + "learning_rate": 7.76424982558289e-06, + "loss": 3.7738, + "step": 945 + }, + { + "epoch": 3.8299595141700404, + "grad_norm": 2.4583074183525677, + "learning_rate": 7.758359012994242e-06, + "loss": 1.6137, + "step": 946 + }, + { + "epoch": 3.834008097165992, + "grad_norm": 2.405931055156567, + "learning_rate": 7.752462691421245e-06, + "loss": 1.4666, + "step": 947 + }, + { + "epoch": 3.8380566801619436, + "grad_norm": 2.114379083785604, + "learning_rate": 7.746560872640007e-06, + "loss": 1.5791, + "step": 948 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 2.1946059502111845, + "learning_rate": 7.740653568437623e-06, + "loss": 1.5937, + "step": 949 + }, + { + "epoch": 3.8461538461538463, + "grad_norm": 2.3168344745949456, + "learning_rate": 7.734740790612137e-06, + "loss": 1.5169, + "step": 950 + }, + { + "epoch": 3.850202429149798, + "grad_norm": 2.3139829718351197, + "learning_rate": 7.728822550972523e-06, + "loss": 1.6162, + "step": 951 + }, + { + "epoch": 3.854251012145749, + "grad_norm": 2.5483408296020764, + "learning_rate": 7.722898861338674e-06, + "loss": 1.7001, + "step": 952 + }, + { + "epoch": 3.8582995951417005, + "grad_norm": 1.917540396918308, + "learning_rate": 7.716969733541357e-06, + "loss": 1.6257, + "step": 953 + }, + { + "epoch": 3.862348178137652, + "grad_norm": 2.4091479518780177, + "learning_rate": 7.711035179422205e-06, + "loss": 1.6058, + "step": 954 + }, + { + "epoch": 3.866396761133603, + "grad_norm": 2.4390857592479183, + "learning_rate": 7.705095210833687e-06, + "loss": 1.6468, + "step": 955 + }, + { + "epoch": 3.8704453441295548, + "grad_norm": 3.01025731676863, + "learning_rate": 7.699149839639086e-06, + "loss": 2.1392, + "step": 956 + }, + { + "epoch": 3.8744939271255063, + "grad_norm": 2.6957364897623473, + "learning_rate": 7.693199077712476e-06, + "loss": 2.0741, + "step": 957 + }, + { + "epoch": 3.8785425101214575, + "grad_norm": 2.6726767004932395, + "learning_rate": 7.687242936938694e-06, + "loss": 1.8205, + "step": 958 + }, + { + "epoch": 3.882591093117409, + "grad_norm": 2.3223231672079727, + "learning_rate": 7.681281429213328e-06, + "loss": 1.7239, + "step": 959 + }, + { + "epoch": 3.8866396761133606, + "grad_norm": 2.4223424195591505, + "learning_rate": 7.675314566442673e-06, + "loss": 1.2702, + "step": 960 + }, + { + "epoch": 3.8906882591093117, + "grad_norm": 2.1111739790928024, + "learning_rate": 7.669342360543727e-06, + "loss": 1.7654, + "step": 961 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 2.0865715931939968, + "learning_rate": 7.663364823444157e-06, + "loss": 1.8567, + "step": 962 + }, + { + "epoch": 3.898785425101215, + "grad_norm": 1.9521945713254736, + "learning_rate": 7.65738196708228e-06, + "loss": 1.5513, + "step": 963 + }, + { + "epoch": 3.902834008097166, + "grad_norm": 2.252893420029499, + "learning_rate": 7.651393803407032e-06, + "loss": 1.6101, + "step": 964 + }, + { + "epoch": 3.9068825910931175, + "grad_norm": 2.445627287506017, + "learning_rate": 7.645400344377953e-06, + "loss": 1.7802, + "step": 965 + }, + { + "epoch": 3.910931174089069, + "grad_norm": 2.206311718559999, + "learning_rate": 7.639401601965158e-06, + "loss": 1.3433, + "step": 966 + }, + { + "epoch": 3.91497975708502, + "grad_norm": 2.5126306064577935, + "learning_rate": 7.63339758814931e-06, + "loss": 1.4571, + "step": 967 + }, + { + "epoch": 3.919028340080972, + "grad_norm": 2.301201962037062, + "learning_rate": 7.627388314921602e-06, + "loss": 1.4798, + "step": 968 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 2.0505587515987265, + "learning_rate": 7.621373794283735e-06, + "loss": 1.7924, + "step": 969 + }, + { + "epoch": 3.9271255060728745, + "grad_norm": 2.716118255543476, + "learning_rate": 7.615354038247889e-06, + "loss": 1.6337, + "step": 970 + }, + { + "epoch": 3.931174089068826, + "grad_norm": 2.636209282969381, + "learning_rate": 7.609329058836694e-06, + "loss": 1.6699, + "step": 971 + }, + { + "epoch": 3.9352226720647776, + "grad_norm": 2.3802398786409107, + "learning_rate": 7.6032988680832195e-06, + "loss": 1.4692, + "step": 972 + }, + { + "epoch": 3.9392712550607287, + "grad_norm": 2.5735078826994844, + "learning_rate": 7.597263478030939e-06, + "loss": 1.3909, + "step": 973 + }, + { + "epoch": 3.9433198380566803, + "grad_norm": 2.986329351018389, + "learning_rate": 7.59122290073371e-06, + "loss": 1.6787, + "step": 974 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 2.4407323865905015, + "learning_rate": 7.5851771482557535e-06, + "loss": 1.4349, + "step": 975 + }, + { + "epoch": 3.951417004048583, + "grad_norm": 2.8570555841909657, + "learning_rate": 7.579126232671621e-06, + "loss": 1.4016, + "step": 976 + }, + { + "epoch": 3.9554655870445345, + "grad_norm": 3.322338952206099, + "learning_rate": 7.5730701660661795e-06, + "loss": 1.6104, + "step": 977 + }, + { + "epoch": 3.9595141700404857, + "grad_norm": 2.5182830088343082, + "learning_rate": 7.567008960534585e-06, + "loss": 1.6231, + "step": 978 + }, + { + "epoch": 3.9635627530364372, + "grad_norm": 2.1739951186703923, + "learning_rate": 7.560942628182251e-06, + "loss": 1.6679, + "step": 979 + }, + { + "epoch": 3.967611336032389, + "grad_norm": 2.5756124639646982, + "learning_rate": 7.554871181124836e-06, + "loss": 1.8633, + "step": 980 + }, + { + "epoch": 3.97165991902834, + "grad_norm": 3.073388081199716, + "learning_rate": 7.548794631488211e-06, + "loss": 1.768, + "step": 981 + }, + { + "epoch": 3.9757085020242915, + "grad_norm": 2.1012291254049797, + "learning_rate": 7.5427129914084385e-06, + "loss": 1.6442, + "step": 982 + }, + { + "epoch": 3.979757085020243, + "grad_norm": 2.351295674425286, + "learning_rate": 7.536626273031747e-06, + "loss": 1.7358, + "step": 983 + }, + { + "epoch": 3.983805668016194, + "grad_norm": 2.115853749649768, + "learning_rate": 7.530534488514507e-06, + "loss": 1.8024, + "step": 984 + }, + { + "epoch": 3.9878542510121457, + "grad_norm": 2.454948116388734, + "learning_rate": 7.524437650023211e-06, + "loss": 1.6063, + "step": 985 + }, + { + "epoch": 3.9919028340080973, + "grad_norm": 2.043008387794743, + "learning_rate": 7.5183357697344395e-06, + "loss": 1.5544, + "step": 986 + }, + { + "epoch": 3.9959514170040484, + "grad_norm": 1.8968397388893163, + "learning_rate": 7.512228859834845e-06, + "loss": 1.8733, + "step": 987 + }, + { + "epoch": 4.0, + "grad_norm": 2.2142162316932255, + "learning_rate": 7.506116932521127e-06, + "loss": 1.6136, + "step": 988 + }, + { + "epoch": 4.004048582995951, + "grad_norm": 2.080064737878757, + "learning_rate": 7.500000000000001e-06, + "loss": 1.6735, + "step": 989 + }, + { + "epoch": 4.008097165991903, + "grad_norm": 2.8195577020771863, + "learning_rate": 7.493878074488184e-06, + "loss": 1.8144, + "step": 990 + }, + { + "epoch": 4.012145748987854, + "grad_norm": 2.861434123319288, + "learning_rate": 7.4877511682123635e-06, + "loss": 1.6734, + "step": 991 + }, + { + "epoch": 4.016194331983805, + "grad_norm": 3.0695960191225247, + "learning_rate": 7.481619293409173e-06, + "loss": 1.8495, + "step": 992 + }, + { + "epoch": 4.020242914979757, + "grad_norm": 2.580474309033628, + "learning_rate": 7.475482462325169e-06, + "loss": 2.099, + "step": 993 + }, + { + "epoch": 4.0242914979757085, + "grad_norm": 2.721243409721488, + "learning_rate": 7.469340687216809e-06, + "loss": 1.9446, + "step": 994 + }, + { + "epoch": 4.02834008097166, + "grad_norm": 2.3410049191202074, + "learning_rate": 7.4631939803504215e-06, + "loss": 1.6196, + "step": 995 + }, + { + "epoch": 4.032388663967612, + "grad_norm": 2.720885518023577, + "learning_rate": 7.4570423540021905e-06, + "loss": 1.6221, + "step": 996 + }, + { + "epoch": 4.036437246963563, + "grad_norm": 2.5413861683291996, + "learning_rate": 7.450885820458117e-06, + "loss": 1.8749, + "step": 997 + }, + { + "epoch": 4.040485829959514, + "grad_norm": 2.5863690862096957, + "learning_rate": 7.44472439201401e-06, + "loss": 1.6649, + "step": 998 + }, + { + "epoch": 4.044534412955466, + "grad_norm": 2.371552718771952, + "learning_rate": 7.438558080975449e-06, + "loss": 1.6799, + "step": 999 + }, + { + "epoch": 4.048582995951417, + "grad_norm": 2.5691951258164063, + "learning_rate": 7.4323868996577696e-06, + "loss": 1.63, + "step": 1000 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 2.675468998968646, + "learning_rate": 7.426210860386032e-06, + "loss": 1.7354, + "step": 1001 + }, + { + "epoch": 4.05668016194332, + "grad_norm": 2.58607973493479, + "learning_rate": 7.420029975494996e-06, + "loss": 1.5703, + "step": 1002 + }, + { + "epoch": 4.060728744939271, + "grad_norm": 2.475852723612659, + "learning_rate": 7.413844257329104e-06, + "loss": 1.749, + "step": 1003 + }, + { + "epoch": 4.064777327935222, + "grad_norm": 2.625704853477589, + "learning_rate": 7.407653718242449e-06, + "loss": 1.6948, + "step": 1004 + }, + { + "epoch": 4.068825910931174, + "grad_norm": 2.7272435081151283, + "learning_rate": 7.401458370598753e-06, + "loss": 1.8281, + "step": 1005 + }, + { + "epoch": 4.0728744939271255, + "grad_norm": 2.507953052399452, + "learning_rate": 7.395258226771341e-06, + "loss": 1.7673, + "step": 1006 + }, + { + "epoch": 4.076923076923077, + "grad_norm": 2.5085283118904074, + "learning_rate": 7.3890532991431174e-06, + "loss": 1.6958, + "step": 1007 + }, + { + "epoch": 4.080971659919029, + "grad_norm": 2.388953051348741, + "learning_rate": 7.382843600106539e-06, + "loss": 1.7112, + "step": 1008 + }, + { + "epoch": 4.08502024291498, + "grad_norm": 2.2236808085380644, + "learning_rate": 7.376629142063597e-06, + "loss": 1.7162, + "step": 1009 + }, + { + "epoch": 4.089068825910931, + "grad_norm": 2.7412048035286505, + "learning_rate": 7.370409937425781e-06, + "loss": 1.7045, + "step": 1010 + }, + { + "epoch": 4.093117408906883, + "grad_norm": 2.3839251838504367, + "learning_rate": 7.364185998614064e-06, + "loss": 1.7854, + "step": 1011 + }, + { + "epoch": 4.097165991902834, + "grad_norm": 2.383572557144146, + "learning_rate": 7.357957338058873e-06, + "loss": 1.534, + "step": 1012 + }, + { + "epoch": 4.101214574898785, + "grad_norm": 2.7483936941368996, + "learning_rate": 7.3517239682000675e-06, + "loss": 1.7001, + "step": 1013 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 2.6910416116843257, + "learning_rate": 7.345485901486908e-06, + "loss": 1.7037, + "step": 1014 + }, + { + "epoch": 4.109311740890688, + "grad_norm": 2.677750230508956, + "learning_rate": 7.33924315037804e-06, + "loss": 1.6197, + "step": 1015 + }, + { + "epoch": 4.113360323886639, + "grad_norm": 3.1184294482443717, + "learning_rate": 7.332995727341462e-06, + "loss": 1.5587, + "step": 1016 + }, + { + "epoch": 4.117408906882591, + "grad_norm": 2.697817221643411, + "learning_rate": 7.326743644854504e-06, + "loss": 1.4804, + "step": 1017 + }, + { + "epoch": 4.1214574898785425, + "grad_norm": 2.5533427892436364, + "learning_rate": 7.3204869154038015e-06, + "loss": 1.5149, + "step": 1018 + }, + { + "epoch": 4.125506072874494, + "grad_norm": 2.7058477331519604, + "learning_rate": 7.314225551485273e-06, + "loss": 1.5156, + "step": 1019 + }, + { + "epoch": 4.129554655870446, + "grad_norm": 2.8633359493766384, + "learning_rate": 7.30795956560409e-06, + "loss": 1.4187, + "step": 1020 + }, + { + "epoch": 4.133603238866397, + "grad_norm": 2.346585899707522, + "learning_rate": 7.301688970274655e-06, + "loss": 1.7718, + "step": 1021 + }, + { + "epoch": 4.137651821862348, + "grad_norm": 2.8346595314782568, + "learning_rate": 7.295413778020579e-06, + "loss": 1.6181, + "step": 1022 + }, + { + "epoch": 4.1417004048583, + "grad_norm": 2.1328033209542046, + "learning_rate": 7.289134001374654e-06, + "loss": 1.3513, + "step": 1023 + }, + { + "epoch": 4.145748987854251, + "grad_norm": 2.723527413205223, + "learning_rate": 7.282849652878824e-06, + "loss": 1.7449, + "step": 1024 + }, + { + "epoch": 4.149797570850202, + "grad_norm": 2.6296530406635648, + "learning_rate": 7.276560745084167e-06, + "loss": 1.56, + "step": 1025 + }, + { + "epoch": 4.153846153846154, + "grad_norm": 2.3607444563571645, + "learning_rate": 7.2702672905508656e-06, + "loss": 1.7373, + "step": 1026 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 2.857459652562985, + "learning_rate": 7.263969301848188e-06, + "loss": 1.8929, + "step": 1027 + }, + { + "epoch": 4.161943319838056, + "grad_norm": 2.416479591453608, + "learning_rate": 7.257666791554448e-06, + "loss": 1.6155, + "step": 1028 + }, + { + "epoch": 4.165991902834008, + "grad_norm": 2.485932817739182, + "learning_rate": 7.251359772256998e-06, + "loss": 1.6856, + "step": 1029 + }, + { + "epoch": 4.17004048582996, + "grad_norm": 2.2601305066652664, + "learning_rate": 7.245048256552195e-06, + "loss": 2.1658, + "step": 1030 + }, + { + "epoch": 4.174089068825911, + "grad_norm": 2.4736185296097566, + "learning_rate": 7.2387322570453724e-06, + "loss": 1.5329, + "step": 1031 + }, + { + "epoch": 4.178137651821863, + "grad_norm": 2.902522379367228, + "learning_rate": 7.232411786350824e-06, + "loss": 1.7115, + "step": 1032 + }, + { + "epoch": 4.182186234817814, + "grad_norm": 2.1213589715944594, + "learning_rate": 7.226086857091765e-06, + "loss": 1.5227, + "step": 1033 + }, + { + "epoch": 4.186234817813765, + "grad_norm": 2.8619121355527968, + "learning_rate": 7.219757481900325e-06, + "loss": 1.6826, + "step": 1034 + }, + { + "epoch": 4.190283400809717, + "grad_norm": 2.5322052891357867, + "learning_rate": 7.213423673417508e-06, + "loss": 1.7019, + "step": 1035 + }, + { + "epoch": 4.194331983805668, + "grad_norm": 2.868097930235534, + "learning_rate": 7.207085444293172e-06, + "loss": 1.8899, + "step": 1036 + }, + { + "epoch": 4.198380566801619, + "grad_norm": 2.5521158066560288, + "learning_rate": 7.2007428071860045e-06, + "loss": 1.9495, + "step": 1037 + }, + { + "epoch": 4.202429149797571, + "grad_norm": 2.63283746068705, + "learning_rate": 7.194395774763496e-06, + "loss": 1.6451, + "step": 1038 + }, + { + "epoch": 4.206477732793522, + "grad_norm": 3.020988257996165, + "learning_rate": 7.188044359701917e-06, + "loss": 1.9686, + "step": 1039 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 2.7497468285413267, + "learning_rate": 7.181688574686292e-06, + "loss": 2.0078, + "step": 1040 + }, + { + "epoch": 4.2145748987854255, + "grad_norm": 2.4897799224246873, + "learning_rate": 7.175328432410367e-06, + "loss": 1.7921, + "step": 1041 + }, + { + "epoch": 4.218623481781377, + "grad_norm": 2.470322521256254, + "learning_rate": 7.168963945576597e-06, + "loss": 1.6719, + "step": 1042 + }, + { + "epoch": 4.222672064777328, + "grad_norm": 2.6592137837660266, + "learning_rate": 7.162595126896111e-06, + "loss": 1.5749, + "step": 1043 + }, + { + "epoch": 4.22672064777328, + "grad_norm": 2.533296478811204, + "learning_rate": 7.15622198908869e-06, + "loss": 1.7352, + "step": 1044 + }, + { + "epoch": 4.230769230769231, + "grad_norm": 2.5992050846283354, + "learning_rate": 7.149844544882742e-06, + "loss": 1.5639, + "step": 1045 + }, + { + "epoch": 4.234817813765182, + "grad_norm": 2.7675121593200367, + "learning_rate": 7.143462807015271e-06, + "loss": 1.8108, + "step": 1046 + }, + { + "epoch": 4.238866396761134, + "grad_norm": 2.658793190465704, + "learning_rate": 7.137076788231865e-06, + "loss": 1.7457, + "step": 1047 + }, + { + "epoch": 4.242914979757085, + "grad_norm": 2.604959217646965, + "learning_rate": 7.130686501286655e-06, + "loss": 1.7451, + "step": 1048 + }, + { + "epoch": 4.246963562753036, + "grad_norm": 2.5111072223063897, + "learning_rate": 7.1242919589422974e-06, + "loss": 1.5808, + "step": 1049 + }, + { + "epoch": 4.251012145748988, + "grad_norm": 2.4705422975939775, + "learning_rate": 7.11789317396995e-06, + "loss": 1.6597, + "step": 1050 + }, + { + "epoch": 4.255060728744939, + "grad_norm": 2.8012872307046726, + "learning_rate": 7.1114901591492404e-06, + "loss": 1.6728, + "step": 1051 + }, + { + "epoch": 4.2591093117408905, + "grad_norm": 2.376781495157912, + "learning_rate": 7.105082927268247e-06, + "loss": 1.561, + "step": 1052 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 2.5702431118604423, + "learning_rate": 7.0986714911234715e-06, + "loss": 1.4172, + "step": 1053 + }, + { + "epoch": 4.267206477732794, + "grad_norm": 2.508325280537679, + "learning_rate": 7.092255863519806e-06, + "loss": 1.6779, + "step": 1054 + }, + { + "epoch": 4.271255060728745, + "grad_norm": 2.540012700506, + "learning_rate": 7.085836057270521e-06, + "loss": 1.6985, + "step": 1055 + }, + { + "epoch": 4.275303643724697, + "grad_norm": 2.471796434580062, + "learning_rate": 7.079412085197229e-06, + "loss": 1.7301, + "step": 1056 + }, + { + "epoch": 4.279352226720648, + "grad_norm": 3.3244889584848107, + "learning_rate": 7.072983960129862e-06, + "loss": 1.7094, + "step": 1057 + }, + { + "epoch": 4.283400809716599, + "grad_norm": 2.983349503659567, + "learning_rate": 7.066551694906651e-06, + "loss": 1.3989, + "step": 1058 + }, + { + "epoch": 4.287449392712551, + "grad_norm": 3.036520426590972, + "learning_rate": 7.060115302374087e-06, + "loss": 1.5257, + "step": 1059 + }, + { + "epoch": 4.291497975708502, + "grad_norm": 3.2696461082092068, + "learning_rate": 7.053674795386914e-06, + "loss": 1.3769, + "step": 1060 + }, + { + "epoch": 4.295546558704453, + "grad_norm": 3.066097380387373, + "learning_rate": 7.047230186808085e-06, + "loss": 1.6842, + "step": 1061 + }, + { + "epoch": 4.299595141700405, + "grad_norm": 2.6903089198270855, + "learning_rate": 7.04078148950875e-06, + "loss": 1.8088, + "step": 1062 + }, + { + "epoch": 4.303643724696356, + "grad_norm": 2.8258995708159773, + "learning_rate": 7.034328716368224e-06, + "loss": 1.5156, + "step": 1063 + }, + { + "epoch": 4.3076923076923075, + "grad_norm": 2.858420747113862, + "learning_rate": 7.027871880273959e-06, + "loss": 1.5394, + "step": 1064 + }, + { + "epoch": 4.3117408906882595, + "grad_norm": 2.7740108493498323, + "learning_rate": 7.021410994121525e-06, + "loss": 1.549, + "step": 1065 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 3.219790325593576, + "learning_rate": 7.014946070814583e-06, + "loss": 1.5296, + "step": 1066 + }, + { + "epoch": 4.319838056680162, + "grad_norm": 3.0526696821998094, + "learning_rate": 7.008477123264849e-06, + "loss": 1.4361, + "step": 1067 + }, + { + "epoch": 4.323886639676114, + "grad_norm": 2.9571662763160136, + "learning_rate": 7.0020041643920826e-06, + "loss": 1.4498, + "step": 1068 + }, + { + "epoch": 4.327935222672065, + "grad_norm": 2.819893094328226, + "learning_rate": 6.995527207124053e-06, + "loss": 1.4853, + "step": 1069 + }, + { + "epoch": 4.331983805668016, + "grad_norm": 2.7252255526223625, + "learning_rate": 6.989046264396516e-06, + "loss": 1.4535, + "step": 1070 + }, + { + "epoch": 4.336032388663968, + "grad_norm": 2.6189552228263753, + "learning_rate": 6.982561349153188e-06, + "loss": 1.5022, + "step": 1071 + }, + { + "epoch": 4.340080971659919, + "grad_norm": 2.568082005220546, + "learning_rate": 6.976072474345713e-06, + "loss": 1.4532, + "step": 1072 + }, + { + "epoch": 4.34412955465587, + "grad_norm": 2.623502257576312, + "learning_rate": 6.96957965293365e-06, + "loss": 1.4399, + "step": 1073 + }, + { + "epoch": 4.348178137651822, + "grad_norm": 3.1483597392827045, + "learning_rate": 6.963082897884439e-06, + "loss": 1.615, + "step": 1074 + }, + { + "epoch": 4.352226720647773, + "grad_norm": 3.8022601065423123, + "learning_rate": 6.956582222173374e-06, + "loss": 1.5412, + "step": 1075 + }, + { + "epoch": 4.3562753036437245, + "grad_norm": 3.177062030751366, + "learning_rate": 6.9500776387835785e-06, + "loss": 1.2047, + "step": 1076 + }, + { + "epoch": 4.3603238866396765, + "grad_norm": 3.185748452470112, + "learning_rate": 6.943569160705985e-06, + "loss": 1.6101, + "step": 1077 + }, + { + "epoch": 4.364372469635628, + "grad_norm": 2.9943825828047954, + "learning_rate": 6.9370568009393e-06, + "loss": 1.6897, + "step": 1078 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 2.8396585705303297, + "learning_rate": 6.9305405724899876e-06, + "loss": 1.6066, + "step": 1079 + }, + { + "epoch": 4.372469635627531, + "grad_norm": 3.4103100269352504, + "learning_rate": 6.924020488372229e-06, + "loss": 1.6845, + "step": 1080 + }, + { + "epoch": 4.376518218623482, + "grad_norm": 2.8184107943036323, + "learning_rate": 6.917496561607915e-06, + "loss": 1.3205, + "step": 1081 + }, + { + "epoch": 4.380566801619433, + "grad_norm": 3.152451887221124, + "learning_rate": 6.91096880522661e-06, + "loss": 1.4827, + "step": 1082 + }, + { + "epoch": 4.384615384615385, + "grad_norm": 2.8506198416780317, + "learning_rate": 6.904437232265521e-06, + "loss": 1.2814, + "step": 1083 + }, + { + "epoch": 4.388663967611336, + "grad_norm": 3.2465586785242033, + "learning_rate": 6.897901855769483e-06, + "loss": 1.3431, + "step": 1084 + }, + { + "epoch": 4.392712550607287, + "grad_norm": 3.077940405612511, + "learning_rate": 6.891362688790925e-06, + "loss": 1.5208, + "step": 1085 + }, + { + "epoch": 4.396761133603239, + "grad_norm": 3.4135560109047005, + "learning_rate": 6.884819744389848e-06, + "loss": 1.3629, + "step": 1086 + }, + { + "epoch": 4.40080971659919, + "grad_norm": 2.6507174805524727, + "learning_rate": 6.878273035633795e-06, + "loss": 1.3853, + "step": 1087 + }, + { + "epoch": 4.4048582995951415, + "grad_norm": 2.5895703393651637, + "learning_rate": 6.871722575597829e-06, + "loss": 1.4423, + "step": 1088 + }, + { + "epoch": 4.4089068825910935, + "grad_norm": 3.2322118670425777, + "learning_rate": 6.865168377364506e-06, + "loss": 1.5468, + "step": 1089 + }, + { + "epoch": 4.412955465587045, + "grad_norm": 2.942042054251793, + "learning_rate": 6.858610454023842e-06, + "loss": 1.36, + "step": 1090 + }, + { + "epoch": 4.417004048582996, + "grad_norm": 3.122031784641475, + "learning_rate": 6.8520488186733e-06, + "loss": 1.6917, + "step": 1091 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 3.2313772685904847, + "learning_rate": 6.845483484417756e-06, + "loss": 1.7526, + "step": 1092 + }, + { + "epoch": 4.425101214574899, + "grad_norm": 2.8735793462178023, + "learning_rate": 6.838914464369467e-06, + "loss": 1.6487, + "step": 1093 + }, + { + "epoch": 4.42914979757085, + "grad_norm": 2.954566180150772, + "learning_rate": 6.832341771648057e-06, + "loss": 1.7096, + "step": 1094 + }, + { + "epoch": 4.433198380566802, + "grad_norm": 2.587188799407319, + "learning_rate": 6.825765419380484e-06, + "loss": 1.8456, + "step": 1095 + }, + { + "epoch": 4.437246963562753, + "grad_norm": 3.0518891038101925, + "learning_rate": 6.819185420701011e-06, + "loss": 1.6224, + "step": 1096 + }, + { + "epoch": 4.441295546558704, + "grad_norm": 3.118348281802091, + "learning_rate": 6.812601788751192e-06, + "loss": 1.5498, + "step": 1097 + }, + { + "epoch": 4.445344129554655, + "grad_norm": 2.894711350660116, + "learning_rate": 6.806014536679828e-06, + "loss": 1.8041, + "step": 1098 + }, + { + "epoch": 4.449392712550607, + "grad_norm": 3.062471930595446, + "learning_rate": 6.7994236776429555e-06, + "loss": 1.5815, + "step": 1099 + }, + { + "epoch": 4.4534412955465585, + "grad_norm": 3.0993288240233263, + "learning_rate": 6.792829224803816e-06, + "loss": 1.5695, + "step": 1100 + }, + { + "epoch": 4.4574898785425106, + "grad_norm": 3.149585012325393, + "learning_rate": 6.7862311913328235e-06, + "loss": 1.9487, + "step": 1101 + }, + { + "epoch": 4.461538461538462, + "grad_norm": 4.120477147155456, + "learning_rate": 6.779629590407547e-06, + "loss": 2.1517, + "step": 1102 + }, + { + "epoch": 4.465587044534413, + "grad_norm": 3.1988261301020855, + "learning_rate": 6.773024435212678e-06, + "loss": 1.79, + "step": 1103 + }, + { + "epoch": 4.469635627530364, + "grad_norm": 2.6369221757485457, + "learning_rate": 6.7664157389400095e-06, + "loss": 1.7651, + "step": 1104 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 2.7091701203884364, + "learning_rate": 6.7598035147884055e-06, + "loss": 1.6839, + "step": 1105 + }, + { + "epoch": 4.477732793522267, + "grad_norm": 3.4306069422759005, + "learning_rate": 6.753187775963773e-06, + "loss": 1.692, + "step": 1106 + }, + { + "epoch": 4.481781376518219, + "grad_norm": 2.386072562379964, + "learning_rate": 6.746568535679041e-06, + "loss": 1.6155, + "step": 1107 + }, + { + "epoch": 4.48582995951417, + "grad_norm": 2.851423578739297, + "learning_rate": 6.739945807154136e-06, + "loss": 1.5755, + "step": 1108 + }, + { + "epoch": 4.489878542510121, + "grad_norm": 3.3510139502859206, + "learning_rate": 6.733319603615941e-06, + "loss": 1.5105, + "step": 1109 + }, + { + "epoch": 4.493927125506072, + "grad_norm": 3.329100996808692, + "learning_rate": 6.726689938298289e-06, + "loss": 1.568, + "step": 1110 + }, + { + "epoch": 4.497975708502024, + "grad_norm": 2.7974205212393057, + "learning_rate": 6.72005682444192e-06, + "loss": 1.4162, + "step": 1111 + }, + { + "epoch": 4.502024291497976, + "grad_norm": 2.9991024909175676, + "learning_rate": 6.713420275294467e-06, + "loss": 1.2872, + "step": 1112 + }, + { + "epoch": 4.506072874493928, + "grad_norm": 3.341853790054196, + "learning_rate": 6.70678030411042e-06, + "loss": 1.6404, + "step": 1113 + }, + { + "epoch": 4.510121457489879, + "grad_norm": 3.2032309023708687, + "learning_rate": 6.700136924151104e-06, + "loss": 1.6321, + "step": 1114 + }, + { + "epoch": 4.51417004048583, + "grad_norm": 2.446695841899921, + "learning_rate": 6.693490148684654e-06, + "loss": 1.5906, + "step": 1115 + }, + { + "epoch": 4.518218623481781, + "grad_norm": 3.030284559367058, + "learning_rate": 6.686839990985984e-06, + "loss": 1.6148, + "step": 1116 + }, + { + "epoch": 4.522267206477733, + "grad_norm": 3.0612075992794665, + "learning_rate": 6.680186464336767e-06, + "loss": 1.5678, + "step": 1117 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 3.4922710550140685, + "learning_rate": 6.673529582025398e-06, + "loss": 1.3788, + "step": 1118 + }, + { + "epoch": 4.530364372469636, + "grad_norm": 3.4134796811660166, + "learning_rate": 6.666869357346979e-06, + "loss": 1.4428, + "step": 1119 + }, + { + "epoch": 4.534412955465587, + "grad_norm": 3.6649442008937383, + "learning_rate": 6.660205803603286e-06, + "loss": 1.5671, + "step": 1120 + }, + { + "epoch": 4.538461538461538, + "grad_norm": 3.108830354735827, + "learning_rate": 6.653538934102743e-06, + "loss": 1.7903, + "step": 1121 + }, + { + "epoch": 4.5425101214574894, + "grad_norm": 2.719205109719932, + "learning_rate": 6.646868762160399e-06, + "loss": 1.6907, + "step": 1122 + }, + { + "epoch": 4.5465587044534415, + "grad_norm": 15.861026319110369, + "learning_rate": 6.640195301097896e-06, + "loss": 2.0735, + "step": 1123 + }, + { + "epoch": 4.550607287449393, + "grad_norm": 7.357015627613091, + "learning_rate": 6.633518564243442e-06, + "loss": 2.1046, + "step": 1124 + }, + { + "epoch": 4.554655870445345, + "grad_norm": 6.67996402988713, + "learning_rate": 6.626838564931797e-06, + "loss": 2.3423, + "step": 1125 + }, + { + "epoch": 4.558704453441296, + "grad_norm": 2.790707731153053, + "learning_rate": 6.620155316504225e-06, + "loss": 1.5771, + "step": 1126 + }, + { + "epoch": 4.562753036437247, + "grad_norm": 2.6424764643365544, + "learning_rate": 6.6134688323084884e-06, + "loss": 1.4544, + "step": 1127 + }, + { + "epoch": 4.566801619433198, + "grad_norm": 4.460650672408528, + "learning_rate": 6.606779125698808e-06, + "loss": 1.7848, + "step": 1128 + }, + { + "epoch": 4.57085020242915, + "grad_norm": 2.81766092171609, + "learning_rate": 6.600086210035841e-06, + "loss": 1.4465, + "step": 1129 + }, + { + "epoch": 4.574898785425101, + "grad_norm": 2.7934258737790794, + "learning_rate": 6.593390098686653e-06, + "loss": 1.7079, + "step": 1130 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 2.357159807197533, + "learning_rate": 6.586690805024692e-06, + "loss": 1.4715, + "step": 1131 + }, + { + "epoch": 4.582995951417004, + "grad_norm": 2.8201575354409876, + "learning_rate": 6.579988342429764e-06, + "loss": 1.6256, + "step": 1132 + }, + { + "epoch": 4.587044534412955, + "grad_norm": 2.748728982741463, + "learning_rate": 6.573282724288001e-06, + "loss": 1.6067, + "step": 1133 + }, + { + "epoch": 4.5910931174089065, + "grad_norm": 3.0721591492986526, + "learning_rate": 6.566573963991839e-06, + "loss": 1.5832, + "step": 1134 + }, + { + "epoch": 4.5951417004048585, + "grad_norm": 2.8487748202828924, + "learning_rate": 6.559862074939989e-06, + "loss": 1.3233, + "step": 1135 + }, + { + "epoch": 4.59919028340081, + "grad_norm": 2.590591556134, + "learning_rate": 6.553147070537413e-06, + "loss": 1.3674, + "step": 1136 + }, + { + "epoch": 4.603238866396762, + "grad_norm": 2.6607589757127186, + "learning_rate": 6.546428964195289e-06, + "loss": 1.4813, + "step": 1137 + }, + { + "epoch": 4.607287449392713, + "grad_norm": 2.936419659787077, + "learning_rate": 6.539707769330995e-06, + "loss": 1.3335, + "step": 1138 + }, + { + "epoch": 4.611336032388664, + "grad_norm": 5.647454932081391, + "learning_rate": 6.532983499368078e-06, + "loss": 1.631, + "step": 1139 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 2.672027285236729, + "learning_rate": 6.526256167736224e-06, + "loss": 1.6247, + "step": 1140 + }, + { + "epoch": 4.619433198380567, + "grad_norm": 3.585540725187652, + "learning_rate": 6.519525787871235e-06, + "loss": 1.365, + "step": 1141 + }, + { + "epoch": 4.623481781376518, + "grad_norm": 3.509608711468321, + "learning_rate": 6.512792373215e-06, + "loss": 1.7573, + "step": 1142 + }, + { + "epoch": 4.62753036437247, + "grad_norm": 2.971185622782078, + "learning_rate": 6.506055937215471e-06, + "loss": 1.561, + "step": 1143 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 3.2949915313334035, + "learning_rate": 6.499316493326631e-06, + "loss": 1.836, + "step": 1144 + }, + { + "epoch": 4.635627530364372, + "grad_norm": 2.861710933431733, + "learning_rate": 6.492574055008474e-06, + "loss": 1.4458, + "step": 1145 + }, + { + "epoch": 4.6396761133603235, + "grad_norm": 3.3593193695088828, + "learning_rate": 6.4858286357269716e-06, + "loss": 1.6806, + "step": 1146 + }, + { + "epoch": 4.6437246963562755, + "grad_norm": 2.7995829454110317, + "learning_rate": 6.4790802489540495e-06, + "loss": 1.5849, + "step": 1147 + }, + { + "epoch": 4.647773279352227, + "grad_norm": 2.9650473845995617, + "learning_rate": 6.472328908167562e-06, + "loss": 1.6598, + "step": 1148 + }, + { + "epoch": 4.651821862348179, + "grad_norm": 2.7905940219323475, + "learning_rate": 6.465574626851262e-06, + "loss": 1.4666, + "step": 1149 + }, + { + "epoch": 4.65587044534413, + "grad_norm": 3.2553490418837323, + "learning_rate": 6.4588174184947725e-06, + "loss": 1.6918, + "step": 1150 + }, + { + "epoch": 4.659919028340081, + "grad_norm": 3.55927475882226, + "learning_rate": 6.452057296593568e-06, + "loss": 1.5207, + "step": 1151 + }, + { + "epoch": 4.663967611336032, + "grad_norm": 2.9162925097777954, + "learning_rate": 6.445294274648937e-06, + "loss": 1.6745, + "step": 1152 + }, + { + "epoch": 4.668016194331984, + "grad_norm": 2.987151078867793, + "learning_rate": 6.4385283661679624e-06, + "loss": 1.6752, + "step": 1153 + }, + { + "epoch": 4.672064777327935, + "grad_norm": 3.186333717498487, + "learning_rate": 6.431759584663492e-06, + "loss": 1.753, + "step": 1154 + }, + { + "epoch": 4.676113360323887, + "grad_norm": 9.509020769435434, + "learning_rate": 6.424987943654109e-06, + "loss": 1.6195, + "step": 1155 + }, + { + "epoch": 4.680161943319838, + "grad_norm": 3.356709601234609, + "learning_rate": 6.418213456664111e-06, + "loss": 1.6311, + "step": 1156 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 2.921816366789115, + "learning_rate": 6.411436137223479e-06, + "loss": 1.4584, + "step": 1157 + }, + { + "epoch": 4.6882591093117405, + "grad_norm": 2.8660981524508338, + "learning_rate": 6.4046559988678485e-06, + "loss": 1.6084, + "step": 1158 + }, + { + "epoch": 4.6923076923076925, + "grad_norm": 3.0730207415431954, + "learning_rate": 6.397873055138487e-06, + "loss": 1.6274, + "step": 1159 + }, + { + "epoch": 4.696356275303644, + "grad_norm": 2.766004464269283, + "learning_rate": 6.391087319582264e-06, + "loss": 1.4697, + "step": 1160 + }, + { + "epoch": 4.700404858299595, + "grad_norm": 3.6099089118584136, + "learning_rate": 6.384298805751626e-06, + "loss": 1.9489, + "step": 1161 + }, + { + "epoch": 4.704453441295547, + "grad_norm": 3.442626114825173, + "learning_rate": 6.37750752720457e-06, + "loss": 1.727, + "step": 1162 + }, + { + "epoch": 4.708502024291498, + "grad_norm": 3.341066779383342, + "learning_rate": 6.370713497504607e-06, + "loss": 1.3178, + "step": 1163 + }, + { + "epoch": 4.712550607287449, + "grad_norm": 2.8791145178147386, + "learning_rate": 6.363916730220752e-06, + "loss": 1.4908, + "step": 1164 + }, + { + "epoch": 4.716599190283401, + "grad_norm": 2.8558993301680076, + "learning_rate": 6.357117238927481e-06, + "loss": 1.588, + "step": 1165 + }, + { + "epoch": 4.720647773279352, + "grad_norm": 3.403507251743757, + "learning_rate": 6.350315037204714e-06, + "loss": 1.3794, + "step": 1166 + }, + { + "epoch": 4.724696356275303, + "grad_norm": 3.28937405397847, + "learning_rate": 6.343510138637783e-06, + "loss": 1.535, + "step": 1167 + }, + { + "epoch": 4.728744939271255, + "grad_norm": 3.182353899970667, + "learning_rate": 6.336702556817405e-06, + "loss": 1.7416, + "step": 1168 + }, + { + "epoch": 4.732793522267206, + "grad_norm": 2.8393068837004285, + "learning_rate": 6.329892305339659e-06, + "loss": 1.521, + "step": 1169 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 3.0526645906441585, + "learning_rate": 6.323079397805951e-06, + "loss": 1.6001, + "step": 1170 + }, + { + "epoch": 4.7408906882591095, + "grad_norm": 3.453365846818349, + "learning_rate": 6.3162638478229965e-06, + "loss": 2.244, + "step": 1171 + }, + { + "epoch": 4.744939271255061, + "grad_norm": 2.930549437132931, + "learning_rate": 6.309445669002787e-06, + "loss": 1.6859, + "step": 1172 + }, + { + "epoch": 4.748987854251012, + "grad_norm": 3.513459131175886, + "learning_rate": 6.302624874962563e-06, + "loss": 1.5138, + "step": 1173 + }, + { + "epoch": 4.753036437246964, + "grad_norm": 3.101847130962305, + "learning_rate": 6.295801479324788e-06, + "loss": 1.4048, + "step": 1174 + }, + { + "epoch": 4.757085020242915, + "grad_norm": 2.9351108422638625, + "learning_rate": 6.288975495717124e-06, + "loss": 1.5932, + "step": 1175 + }, + { + "epoch": 4.761133603238866, + "grad_norm": 4.674100976432621, + "learning_rate": 6.282146937772399e-06, + "loss": 2.3515, + "step": 1176 + }, + { + "epoch": 4.765182186234818, + "grad_norm": 5.182394350357637, + "learning_rate": 6.2753158191285844e-06, + "loss": 2.1322, + "step": 1177 + }, + { + "epoch": 4.769230769230769, + "grad_norm": 6.057045402676707, + "learning_rate": 6.268482153428763e-06, + "loss": 2.0072, + "step": 1178 + }, + { + "epoch": 4.77327935222672, + "grad_norm": 3.1068830892655726, + "learning_rate": 6.261645954321109e-06, + "loss": 1.5127, + "step": 1179 + }, + { + "epoch": 4.777327935222672, + "grad_norm": 3.0244265678427213, + "learning_rate": 6.254807235458853e-06, + "loss": 1.7728, + "step": 1180 + }, + { + "epoch": 4.781376518218623, + "grad_norm": 2.949903538067424, + "learning_rate": 6.247966010500258e-06, + "loss": 1.78, + "step": 1181 + }, + { + "epoch": 4.7854251012145745, + "grad_norm": 3.1823383170218946, + "learning_rate": 6.241122293108594e-06, + "loss": 1.6101, + "step": 1182 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 3.0390422214285975, + "learning_rate": 6.2342760969521085e-06, + "loss": 1.5326, + "step": 1183 + }, + { + "epoch": 4.793522267206478, + "grad_norm": 3.136764973756456, + "learning_rate": 6.227427435703997e-06, + "loss": 1.5671, + "step": 1184 + }, + { + "epoch": 4.797570850202429, + "grad_norm": 3.358208559803108, + "learning_rate": 6.220576323042381e-06, + "loss": 1.5746, + "step": 1185 + }, + { + "epoch": 4.801619433198381, + "grad_norm": 2.8750507177466305, + "learning_rate": 6.213722772650277e-06, + "loss": 1.4246, + "step": 1186 + }, + { + "epoch": 4.805668016194332, + "grad_norm": 3.028809163189934, + "learning_rate": 6.206866798215571e-06, + "loss": 1.317, + "step": 1187 + }, + { + "epoch": 4.809716599190283, + "grad_norm": 3.126804073645922, + "learning_rate": 6.2000084134309905e-06, + "loss": 1.6821, + "step": 1188 + }, + { + "epoch": 4.813765182186235, + "grad_norm": 3.71033178556479, + "learning_rate": 6.193147631994073e-06, + "loss": 1.5786, + "step": 1189 + }, + { + "epoch": 4.817813765182186, + "grad_norm": 3.2129146658285346, + "learning_rate": 6.186284467607149e-06, + "loss": 1.3971, + "step": 1190 + }, + { + "epoch": 4.821862348178137, + "grad_norm": 10.210146232119035, + "learning_rate": 6.179418933977301e-06, + "loss": 2.3347, + "step": 1191 + }, + { + "epoch": 4.825910931174089, + "grad_norm": 21.275577852601224, + "learning_rate": 6.1725510448163516e-06, + "loss": 3.6222, + "step": 1192 + }, + { + "epoch": 4.82995951417004, + "grad_norm": 3.4666551476237584, + "learning_rate": 6.165680813840822e-06, + "loss": 1.4645, + "step": 1193 + }, + { + "epoch": 4.834008097165992, + "grad_norm": 3.4458166986644443, + "learning_rate": 6.1588082547719095e-06, + "loss": 1.3391, + "step": 1194 + }, + { + "epoch": 4.838056680161944, + "grad_norm": 2.919273388343095, + "learning_rate": 6.151933381335468e-06, + "loss": 1.4313, + "step": 1195 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 3.0732467672720736, + "learning_rate": 6.1450562072619635e-06, + "loss": 1.4611, + "step": 1196 + }, + { + "epoch": 4.846153846153846, + "grad_norm": 3.2736024865252493, + "learning_rate": 6.138176746286468e-06, + "loss": 1.3333, + "step": 1197 + }, + { + "epoch": 4.850202429149798, + "grad_norm": 3.3437325068102486, + "learning_rate": 6.131295012148613e-06, + "loss": 1.4833, + "step": 1198 + }, + { + "epoch": 4.854251012145749, + "grad_norm": 3.6058736308138766, + "learning_rate": 6.124411018592568e-06, + "loss": 1.5521, + "step": 1199 + }, + { + "epoch": 4.8582995951417, + "grad_norm": 2.6980859324752267, + "learning_rate": 6.117524779367027e-06, + "loss": 1.4743, + "step": 1200 + }, + { + "epoch": 4.862348178137652, + "grad_norm": 3.4307422256171947, + "learning_rate": 6.110636308225157e-06, + "loss": 1.4612, + "step": 1201 + }, + { + "epoch": 4.866396761133603, + "grad_norm": 3.4665359414620625, + "learning_rate": 6.103745618924587e-06, + "loss": 1.4922, + "step": 1202 + }, + { + "epoch": 4.870445344129554, + "grad_norm": 4.034402333282032, + "learning_rate": 6.096852725227378e-06, + "loss": 1.9715, + "step": 1203 + }, + { + "epoch": 4.874493927125506, + "grad_norm": 3.6881022424154097, + "learning_rate": 6.089957640899988e-06, + "loss": 1.9107, + "step": 1204 + }, + { + "epoch": 4.8785425101214575, + "grad_norm": 3.862338875685726, + "learning_rate": 6.0830603797132574e-06, + "loss": 1.661, + "step": 1205 + }, + { + "epoch": 4.882591093117409, + "grad_norm": 3.384483266395071, + "learning_rate": 6.076160955442369e-06, + "loss": 1.5689, + "step": 1206 + }, + { + "epoch": 4.886639676113361, + "grad_norm": 3.345513039253192, + "learning_rate": 6.069259381866827e-06, + "loss": 1.1468, + "step": 1207 + }, + { + "epoch": 4.890688259109312, + "grad_norm": 2.8964038452697847, + "learning_rate": 6.0623556727704306e-06, + "loss": 1.6516, + "step": 1208 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 2.9136386786268895, + "learning_rate": 6.055449841941238e-06, + "loss": 1.7215, + "step": 1209 + }, + { + "epoch": 4.898785425101215, + "grad_norm": 2.7655346557671248, + "learning_rate": 6.048541903171552e-06, + "loss": 1.4413, + "step": 1210 + }, + { + "epoch": 4.902834008097166, + "grad_norm": 3.2433937012234715, + "learning_rate": 6.041631870257882e-06, + "loss": 1.4725, + "step": 1211 + }, + { + "epoch": 4.906882591093117, + "grad_norm": 3.4688660789200325, + "learning_rate": 6.034719757000918e-06, + "loss": 1.6069, + "step": 1212 + }, + { + "epoch": 4.910931174089069, + "grad_norm": 3.106070985660449, + "learning_rate": 6.0278055772055075e-06, + "loss": 1.2312, + "step": 1213 + }, + { + "epoch": 4.91497975708502, + "grad_norm": 3.4926777350408664, + "learning_rate": 6.020889344680627e-06, + "loss": 1.3252, + "step": 1214 + }, + { + "epoch": 4.919028340080971, + "grad_norm": 3.31474250904695, + "learning_rate": 6.013971073239346e-06, + "loss": 1.3404, + "step": 1215 + }, + { + "epoch": 4.923076923076923, + "grad_norm": 2.7200582966885953, + "learning_rate": 6.007050776698816e-06, + "loss": 1.6668, + "step": 1216 + }, + { + "epoch": 4.9271255060728745, + "grad_norm": 4.194613418220712, + "learning_rate": 6.000128468880223e-06, + "loss": 1.5178, + "step": 1217 + }, + { + "epoch": 4.931174089068826, + "grad_norm": 3.6956716885492047, + "learning_rate": 5.993204163608776e-06, + "loss": 1.5313, + "step": 1218 + }, + { + "epoch": 4.935222672064778, + "grad_norm": 3.42386071095716, + "learning_rate": 5.986277874713672e-06, + "loss": 1.315, + "step": 1219 + }, + { + "epoch": 4.939271255060729, + "grad_norm": 3.4411238008448497, + "learning_rate": 5.979349616028067e-06, + "loss": 1.2599, + "step": 1220 + }, + { + "epoch": 4.94331983805668, + "grad_norm": 4.136849869910849, + "learning_rate": 5.972419401389058e-06, + "loss": 1.5671, + "step": 1221 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 3.3509710910402344, + "learning_rate": 5.96548724463764e-06, + "loss": 1.3098, + "step": 1222 + }, + { + "epoch": 4.951417004048583, + "grad_norm": 3.826301738234217, + "learning_rate": 5.958553159618693e-06, + "loss": 1.2627, + "step": 1223 + }, + { + "epoch": 4.955465587044534, + "grad_norm": 4.211383102056784, + "learning_rate": 5.951617160180944e-06, + "loss": 1.4866, + "step": 1224 + }, + { + "epoch": 4.959514170040486, + "grad_norm": 3.9784296755787043, + "learning_rate": 5.944679260176947e-06, + "loss": 1.5416, + "step": 1225 + }, + { + "epoch": 4.963562753036437, + "grad_norm": 3.121952186318371, + "learning_rate": 5.937739473463047e-06, + "loss": 1.5505, + "step": 1226 + }, + { + "epoch": 4.967611336032388, + "grad_norm": 3.717226187124744, + "learning_rate": 5.930797813899364e-06, + "loss": 1.6869, + "step": 1227 + }, + { + "epoch": 4.97165991902834, + "grad_norm": 4.139266573612088, + "learning_rate": 5.923854295349751e-06, + "loss": 1.5989, + "step": 1228 + }, + { + "epoch": 4.9757085020242915, + "grad_norm": 2.8954471867608937, + "learning_rate": 5.916908931681781e-06, + "loss": 1.5245, + "step": 1229 + }, + { + "epoch": 4.979757085020243, + "grad_norm": 3.153595083245072, + "learning_rate": 5.9099617367667065e-06, + "loss": 1.6063, + "step": 1230 + }, + { + "epoch": 4.983805668016195, + "grad_norm": 2.8400997626861173, + "learning_rate": 5.9030127244794385e-06, + "loss": 1.6715, + "step": 1231 + }, + { + "epoch": 4.987854251012146, + "grad_norm": 3.2491090209153874, + "learning_rate": 5.896061908698521e-06, + "loss": 1.4666, + "step": 1232 + }, + { + "epoch": 4.991902834008097, + "grad_norm": 2.6679775725786286, + "learning_rate": 5.8891093033060945e-06, + "loss": 1.4425, + "step": 1233 + }, + { + "epoch": 4.995951417004049, + "grad_norm": 2.6288454727168067, + "learning_rate": 5.8821549221878795e-06, + "loss": 1.7597, + "step": 1234 + }, + { + "epoch": 5.0, + "grad_norm": 2.885385124366649, + "learning_rate": 5.8751987792331365e-06, + "loss": 1.4922, + "step": 1235 + }, + { + "epoch": 5.004048582995951, + "grad_norm": 2.87961175357714, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.5315, + "step": 1236 + }, + { + "epoch": 5.008097165991903, + "grad_norm": 3.895617299101059, + "learning_rate": 5.861281263388699e-06, + "loss": 1.6767, + "step": 1237 + }, + { + "epoch": 5.012145748987854, + "grad_norm": 3.762686290641399, + "learning_rate": 5.854319918295012e-06, + "loss": 1.5156, + "step": 1238 + }, + { + "epoch": 5.016194331983805, + "grad_norm": 4.177708865223027, + "learning_rate": 5.8473568669567645e-06, + "loss": 1.7157, + "step": 1239 + }, + { + "epoch": 5.020242914979757, + "grad_norm": 3.5866973777228996, + "learning_rate": 5.84039212328054e-06, + "loss": 1.9457, + "step": 1240 + }, + { + "epoch": 5.0242914979757085, + "grad_norm": 3.7038579253911434, + "learning_rate": 5.833425701176294e-06, + "loss": 1.8054, + "step": 1241 + }, + { + "epoch": 5.02834008097166, + "grad_norm": 3.053021737504678, + "learning_rate": 5.826457614557342e-06, + "loss": 1.4846, + "step": 1242 + }, + { + "epoch": 5.032388663967612, + "grad_norm": 3.7131269515944236, + "learning_rate": 5.819487877340318e-06, + "loss": 1.4864, + "step": 1243 + }, + { + "epoch": 5.036437246963563, + "grad_norm": 3.47442806634264, + "learning_rate": 5.812516503445158e-06, + "loss": 1.7235, + "step": 1244 + }, + { + "epoch": 5.040485829959514, + "grad_norm": 3.509517402822926, + "learning_rate": 5.805543506795063e-06, + "loss": 1.517, + "step": 1245 + }, + { + "epoch": 5.044534412955466, + "grad_norm": 3.3619188629392305, + "learning_rate": 5.798568901316475e-06, + "loss": 1.5768, + "step": 1246 + }, + { + "epoch": 5.048582995951417, + "grad_norm": 3.557428062968091, + "learning_rate": 5.79159270093905e-06, + "loss": 1.5018, + "step": 1247 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 3.7281770232445295, + "learning_rate": 5.784614919595631e-06, + "loss": 1.5785, + "step": 1248 + }, + { + "epoch": 5.05668016194332, + "grad_norm": 3.517681869861109, + "learning_rate": 5.7776355712222165e-06, + "loss": 1.4217, + "step": 1249 + }, + { + "epoch": 5.060728744939271, + "grad_norm": 3.487707428141539, + "learning_rate": 5.770654669757935e-06, + "loss": 1.5864, + "step": 1250 + }, + { + "epoch": 5.064777327935222, + "grad_norm": 3.79463286822166, + "learning_rate": 5.763672229145015e-06, + "loss": 1.5406, + "step": 1251 + }, + { + "epoch": 5.068825910931174, + "grad_norm": 3.9587280022782623, + "learning_rate": 5.756688263328762e-06, + "loss": 1.6808, + "step": 1252 + }, + { + "epoch": 5.0728744939271255, + "grad_norm": 3.574038459442136, + "learning_rate": 5.749702786257529e-06, + "loss": 1.6199, + "step": 1253 + }, + { + "epoch": 5.076923076923077, + "grad_norm": 3.9239619763747666, + "learning_rate": 5.742715811882682e-06, + "loss": 1.5554, + "step": 1254 + }, + { + "epoch": 5.080971659919029, + "grad_norm": 3.3525677000904435, + "learning_rate": 5.735727354158581e-06, + "loss": 1.5965, + "step": 1255 + }, + { + "epoch": 5.08502024291498, + "grad_norm": 3.14038896931749, + "learning_rate": 5.7287374270425475e-06, + "loss": 1.5955, + "step": 1256 + }, + { + "epoch": 5.089068825910931, + "grad_norm": 3.800313028867603, + "learning_rate": 5.721746044494838e-06, + "loss": 1.5594, + "step": 1257 + }, + { + "epoch": 5.093117408906883, + "grad_norm": 3.5079921931841707, + "learning_rate": 5.714753220478616e-06, + "loss": 1.6374, + "step": 1258 + }, + { + "epoch": 5.097165991902834, + "grad_norm": 3.3722158742610033, + "learning_rate": 5.707758968959923e-06, + "loss": 1.3947, + "step": 1259 + }, + { + "epoch": 5.101214574898785, + "grad_norm": 3.690572058964337, + "learning_rate": 5.7007633039076535e-06, + "loss": 1.5641, + "step": 1260 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 3.868480542932687, + "learning_rate": 5.693766239293522e-06, + "loss": 1.5403, + "step": 1261 + }, + { + "epoch": 5.109311740890688, + "grad_norm": 3.642440736287873, + "learning_rate": 5.686767789092041e-06, + "loss": 1.4899, + "step": 1262 + }, + { + "epoch": 5.113360323886639, + "grad_norm": 4.407879993174004, + "learning_rate": 5.67976796728049e-06, + "loss": 1.4415, + "step": 1263 + }, + { + "epoch": 5.117408906882591, + "grad_norm": 3.9268283691257166, + "learning_rate": 5.672766787838884e-06, + "loss": 1.349, + "step": 1264 + }, + { + "epoch": 5.1214574898785425, + "grad_norm": 3.5424496240381282, + "learning_rate": 5.6657642647499545e-06, + "loss": 1.4005, + "step": 1265 + }, + { + "epoch": 5.125506072874494, + "grad_norm": 3.714267182183359, + "learning_rate": 5.658760411999115e-06, + "loss": 1.4047, + "step": 1266 + }, + { + "epoch": 5.129554655870446, + "grad_norm": 4.1352520308511425, + "learning_rate": 5.6517552435744325e-06, + "loss": 1.3041, + "step": 1267 + }, + { + "epoch": 5.133603238866397, + "grad_norm": 3.1992855070868185, + "learning_rate": 5.644748773466606e-06, + "loss": 1.6559, + "step": 1268 + }, + { + "epoch": 5.137651821862348, + "grad_norm": 3.852499540993822, + "learning_rate": 5.637741015668929e-06, + "loss": 1.4822, + "step": 1269 + }, + { + "epoch": 5.1417004048583, + "grad_norm": 3.0057363516680513, + "learning_rate": 5.630731984177269e-06, + "loss": 1.2246, + "step": 1270 + }, + { + "epoch": 5.145748987854251, + "grad_norm": 3.8748912975587544, + "learning_rate": 5.62372169299004e-06, + "loss": 1.5924, + "step": 1271 + }, + { + "epoch": 5.149797570850202, + "grad_norm": 3.5771984578664875, + "learning_rate": 5.616710156108167e-06, + "loss": 1.4133, + "step": 1272 + }, + { + "epoch": 5.153846153846154, + "grad_norm": 3.2086974588686576, + "learning_rate": 5.609697387535068e-06, + "loss": 1.621, + "step": 1273 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 3.984819835501151, + "learning_rate": 5.6026834012766155e-06, + "loss": 1.7158, + "step": 1274 + }, + { + "epoch": 5.161943319838056, + "grad_norm": 3.2013860532982337, + "learning_rate": 5.5956682113411184e-06, + "loss": 1.4746, + "step": 1275 + }, + { + "epoch": 5.165991902834008, + "grad_norm": 3.450642934981606, + "learning_rate": 5.588651831739289e-06, + "loss": 1.5543, + "step": 1276 + }, + { + "epoch": 5.17004048582996, + "grad_norm": 3.093776549631426, + "learning_rate": 5.581634276484211e-06, + "loss": 2.074, + "step": 1277 + }, + { + "epoch": 5.174089068825911, + "grad_norm": 3.545758099078526, + "learning_rate": 5.574615559591323e-06, + "loss": 1.3906, + "step": 1278 + }, + { + "epoch": 5.178137651821863, + "grad_norm": 4.14672203994261, + "learning_rate": 5.567595695078379e-06, + "loss": 1.5738, + "step": 1279 + }, + { + "epoch": 5.182186234817814, + "grad_norm": 2.9347838837502294, + "learning_rate": 5.560574696965425e-06, + "loss": 1.3815, + "step": 1280 + }, + { + "epoch": 5.186234817813765, + "grad_norm": 3.90774860265149, + "learning_rate": 5.553552579274775e-06, + "loss": 1.5673, + "step": 1281 + }, + { + "epoch": 5.190283400809717, + "grad_norm": 3.578616704951525, + "learning_rate": 5.546529356030974e-06, + "loss": 1.5733, + "step": 1282 + }, + { + "epoch": 5.194331983805668, + "grad_norm": 4.0010401720998185, + "learning_rate": 5.539505041260779e-06, + "loss": 1.757, + "step": 1283 + }, + { + "epoch": 5.198380566801619, + "grad_norm": 3.509112575984563, + "learning_rate": 5.532479648993122e-06, + "loss": 1.8081, + "step": 1284 + }, + { + "epoch": 5.202429149797571, + "grad_norm": 3.5347317901565556, + "learning_rate": 5.525453193259094e-06, + "loss": 1.5116, + "step": 1285 + }, + { + "epoch": 5.206477732793522, + "grad_norm": 3.4675375372116184, + "learning_rate": 5.518425688091906e-06, + "loss": 1.8506, + "step": 1286 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 3.6323230014040306, + "learning_rate": 5.511397147526862e-06, + "loss": 1.8682, + "step": 1287 + }, + { + "epoch": 5.2145748987854255, + "grad_norm": 3.5536336190454048, + "learning_rate": 5.504367585601342e-06, + "loss": 1.6388, + "step": 1288 + }, + { + "epoch": 5.218623481781377, + "grad_norm": 3.6273876631462905, + "learning_rate": 5.497337016354757e-06, + "loss": 1.5266, + "step": 1289 + }, + { + "epoch": 5.222672064777328, + "grad_norm": 3.605955542328613, + "learning_rate": 5.490305453828534e-06, + "loss": 1.4274, + "step": 1290 + }, + { + "epoch": 5.22672064777328, + "grad_norm": 3.594834856645006, + "learning_rate": 5.483272912066084e-06, + "loss": 1.6117, + "step": 1291 + }, + { + "epoch": 5.230769230769231, + "grad_norm": 3.6817183177194295, + "learning_rate": 5.476239405112775e-06, + "loss": 1.4265, + "step": 1292 + }, + { + "epoch": 5.234817813765182, + "grad_norm": 4.022022675891982, + "learning_rate": 5.469204947015897e-06, + "loss": 1.668, + "step": 1293 + }, + { + "epoch": 5.238866396761134, + "grad_norm": 3.889168025126557, + "learning_rate": 5.462169551824648e-06, + "loss": 1.6076, + "step": 1294 + }, + { + "epoch": 5.242914979757085, + "grad_norm": 3.6700082316334273, + "learning_rate": 5.45513323359009e-06, + "loss": 1.6171, + "step": 1295 + }, + { + "epoch": 5.246963562753036, + "grad_norm": 3.6748741609947855, + "learning_rate": 5.448096006365132e-06, + "loss": 1.4488, + "step": 1296 + }, + { + "epoch": 5.251012145748988, + "grad_norm": 3.6290737200114993, + "learning_rate": 5.4410578842045e-06, + "loss": 1.5478, + "step": 1297 + }, + { + "epoch": 5.255060728744939, + "grad_norm": 3.8478048256636357, + "learning_rate": 5.434018881164702e-06, + "loss": 1.523, + "step": 1298 + }, + { + "epoch": 5.2591093117408905, + "grad_norm": 3.312410066611835, + "learning_rate": 5.426979011304012e-06, + "loss": 1.4463, + "step": 1299 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 3.647621711678499, + "learning_rate": 5.41993828868243e-06, + "loss": 1.2639, + "step": 1300 + }, + { + "epoch": 5.267206477732794, + "grad_norm": 3.5536727878739205, + "learning_rate": 5.412896727361663e-06, + "loss": 1.5401, + "step": 1301 + }, + { + "epoch": 5.271255060728745, + "grad_norm": 3.539451611896165, + "learning_rate": 5.405854341405088e-06, + "loss": 1.5594, + "step": 1302 + }, + { + "epoch": 5.275303643724697, + "grad_norm": 3.4030202032336394, + "learning_rate": 5.398811144877733e-06, + "loss": 1.5997, + "step": 1303 + }, + { + "epoch": 5.279352226720648, + "grad_norm": 4.605755727643003, + "learning_rate": 5.391767151846247e-06, + "loss": 1.5551, + "step": 1304 + }, + { + "epoch": 5.283400809716599, + "grad_norm": 4.210060420659593, + "learning_rate": 5.384722376378861e-06, + "loss": 1.2388, + "step": 1305 + }, + { + "epoch": 5.287449392712551, + "grad_norm": 4.288644203676987, + "learning_rate": 5.377676832545377e-06, + "loss": 1.3926, + "step": 1306 + }, + { + "epoch": 5.291497975708502, + "grad_norm": 4.344641505323721, + "learning_rate": 5.370630534417133e-06, + "loss": 1.2335, + "step": 1307 + }, + { + "epoch": 5.295546558704453, + "grad_norm": 4.293842456125265, + "learning_rate": 5.363583496066963e-06, + "loss": 1.5097, + "step": 1308 + }, + { + "epoch": 5.299595141700405, + "grad_norm": 3.5889617840380956, + "learning_rate": 5.356535731569189e-06, + "loss": 1.6798, + "step": 1309 + }, + { + "epoch": 5.303643724696356, + "grad_norm": 3.8949744261018844, + "learning_rate": 5.349487254999579e-06, + "loss": 1.3501, + "step": 1310 + }, + { + "epoch": 5.3076923076923075, + "grad_norm": 3.8938141628185394, + "learning_rate": 5.342438080435325e-06, + "loss": 1.3823, + "step": 1311 + }, + { + "epoch": 5.3117408906882595, + "grad_norm": 3.7811284620632146, + "learning_rate": 5.335388221955012e-06, + "loss": 1.4001, + "step": 1312 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 4.504485300390198, + "learning_rate": 5.328337693638591e-06, + "loss": 1.3433, + "step": 1313 + }, + { + "epoch": 5.319838056680162, + "grad_norm": 3.9863561932252, + "learning_rate": 5.321286509567351e-06, + "loss": 1.2701, + "step": 1314 + }, + { + "epoch": 5.323886639676114, + "grad_norm": 4.103946070839009, + "learning_rate": 5.314234683823892e-06, + "loss": 1.2979, + "step": 1315 + }, + { + "epoch": 5.327935222672065, + "grad_norm": 3.9048810862002896, + "learning_rate": 5.307182230492089e-06, + "loss": 1.3284, + "step": 1316 + }, + { + "epoch": 5.331983805668016, + "grad_norm": 3.802962634621348, + "learning_rate": 5.300129163657081e-06, + "loss": 1.3376, + "step": 1317 + }, + { + "epoch": 5.336032388663968, + "grad_norm": 3.6151941699291696, + "learning_rate": 5.2930754974052245e-06, + "loss": 1.3976, + "step": 1318 + }, + { + "epoch": 5.340080971659919, + "grad_norm": 3.4851660754400124, + "learning_rate": 5.286021245824075e-06, + "loss": 1.3431, + "step": 1319 + }, + { + "epoch": 5.34412955465587, + "grad_norm": 3.7167755157754008, + "learning_rate": 5.2789664230023595e-06, + "loss": 1.295, + "step": 1320 + }, + { + "epoch": 5.348178137651822, + "grad_norm": 4.41974802384744, + "learning_rate": 5.2719110430299416e-06, + "loss": 1.4491, + "step": 1321 + }, + { + "epoch": 5.352226720647773, + "grad_norm": 4.277030621050548, + "learning_rate": 5.264855119997803e-06, + "loss": 1.4354, + "step": 1322 + }, + { + "epoch": 5.3562753036437245, + "grad_norm": 4.194929698692418, + "learning_rate": 5.257798667998003e-06, + "loss": 1.0844, + "step": 1323 + }, + { + "epoch": 5.3603238866396765, + "grad_norm": 4.472113694740598, + "learning_rate": 5.2507417011236625e-06, + "loss": 1.4929, + "step": 1324 + }, + { + "epoch": 5.364372469635628, + "grad_norm": 3.9849001434928866, + "learning_rate": 5.243684233468933e-06, + "loss": 1.5648, + "step": 1325 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 3.864302824850682, + "learning_rate": 5.236626279128958e-06, + "loss": 1.473, + "step": 1326 + }, + { + "epoch": 5.372469635627531, + "grad_norm": 4.810968253503194, + "learning_rate": 5.22956785219986e-06, + "loss": 1.5456, + "step": 1327 + }, + { + "epoch": 5.376518218623482, + "grad_norm": 4.111208820335583, + "learning_rate": 5.222508966778702e-06, + "loss": 1.2098, + "step": 1328 + }, + { + "epoch": 5.380566801619433, + "grad_norm": 4.534807999665865, + "learning_rate": 5.2154496369634645e-06, + "loss": 1.363, + "step": 1329 + }, + { + "epoch": 5.384615384615385, + "grad_norm": 4.048755500092308, + "learning_rate": 5.208389876853014e-06, + "loss": 1.1592, + "step": 1330 + }, + { + "epoch": 5.388663967611336, + "grad_norm": 4.75370785314969, + "learning_rate": 5.201329700547077e-06, + "loss": 1.226, + "step": 1331 + }, + { + "epoch": 5.392712550607287, + "grad_norm": 4.367024994722068, + "learning_rate": 5.194269122146211e-06, + "loss": 1.4048, + "step": 1332 + }, + { + "epoch": 5.396761133603239, + "grad_norm": 4.918852915006795, + "learning_rate": 5.187208155751779e-06, + "loss": 1.2387, + "step": 1333 + }, + { + "epoch": 5.40080971659919, + "grad_norm": 3.6289200014371894, + "learning_rate": 5.180146815465915e-06, + "loss": 1.2571, + "step": 1334 + }, + { + "epoch": 5.4048582995951415, + "grad_norm": 3.7443218122005266, + "learning_rate": 5.173085115391502e-06, + "loss": 1.3062, + "step": 1335 + }, + { + "epoch": 5.4089068825910935, + "grad_norm": 4.7017026873802426, + "learning_rate": 5.16602306963214e-06, + "loss": 1.4154, + "step": 1336 + }, + { + "epoch": 5.412955465587045, + "grad_norm": 4.150505086067103, + "learning_rate": 5.158960692292122e-06, + "loss": 1.2259, + "step": 1337 + }, + { + "epoch": 5.417004048582996, + "grad_norm": 4.482184582986182, + "learning_rate": 5.151897997476403e-06, + "loss": 1.5583, + "step": 1338 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 4.682227327595727, + "learning_rate": 5.144834999290567e-06, + "loss": 1.598, + "step": 1339 + }, + { + "epoch": 5.425101214574899, + "grad_norm": 4.008926002575055, + "learning_rate": 5.137771711840811e-06, + "loss": 1.5379, + "step": 1340 + }, + { + "epoch": 5.42914979757085, + "grad_norm": 4.302820633137393, + "learning_rate": 5.130708149233905e-06, + "loss": 1.5569, + "step": 1341 + }, + { + "epoch": 5.433198380566802, + "grad_norm": 3.5969352441824007, + "learning_rate": 5.123644325577168e-06, + "loss": 1.7237, + "step": 1342 + }, + { + "epoch": 5.437246963562753, + "grad_norm": 4.1865532032949035, + "learning_rate": 5.116580254978447e-06, + "loss": 1.4932, + "step": 1343 + }, + { + "epoch": 5.441295546558704, + "grad_norm": 4.443537220527738, + "learning_rate": 5.1095159515460736e-06, + "loss": 1.4349, + "step": 1344 + }, + { + "epoch": 5.445344129554655, + "grad_norm": 3.8400638359623653, + "learning_rate": 5.10245142938885e-06, + "loss": 1.6808, + "step": 1345 + }, + { + "epoch": 5.449392712550607, + "grad_norm": 4.456713357432363, + "learning_rate": 5.095386702616012e-06, + "loss": 1.4753, + "step": 1346 + }, + { + "epoch": 5.4534412955465585, + "grad_norm": 4.371248488578587, + "learning_rate": 5.088321785337207e-06, + "loss": 1.4634, + "step": 1347 + }, + { + "epoch": 5.4574898785425106, + "grad_norm": 4.503939177016205, + "learning_rate": 5.0812566916624624e-06, + "loss": 1.8175, + "step": 1348 + }, + { + "epoch": 5.461538461538462, + "grad_norm": 5.8661687643019444, + "learning_rate": 5.074191435702155e-06, + "loss": 1.9684, + "step": 1349 + }, + { + "epoch": 5.465587044534413, + "grad_norm": 4.324067092257868, + "learning_rate": 5.067126031566988e-06, + "loss": 1.6405, + "step": 1350 + }, + { + "epoch": 5.469635627530364, + "grad_norm": 3.796039870689883, + "learning_rate": 5.060060493367961e-06, + "loss": 1.6486, + "step": 1351 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 3.738600525398421, + "learning_rate": 5.05299483521634e-06, + "loss": 1.5872, + "step": 1352 + }, + { + "epoch": 5.477732793522267, + "grad_norm": 4.6006758703016, + "learning_rate": 5.045929071223633e-06, + "loss": 1.5976, + "step": 1353 + }, + { + "epoch": 5.481781376518219, + "grad_norm": 3.3463637296184854, + "learning_rate": 5.038863215501555e-06, + "loss": 1.5156, + "step": 1354 + }, + { + "epoch": 5.48582995951417, + "grad_norm": 3.8425032487043813, + "learning_rate": 5.031797282162007e-06, + "loss": 1.4631, + "step": 1355 + }, + { + "epoch": 5.489878542510121, + "grad_norm": 4.548619092337232, + "learning_rate": 5.024731285317046e-06, + "loss": 1.3972, + "step": 1356 + }, + { + "epoch": 5.493927125506072, + "grad_norm": 4.814717659012562, + "learning_rate": 5.017665239078854e-06, + "loss": 1.4267, + "step": 1357 + }, + { + "epoch": 5.497975708502024, + "grad_norm": 3.6552584947768096, + "learning_rate": 5.010599157559713e-06, + "loss": 1.2966, + "step": 1358 + }, + { + "epoch": 5.502024291497976, + "grad_norm": 4.204585823006649, + "learning_rate": 5.003533054871973e-06, + "loss": 1.15, + "step": 1359 + }, + { + "epoch": 5.506072874493928, + "grad_norm": 4.634653281785678, + "learning_rate": 4.996466945128029e-06, + "loss": 1.5181, + "step": 1360 + }, + { + "epoch": 5.510121457489879, + "grad_norm": 4.3188079424314, + "learning_rate": 4.98940084244029e-06, + "loss": 1.4787, + "step": 1361 + }, + { + "epoch": 5.51417004048583, + "grad_norm": 3.332377152961891, + "learning_rate": 4.982334760921149e-06, + "loss": 1.4434, + "step": 1362 + }, + { + "epoch": 5.518218623481781, + "grad_norm": 4.271374565670683, + "learning_rate": 4.975268714682956e-06, + "loss": 1.4766, + "step": 1363 + }, + { + "epoch": 5.522267206477733, + "grad_norm": 4.388046491535482, + "learning_rate": 4.968202717837996e-06, + "loss": 1.4244, + "step": 1364 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 4.81529396324836, + "learning_rate": 4.961136784498448e-06, + "loss": 1.2532, + "step": 1365 + }, + { + "epoch": 5.530364372469636, + "grad_norm": 4.589391225576633, + "learning_rate": 4.9540709287763685e-06, + "loss": 1.3152, + "step": 1366 + }, + { + "epoch": 5.534412955465587, + "grad_norm": 5.101062956149816, + "learning_rate": 4.947005164783661e-06, + "loss": 1.409, + "step": 1367 + }, + { + "epoch": 5.538461538461538, + "grad_norm": 4.286443288173012, + "learning_rate": 4.939939506632041e-06, + "loss": 1.6652, + "step": 1368 + }, + { + "epoch": 5.5425101214574894, + "grad_norm": 3.857994197551904, + "learning_rate": 4.932873968433014e-06, + "loss": 1.5821, + "step": 1369 + }, + { + "epoch": 5.5465587044534415, + "grad_norm": 82.8177825176114, + "learning_rate": 4.925808564297847e-06, + "loss": 2.0481, + "step": 1370 + }, + { + "epoch": 5.550607287449393, + "grad_norm": 8.294269069115597, + "learning_rate": 4.918743308337539e-06, + "loss": 1.9382, + "step": 1371 + }, + { + "epoch": 5.554655870445345, + "grad_norm": 8.675625865701205, + "learning_rate": 4.911678214662795e-06, + "loss": 2.2234, + "step": 1372 + }, + { + "epoch": 5.558704453441296, + "grad_norm": 3.9912695390847595, + "learning_rate": 4.9046132973839895e-06, + "loss": 1.4514, + "step": 1373 + }, + { + "epoch": 5.562753036437247, + "grad_norm": 3.603893380101875, + "learning_rate": 4.897548570611153e-06, + "loss": 1.3266, + "step": 1374 + }, + { + "epoch": 5.566801619433198, + "grad_norm": 3.6938504736682054, + "learning_rate": 4.890484048453928e-06, + "loss": 1.704, + "step": 1375 + }, + { + "epoch": 5.57085020242915, + "grad_norm": 4.1771900748802135, + "learning_rate": 4.883419745021554e-06, + "loss": 1.3432, + "step": 1376 + }, + { + "epoch": 5.574898785425101, + "grad_norm": 4.029068029464602, + "learning_rate": 4.8763556744228324e-06, + "loss": 1.5548, + "step": 1377 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 3.1723858445451776, + "learning_rate": 4.869291850766097e-06, + "loss": 1.3556, + "step": 1378 + }, + { + "epoch": 5.582995951417004, + "grad_norm": 3.9383901181787118, + "learning_rate": 4.862228288159191e-06, + "loss": 1.4828, + "step": 1379 + }, + { + "epoch": 5.587044534412955, + "grad_norm": 3.8742071296776883, + "learning_rate": 4.855165000709434e-06, + "loss": 1.4776, + "step": 1380 + }, + { + "epoch": 5.5910931174089065, + "grad_norm": 4.320505162169018, + "learning_rate": 4.848102002523597e-06, + "loss": 1.4632, + "step": 1381 + }, + { + "epoch": 5.5951417004048585, + "grad_norm": 3.8728016571115496, + "learning_rate": 4.841039307707878e-06, + "loss": 1.1957, + "step": 1382 + }, + { + "epoch": 5.59919028340081, + "grad_norm": 3.492753062395854, + "learning_rate": 4.833976930367859e-06, + "loss": 1.2615, + "step": 1383 + }, + { + "epoch": 5.603238866396762, + "grad_norm": 3.5488104026542513, + "learning_rate": 4.8269148846085e-06, + "loss": 1.3531, + "step": 1384 + }, + { + "epoch": 5.607287449392713, + "grad_norm": 4.068763646311401, + "learning_rate": 4.819853184534085e-06, + "loss": 1.1753, + "step": 1385 + }, + { + "epoch": 5.611336032388664, + "grad_norm": 4.377905274086795, + "learning_rate": 4.812791844248223e-06, + "loss": 1.4958, + "step": 1386 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 3.6007003800569386, + "learning_rate": 4.80573087785379e-06, + "loss": 1.4974, + "step": 1387 + }, + { + "epoch": 5.619433198380567, + "grad_norm": 4.802311568406072, + "learning_rate": 4.798670299452926e-06, + "loss": 1.2282, + "step": 1388 + }, + { + "epoch": 5.623481781376518, + "grad_norm": 4.7745139328350135, + "learning_rate": 4.7916101231469886e-06, + "loss": 1.6082, + "step": 1389 + }, + { + "epoch": 5.62753036437247, + "grad_norm": 4.123643145041474, + "learning_rate": 4.784550363036539e-06, + "loss": 1.4134, + "step": 1390 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 4.402507798104486, + "learning_rate": 4.7774910332213005e-06, + "loss": 1.6983, + "step": 1391 + }, + { + "epoch": 5.635627530364372, + "grad_norm": 3.8264895380697355, + "learning_rate": 4.770432147800141e-06, + "loss": 1.2975, + "step": 1392 + }, + { + "epoch": 5.6396761133603235, + "grad_norm": 4.517127158006528, + "learning_rate": 4.763373720871044e-06, + "loss": 1.5541, + "step": 1393 + }, + { + "epoch": 5.6437246963562755, + "grad_norm": 3.773516174749104, + "learning_rate": 4.756315766531069e-06, + "loss": 1.4461, + "step": 1394 + }, + { + "epoch": 5.647773279352227, + "grad_norm": 4.115306809751942, + "learning_rate": 4.749258298876338e-06, + "loss": 1.5498, + "step": 1395 + }, + { + "epoch": 5.651821862348179, + "grad_norm": 3.6874924730709413, + "learning_rate": 4.742201332001998e-06, + "loss": 1.333, + "step": 1396 + }, + { + "epoch": 5.65587044534413, + "grad_norm": 4.445009061040838, + "learning_rate": 4.735144880002199e-06, + "loss": 1.556, + "step": 1397 + }, + { + "epoch": 5.659919028340081, + "grad_norm": 4.819457563644938, + "learning_rate": 4.728088956970059e-06, + "loss": 1.3788, + "step": 1398 + }, + { + "epoch": 5.663967611336032, + "grad_norm": 3.9520027905188275, + "learning_rate": 4.721033576997641e-06, + "loss": 1.5347, + "step": 1399 + }, + { + "epoch": 5.668016194331984, + "grad_norm": 4.124422632263573, + "learning_rate": 4.713978754175926e-06, + "loss": 1.5292, + "step": 1400 + }, + { + "epoch": 5.672064777327935, + "grad_norm": 4.475410908220464, + "learning_rate": 4.706924502594777e-06, + "loss": 1.6549, + "step": 1401 + }, + { + "epoch": 5.676113360323887, + "grad_norm": 9.027913146446028, + "learning_rate": 4.69987083634292e-06, + "loss": 1.5814, + "step": 1402 + }, + { + "epoch": 5.680161943319838, + "grad_norm": 4.584849302385236, + "learning_rate": 4.692817769507912e-06, + "loss": 1.4982, + "step": 1403 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 4.088441988479735, + "learning_rate": 4.685765316176111e-06, + "loss": 1.3453, + "step": 1404 + }, + { + "epoch": 5.6882591093117405, + "grad_norm": 3.94840157844417, + "learning_rate": 4.67871349043265e-06, + "loss": 1.4717, + "step": 1405 + }, + { + "epoch": 5.6923076923076925, + "grad_norm": 4.252654676588602, + "learning_rate": 4.671662306361409e-06, + "loss": 1.4891, + "step": 1406 + }, + { + "epoch": 5.696356275303644, + "grad_norm": 3.784433251453805, + "learning_rate": 4.664611778044988e-06, + "loss": 1.3408, + "step": 1407 + }, + { + "epoch": 5.700404858299595, + "grad_norm": 4.988371722598511, + "learning_rate": 4.657561919564675e-06, + "loss": 1.8095, + "step": 1408 + }, + { + "epoch": 5.704453441295547, + "grad_norm": 4.664322457086443, + "learning_rate": 4.6505127450004216e-06, + "loss": 1.6024, + "step": 1409 + }, + { + "epoch": 5.708502024291498, + "grad_norm": 4.600715197938257, + "learning_rate": 4.643464268430812e-06, + "loss": 1.2021, + "step": 1410 + }, + { + "epoch": 5.712550607287449, + "grad_norm": 3.9099782560794503, + "learning_rate": 4.636416503933038e-06, + "loss": 1.3472, + "step": 1411 + }, + { + "epoch": 5.716599190283401, + "grad_norm": 3.9111543599245757, + "learning_rate": 4.62936946558287e-06, + "loss": 1.4523, + "step": 1412 + }, + { + "epoch": 5.720647773279352, + "grad_norm": 4.6487019160659, + "learning_rate": 4.622323167454623e-06, + "loss": 1.2302, + "step": 1413 + }, + { + "epoch": 5.724696356275303, + "grad_norm": 4.4548900152472815, + "learning_rate": 4.6152776236211415e-06, + "loss": 1.4256, + "step": 1414 + }, + { + "epoch": 5.728744939271255, + "grad_norm": 4.058092491633072, + "learning_rate": 4.608232848153757e-06, + "loss": 1.6055, + "step": 1415 + }, + { + "epoch": 5.732793522267206, + "grad_norm": 4.025502584936106, + "learning_rate": 4.601188855122269e-06, + "loss": 1.3484, + "step": 1416 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 4.1244592308665275, + "learning_rate": 4.594145658594914e-06, + "loss": 1.4537, + "step": 1417 + }, + { + "epoch": 5.7408906882591095, + "grad_norm": 4.167306098888644, + "learning_rate": 4.587103272638339e-06, + "loss": 2.0785, + "step": 1418 + }, + { + "epoch": 5.744939271255061, + "grad_norm": 3.858307172453616, + "learning_rate": 4.580061711317571e-06, + "loss": 1.5669, + "step": 1419 + }, + { + "epoch": 5.748987854251012, + "grad_norm": 4.76966444820156, + "learning_rate": 4.57302098869599e-06, + "loss": 1.3901, + "step": 1420 + }, + { + "epoch": 5.753036437246964, + "grad_norm": 4.3778097624694166, + "learning_rate": 4.565981118835299e-06, + "loss": 1.291, + "step": 1421 + }, + { + "epoch": 5.757085020242915, + "grad_norm": 4.090411706131635, + "learning_rate": 4.558942115795502e-06, + "loss": 1.4406, + "step": 1422 + }, + { + "epoch": 5.761133603238866, + "grad_norm": 5.337161250566187, + "learning_rate": 4.551903993634869e-06, + "loss": 2.1851, + "step": 1423 + }, + { + "epoch": 5.765182186234818, + "grad_norm": 6.286779559937267, + "learning_rate": 4.5448667664099125e-06, + "loss": 1.9602, + "step": 1424 + }, + { + "epoch": 5.769230769230769, + "grad_norm": 6.765386541677961, + "learning_rate": 4.537830448175354e-06, + "loss": 1.8644, + "step": 1425 + }, + { + "epoch": 5.77327935222672, + "grad_norm": 4.009998051124011, + "learning_rate": 4.530795052984104e-06, + "loss": 1.3677, + "step": 1426 + }, + { + "epoch": 5.777327935222672, + "grad_norm": 4.067144464386327, + "learning_rate": 4.523760594887228e-06, + "loss": 1.6488, + "step": 1427 + }, + { + "epoch": 5.781376518218623, + "grad_norm": 3.900176884022236, + "learning_rate": 4.5167270879339165e-06, + "loss": 1.6378, + "step": 1428 + }, + { + "epoch": 5.7854251012145745, + "grad_norm": 4.307053870196715, + "learning_rate": 4.509694546171468e-06, + "loss": 1.458, + "step": 1429 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 4.202185719713703, + "learning_rate": 4.5026629836452445e-06, + "loss": 1.3863, + "step": 1430 + }, + { + "epoch": 5.793522267206478, + "grad_norm": 4.276979157413732, + "learning_rate": 4.495632414398659e-06, + "loss": 1.4133, + "step": 1431 + }, + { + "epoch": 5.797570850202429, + "grad_norm": 4.560387387278901, + "learning_rate": 4.488602852473138e-06, + "loss": 1.4313, + "step": 1432 + }, + { + "epoch": 5.801619433198381, + "grad_norm": 3.900998231009241, + "learning_rate": 4.481574311908096e-06, + "loss": 1.3065, + "step": 1433 + }, + { + "epoch": 5.805668016194332, + "grad_norm": 3.971785106076469, + "learning_rate": 4.4745468067409055e-06, + "loss": 1.1997, + "step": 1434 + }, + { + "epoch": 5.809716599190283, + "grad_norm": 4.230506562739517, + "learning_rate": 4.467520351006878e-06, + "loss": 1.5584, + "step": 1435 + }, + { + "epoch": 5.813765182186235, + "grad_norm": 5.12301466025395, + "learning_rate": 4.460494958739223e-06, + "loss": 1.4086, + "step": 1436 + }, + { + "epoch": 5.817813765182186, + "grad_norm": 4.360480527706543, + "learning_rate": 4.453470643969027e-06, + "loss": 1.2759, + "step": 1437 + }, + { + "epoch": 5.821862348178137, + "grad_norm": 11.774868013423882, + "learning_rate": 4.446447420725227e-06, + "loss": 2.2866, + "step": 1438 + }, + { + "epoch": 5.825910931174089, + "grad_norm": 23.795049320685568, + "learning_rate": 4.439425303034576e-06, + "loss": 3.4094, + "step": 1439 + }, + { + "epoch": 5.82995951417004, + "grad_norm": 4.607383270222987, + "learning_rate": 4.432404304921624e-06, + "loss": 1.3129, + "step": 1440 + }, + { + "epoch": 5.834008097165992, + "grad_norm": 4.67077067966415, + "learning_rate": 4.4253844404086785e-06, + "loss": 1.2285, + "step": 1441 + }, + { + "epoch": 5.838056680161944, + "grad_norm": 3.9312338569636394, + "learning_rate": 4.418365723515791e-06, + "loss": 1.286, + "step": 1442 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 4.003272377775398, + "learning_rate": 4.411348168260713e-06, + "loss": 1.3394, + "step": 1443 + }, + { + "epoch": 5.846153846153846, + "grad_norm": 4.140441268173913, + "learning_rate": 4.404331788658882e-06, + "loss": 1.1712, + "step": 1444 + }, + { + "epoch": 5.850202429149798, + "grad_norm": 4.57761440040013, + "learning_rate": 4.397316598723385e-06, + "loss": 1.3548, + "step": 1445 + }, + { + "epoch": 5.854251012145749, + "grad_norm": 4.860966996025116, + "learning_rate": 4.390302612464934e-06, + "loss": 1.4071, + "step": 1446 + }, + { + "epoch": 5.8582995951417, + "grad_norm": 3.557234324926702, + "learning_rate": 4.383289843891835e-06, + "loss": 1.3334, + "step": 1447 + }, + { + "epoch": 5.862348178137652, + "grad_norm": 4.6167043083990515, + "learning_rate": 4.376278307009962e-06, + "loss": 1.332, + "step": 1448 + }, + { + "epoch": 5.866396761133603, + "grad_norm": 4.529476800833651, + "learning_rate": 4.369268015822733e-06, + "loss": 1.336, + "step": 1449 + }, + { + "epoch": 5.870445344129554, + "grad_norm": 5.460345634297291, + "learning_rate": 4.362258984331074e-06, + "loss": 1.7992, + "step": 1450 + }, + { + "epoch": 5.874493927125506, + "grad_norm": 4.852544977047948, + "learning_rate": 4.355251226533396e-06, + "loss": 1.7401, + "step": 1451 + }, + { + "epoch": 5.8785425101214575, + "grad_norm": 5.091561572959863, + "learning_rate": 4.348244756425569e-06, + "loss": 1.4945, + "step": 1452 + }, + { + "epoch": 5.882591093117409, + "grad_norm": 4.66519342749034, + "learning_rate": 4.341239588000887e-06, + "loss": 1.4193, + "step": 1453 + }, + { + "epoch": 5.886639676113361, + "grad_norm": 4.442060928034546, + "learning_rate": 4.334235735250047e-06, + "loss": 1.0274, + "step": 1454 + }, + { + "epoch": 5.890688259109312, + "grad_norm": 3.911256400148853, + "learning_rate": 4.327233212161118e-06, + "loss": 1.5401, + "step": 1455 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 3.8807011184816846, + "learning_rate": 4.320232032719511e-06, + "loss": 1.5831, + "step": 1456 + }, + { + "epoch": 5.898785425101215, + "grad_norm": 3.58685678874274, + "learning_rate": 4.313232210907959e-06, + "loss": 1.3268, + "step": 1457 + }, + { + "epoch": 5.902834008097166, + "grad_norm": 4.318238652473736, + "learning_rate": 4.306233760706478e-06, + "loss": 1.3389, + "step": 1458 + }, + { + "epoch": 5.906882591093117, + "grad_norm": 4.611379978717958, + "learning_rate": 4.299236696092347e-06, + "loss": 1.4306, + "step": 1459 + }, + { + "epoch": 5.910931174089069, + "grad_norm": 3.900073554354451, + "learning_rate": 4.292241031040077e-06, + "loss": 1.1163, + "step": 1460 + }, + { + "epoch": 5.91497975708502, + "grad_norm": 4.550673982692945, + "learning_rate": 4.285246779521384e-06, + "loss": 1.2052, + "step": 1461 + }, + { + "epoch": 5.919028340080971, + "grad_norm": 4.574548958146505, + "learning_rate": 4.278253955505163e-06, + "loss": 1.213, + "step": 1462 + }, + { + "epoch": 5.923076923076923, + "grad_norm": 3.5603964829525725, + "learning_rate": 4.271262572957453e-06, + "loss": 1.5401, + "step": 1463 + }, + { + "epoch": 5.9271255060728745, + "grad_norm": 4.899646738920418, + "learning_rate": 4.264272645841419e-06, + "loss": 1.3832, + "step": 1464 + }, + { + "epoch": 5.931174089068826, + "grad_norm": 4.936217075017478, + "learning_rate": 4.2572841881173205e-06, + "loss": 1.3896, + "step": 1465 + }, + { + "epoch": 5.935222672064778, + "grad_norm": 4.841906645627207, + "learning_rate": 4.250297213742473e-06, + "loss": 1.173, + "step": 1466 + }, + { + "epoch": 5.939271255060729, + "grad_norm": 4.652957613099752, + "learning_rate": 4.243311736671239e-06, + "loss": 1.1544, + "step": 1467 + }, + { + "epoch": 5.94331983805668, + "grad_norm": 5.5395351930289864, + "learning_rate": 4.236327770854987e-06, + "loss": 1.4593, + "step": 1468 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 4.423876597754868, + "learning_rate": 4.229345330242067e-06, + "loss": 1.1935, + "step": 1469 + }, + { + "epoch": 5.951417004048583, + "grad_norm": 5.270192860869612, + "learning_rate": 4.222364428777786e-06, + "loss": 1.1325, + "step": 1470 + }, + { + "epoch": 5.955465587044534, + "grad_norm": 5.410786507887627, + "learning_rate": 4.2153850804043706e-06, + "loss": 1.3971, + "step": 1471 + }, + { + "epoch": 5.959514170040486, + "grad_norm": 4.884826922400209, + "learning_rate": 4.2084072990609505e-06, + "loss": 1.4698, + "step": 1472 + }, + { + "epoch": 5.963562753036437, + "grad_norm": 4.313211329480648, + "learning_rate": 4.201431098683527e-06, + "loss": 1.4382, + "step": 1473 + }, + { + "epoch": 5.967611336032388, + "grad_norm": 5.213303398147368, + "learning_rate": 4.194456493204939e-06, + "loss": 1.5175, + "step": 1474 + }, + { + "epoch": 5.97165991902834, + "grad_norm": 5.448304606946485, + "learning_rate": 4.187483496554844e-06, + "loss": 1.433, + "step": 1475 + }, + { + "epoch": 5.9757085020242915, + "grad_norm": 3.801193566372591, + "learning_rate": 4.1805121226596826e-06, + "loss": 1.4114, + "step": 1476 + }, + { + "epoch": 5.979757085020243, + "grad_norm": 4.17077172984551, + "learning_rate": 4.173542385442659e-06, + "loss": 1.4847, + "step": 1477 + }, + { + "epoch": 5.983805668016195, + "grad_norm": 3.8042786020089285, + "learning_rate": 4.166574298823707e-06, + "loss": 1.5417, + "step": 1478 + }, + { + "epoch": 5.987854251012146, + "grad_norm": 4.0974559638165795, + "learning_rate": 4.1596078767194615e-06, + "loss": 1.3383, + "step": 1479 + }, + { + "epoch": 5.991902834008097, + "grad_norm": 3.4327656830127844, + "learning_rate": 4.152643133043236e-06, + "loss": 1.3384, + "step": 1480 + }, + { + "epoch": 5.995951417004049, + "grad_norm": 3.615327810634163, + "learning_rate": 4.145680081704989e-06, + "loss": 1.6541, + "step": 1481 + }, + { + "epoch": 6.0, + "grad_norm": 3.8329106879075594, + "learning_rate": 4.138718736611302e-06, + "loss": 1.3694, + "step": 1482 + }, + { + "epoch": 6.004048582995951, + "grad_norm": 3.830450157141594, + "learning_rate": 4.131759111665349e-06, + "loss": 1.4049, + "step": 1483 + }, + { + "epoch": 6.008097165991903, + "grad_norm": 5.1111426342190684, + "learning_rate": 4.1248012207668635e-06, + "loss": 1.5639, + "step": 1484 + }, + { + "epoch": 6.012145748987854, + "grad_norm": 4.83681122900061, + "learning_rate": 4.117845077812122e-06, + "loss": 1.3693, + "step": 1485 + }, + { + "epoch": 6.016194331983805, + "grad_norm": 5.4329470747052255, + "learning_rate": 4.110890696693906e-06, + "loss": 1.5831, + "step": 1486 + }, + { + "epoch": 6.020242914979757, + "grad_norm": 4.6500916905003535, + "learning_rate": 4.103938091301479e-06, + "loss": 1.7881, + "step": 1487 + }, + { + "epoch": 6.0242914979757085, + "grad_norm": 4.885048703930011, + "learning_rate": 4.096987275520562e-06, + "loss": 1.6668, + "step": 1488 + }, + { + "epoch": 6.02834008097166, + "grad_norm": 4.13626291343727, + "learning_rate": 4.090038263233294e-06, + "loss": 1.3587, + "step": 1489 + }, + { + "epoch": 6.032388663967612, + "grad_norm": 4.904165295750069, + "learning_rate": 4.08309106831822e-06, + "loss": 1.3678, + "step": 1490 + }, + { + "epoch": 6.036437246963563, + "grad_norm": 4.636168977638758, + "learning_rate": 4.0761457046502515e-06, + "loss": 1.5829, + "step": 1491 + }, + { + "epoch": 6.040485829959514, + "grad_norm": 4.665143753358694, + "learning_rate": 4.0692021861006386e-06, + "loss": 1.382, + "step": 1492 + }, + { + "epoch": 6.044534412955466, + "grad_norm": 4.58626969694099, + "learning_rate": 4.062260526536955e-06, + "loss": 1.4891, + "step": 1493 + }, + { + "epoch": 6.048582995951417, + "grad_norm": 4.689483058767236, + "learning_rate": 4.055320739823057e-06, + "loss": 1.3764, + "step": 1494 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 5.0699840890954535, + "learning_rate": 4.048382839819058e-06, + "loss": 1.4399, + "step": 1495 + }, + { + "epoch": 6.05668016194332, + "grad_norm": 4.582891853100069, + "learning_rate": 4.041446840381309e-06, + "loss": 1.2964, + "step": 1496 + }, + { + "epoch": 6.060728744939271, + "grad_norm": 4.596209939663152, + "learning_rate": 4.034512755362361e-06, + "loss": 1.4451, + "step": 1497 + }, + { + "epoch": 6.064777327935222, + "grad_norm": 5.077809534848778, + "learning_rate": 4.027580598610943e-06, + "loss": 1.3934, + "step": 1498 + }, + { + "epoch": 6.068825910931174, + "grad_norm": 5.121648526362897, + "learning_rate": 4.0206503839719335e-06, + "loss": 1.5479, + "step": 1499 + }, + { + "epoch": 6.0728744939271255, + "grad_norm": 4.611548299373776, + "learning_rate": 4.01372212528633e-06, + "loss": 1.4704, + "step": 1500 + }, + { + "epoch": 6.076923076923077, + "grad_norm": 5.312277841332635, + "learning_rate": 4.006795836391226e-06, + "loss": 1.4155, + "step": 1501 + }, + { + "epoch": 6.080971659919029, + "grad_norm": 4.964246172799465, + "learning_rate": 3.999871531119779e-06, + "loss": 1.4857, + "step": 1502 + }, + { + "epoch": 6.08502024291498, + "grad_norm": 4.070954622733409, + "learning_rate": 3.992949223301185e-06, + "loss": 1.4726, + "step": 1503 + }, + { + "epoch": 6.089068825910931, + "grad_norm": 4.91594481744365, + "learning_rate": 3.986028926760655e-06, + "loss": 1.4183, + "step": 1504 + }, + { + "epoch": 6.093117408906883, + "grad_norm": 4.691943755517188, + "learning_rate": 3.9791106553193746e-06, + "loss": 1.497, + "step": 1505 + }, + { + "epoch": 6.097165991902834, + "grad_norm": 4.475695489598384, + "learning_rate": 3.972194422794493e-06, + "loss": 1.2572, + "step": 1506 + }, + { + "epoch": 6.101214574898785, + "grad_norm": 4.947241370368582, + "learning_rate": 3.965280242999083e-06, + "loss": 1.4398, + "step": 1507 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 5.319805507480639, + "learning_rate": 3.9583681297421194e-06, + "loss": 1.3871, + "step": 1508 + }, + { + "epoch": 6.109311740890688, + "grad_norm": 4.749559720069604, + "learning_rate": 3.951458096828449e-06, + "loss": 1.375, + "step": 1509 + }, + { + "epoch": 6.113360323886639, + "grad_norm": 5.727885976477068, + "learning_rate": 3.944550158058762e-06, + "loss": 1.3195, + "step": 1510 + }, + { + "epoch": 6.117408906882591, + "grad_norm": 5.227063382939529, + "learning_rate": 3.937644327229572e-06, + "loss": 1.2256, + "step": 1511 + }, + { + "epoch": 6.1214574898785425, + "grad_norm": 4.738297898420654, + "learning_rate": 3.930740618133173e-06, + "loss": 1.2919, + "step": 1512 + }, + { + "epoch": 6.125506072874494, + "grad_norm": 4.796528713602936, + "learning_rate": 3.923839044557632e-06, + "loss": 1.3028, + "step": 1513 + }, + { + "epoch": 6.129554655870446, + "grad_norm": 5.590663766511934, + "learning_rate": 3.916939620286743e-06, + "loss": 1.1917, + "step": 1514 + }, + { + "epoch": 6.133603238866397, + "grad_norm": 4.16713103068686, + "learning_rate": 3.9100423591000124e-06, + "loss": 1.54, + "step": 1515 + }, + { + "epoch": 6.137651821862348, + "grad_norm": 5.035939317822777, + "learning_rate": 3.903147274772624e-06, + "loss": 1.3571, + "step": 1516 + }, + { + "epoch": 6.1417004048583, + "grad_norm": 4.0009552855543955, + "learning_rate": 3.896254381075416e-06, + "loss": 1.1103, + "step": 1517 + }, + { + "epoch": 6.145748987854251, + "grad_norm": 5.217383616489112, + "learning_rate": 3.8893636917748455e-06, + "loss": 1.4538, + "step": 1518 + }, + { + "epoch": 6.149797570850202, + "grad_norm": 4.709807039436491, + "learning_rate": 3.882475220632975e-06, + "loss": 1.2834, + "step": 1519 + }, + { + "epoch": 6.153846153846154, + "grad_norm": 4.179087376153956, + "learning_rate": 3.875588981407433e-06, + "loss": 1.5023, + "step": 1520 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 5.387448869675948, + "learning_rate": 3.86870498785139e-06, + "loss": 1.5494, + "step": 1521 + }, + { + "epoch": 6.161943319838056, + "grad_norm": 4.138048095732358, + "learning_rate": 3.861823253713535e-06, + "loss": 1.3442, + "step": 1522 + }, + { + "epoch": 6.165991902834008, + "grad_norm": 4.522673016609398, + "learning_rate": 3.854943792738037e-06, + "loss": 1.4306, + "step": 1523 + }, + { + "epoch": 6.17004048582996, + "grad_norm": 4.04807524846957, + "learning_rate": 3.848066618664534e-06, + "loss": 1.9855, + "step": 1524 + }, + { + "epoch": 6.174089068825911, + "grad_norm": 4.797553089745047, + "learning_rate": 3.841191745228091e-06, + "loss": 1.2562, + "step": 1525 + }, + { + "epoch": 6.178137651821863, + "grad_norm": 5.562886515767805, + "learning_rate": 3.834319186159179e-06, + "loss": 1.4532, + "step": 1526 + }, + { + "epoch": 6.182186234817814, + "grad_norm": 3.8582598799938315, + "learning_rate": 3.82744895518365e-06, + "loss": 1.2517, + "step": 1527 + }, + { + "epoch": 6.186234817813765, + "grad_norm": 4.976499846840885, + "learning_rate": 3.8205810660227e-06, + "loss": 1.4395, + "step": 1528 + }, + { + "epoch": 6.190283400809717, + "grad_norm": 5.013759086459238, + "learning_rate": 3.8137155323928526e-06, + "loss": 1.4579, + "step": 1529 + }, + { + "epoch": 6.194331983805668, + "grad_norm": 5.210004353191725, + "learning_rate": 3.8068523680059287e-06, + "loss": 1.6307, + "step": 1530 + }, + { + "epoch": 6.198380566801619, + "grad_norm": 4.444756027075356, + "learning_rate": 3.799991586569012e-06, + "loss": 1.6785, + "step": 1531 + }, + { + "epoch": 6.202429149797571, + "grad_norm": 4.581599022941181, + "learning_rate": 3.7931332017844302e-06, + "loss": 1.3911, + "step": 1532 + }, + { + "epoch": 6.206477732793522, + "grad_norm": 4.426732929526946, + "learning_rate": 3.786277227349724e-06, + "loss": 1.7226, + "step": 1533 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 4.573503321332187, + "learning_rate": 3.77942367695762e-06, + "loss": 1.7276, + "step": 1534 + }, + { + "epoch": 6.2145748987854255, + "grad_norm": 4.632474175205992, + "learning_rate": 3.7725725642960047e-06, + "loss": 1.4984, + "step": 1535 + }, + { + "epoch": 6.218623481781377, + "grad_norm": 5.004422527391663, + "learning_rate": 3.7657239030478927e-06, + "loss": 1.3822, + "step": 1536 + }, + { + "epoch": 6.222672064777328, + "grad_norm": 4.730329238431976, + "learning_rate": 3.758877706891407e-06, + "loss": 1.3005, + "step": 1537 + }, + { + "epoch": 6.22672064777328, + "grad_norm": 4.696618081800561, + "learning_rate": 3.752033989499742e-06, + "loss": 1.4995, + "step": 1538 + }, + { + "epoch": 6.230769230769231, + "grad_norm": 4.819216438393582, + "learning_rate": 3.7451927645411466e-06, + "loss": 1.2958, + "step": 1539 + }, + { + "epoch": 6.234817813765182, + "grad_norm": 5.4741629869641315, + "learning_rate": 3.7383540456788915e-06, + "loss": 1.5321, + "step": 1540 + }, + { + "epoch": 6.238866396761134, + "grad_norm": 5.271140694357475, + "learning_rate": 3.7315178465712364e-06, + "loss": 1.4701, + "step": 1541 + }, + { + "epoch": 6.242914979757085, + "grad_norm": 4.870369052928556, + "learning_rate": 3.7246841808714172e-06, + "loss": 1.4965, + "step": 1542 + }, + { + "epoch": 6.246963562753036, + "grad_norm": 4.627274116359122, + "learning_rate": 3.717853062227604e-06, + "loss": 1.3376, + "step": 1543 + }, + { + "epoch": 6.251012145748988, + "grad_norm": 4.862725711210235, + "learning_rate": 3.7110245042828786e-06, + "loss": 1.436, + "step": 1544 + }, + { + "epoch": 6.255060728744939, + "grad_norm": 4.948809530195508, + "learning_rate": 3.704198520675214e-06, + "loss": 1.3922, + "step": 1545 + }, + { + "epoch": 6.2591093117408905, + "grad_norm": 4.36897138423846, + "learning_rate": 3.69737512503744e-06, + "loss": 1.3391, + "step": 1546 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 4.774874457232701, + "learning_rate": 3.690554330997215e-06, + "loss": 1.1307, + "step": 1547 + }, + { + "epoch": 6.267206477732794, + "grad_norm": 4.560395256546156, + "learning_rate": 3.6837361521770056e-06, + "loss": 1.4205, + "step": 1548 + }, + { + "epoch": 6.271255060728745, + "grad_norm": 4.657377226532245, + "learning_rate": 3.6769206021940505e-06, + "loss": 1.4284, + "step": 1549 + }, + { + "epoch": 6.275303643724697, + "grad_norm": 4.523918352960143, + "learning_rate": 3.670107694660343e-06, + "loss": 1.4865, + "step": 1550 + }, + { + "epoch": 6.279352226720648, + "grad_norm": 6.060799013063325, + "learning_rate": 3.6632974431825965e-06, + "loss": 1.4177, + "step": 1551 + }, + { + "epoch": 6.283400809716599, + "grad_norm": 5.508975855268233, + "learning_rate": 3.656489861362218e-06, + "loss": 1.0975, + "step": 1552 + }, + { + "epoch": 6.287449392712551, + "grad_norm": 5.591620230854365, + "learning_rate": 3.6496849627952875e-06, + "loss": 1.2607, + "step": 1553 + }, + { + "epoch": 6.291497975708502, + "grad_norm": 5.501342695470275, + "learning_rate": 3.6428827610725203e-06, + "loss": 1.113, + "step": 1554 + }, + { + "epoch": 6.295546558704453, + "grad_norm": 5.371568603468503, + "learning_rate": 3.636083269779249e-06, + "loss": 1.3579, + "step": 1555 + }, + { + "epoch": 6.299595141700405, + "grad_norm": 4.658495618502483, + "learning_rate": 3.6292865024953945e-06, + "loss": 1.5612, + "step": 1556 + }, + { + "epoch": 6.303643724696356, + "grad_norm": 5.171922327948163, + "learning_rate": 3.622492472795432e-06, + "loss": 1.196, + "step": 1557 + }, + { + "epoch": 6.3076923076923075, + "grad_norm": 5.187630245267101, + "learning_rate": 3.615701194248375e-06, + "loss": 1.2403, + "step": 1558 + }, + { + "epoch": 6.3117408906882595, + "grad_norm": 4.739560149771274, + "learning_rate": 3.6089126804177373e-06, + "loss": 1.2748, + "step": 1559 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 5.8421200692609405, + "learning_rate": 3.6021269448615148e-06, + "loss": 1.1801, + "step": 1560 + }, + { + "epoch": 6.319838056680162, + "grad_norm": 5.003939683781086, + "learning_rate": 3.595344001132154e-06, + "loss": 1.1334, + "step": 1561 + }, + { + "epoch": 6.323886639676114, + "grad_norm": 5.213320704625486, + "learning_rate": 3.5885638627765228e-06, + "loss": 1.1662, + "step": 1562 + }, + { + "epoch": 6.327935222672065, + "grad_norm": 5.12672208334294, + "learning_rate": 3.5817865433358902e-06, + "loss": 1.1897, + "step": 1563 + }, + { + "epoch": 6.331983805668016, + "grad_norm": 4.990310131147776, + "learning_rate": 3.5750120563458924e-06, + "loss": 1.2197, + "step": 1564 + }, + { + "epoch": 6.336032388663968, + "grad_norm": 5.404582388895142, + "learning_rate": 3.568240415336509e-06, + "loss": 1.2979, + "step": 1565 + }, + { + "epoch": 6.340080971659919, + "grad_norm": 4.459387759024826, + "learning_rate": 3.5614716338320384e-06, + "loss": 1.2379, + "step": 1566 + }, + { + "epoch": 6.34412955465587, + "grad_norm": 4.906670384808422, + "learning_rate": 3.554705725351063e-06, + "loss": 1.1656, + "step": 1567 + }, + { + "epoch": 6.348178137651822, + "grad_norm": 5.788345645390745, + "learning_rate": 3.547942703406433e-06, + "loss": 1.3082, + "step": 1568 + }, + { + "epoch": 6.352226720647773, + "grad_norm": 5.367912057539721, + "learning_rate": 3.5411825815052296e-06, + "loss": 1.313, + "step": 1569 + }, + { + "epoch": 6.3562753036437245, + "grad_norm": 5.326205519895874, + "learning_rate": 3.534425373148741e-06, + "loss": 0.9762, + "step": 1570 + }, + { + "epoch": 6.3603238866396765, + "grad_norm": 5.708844505808687, + "learning_rate": 3.52767109183244e-06, + "loss": 1.373, + "step": 1571 + }, + { + "epoch": 6.364372469635628, + "grad_norm": 4.876273122171325, + "learning_rate": 3.5209197510459526e-06, + "loss": 1.448, + "step": 1572 + }, + { + "epoch": 6.368421052631579, + "grad_norm": 4.935122614604545, + "learning_rate": 3.5141713642730305e-06, + "loss": 1.3476, + "step": 1573 + }, + { + "epoch": 6.372469635627531, + "grad_norm": 6.109929961302762, + "learning_rate": 3.507425944991529e-06, + "loss": 1.4072, + "step": 1574 + }, + { + "epoch": 6.376518218623482, + "grad_norm": 5.409803828147351, + "learning_rate": 3.5006835066733707e-06, + "loss": 1.0987, + "step": 1575 + }, + { + "epoch": 6.380566801619433, + "grad_norm": 5.907878971006872, + "learning_rate": 3.4939440627845305e-06, + "loss": 1.2467, + "step": 1576 + }, + { + "epoch": 6.384615384615385, + "grad_norm": 5.060588652380501, + "learning_rate": 3.4872076267850015e-06, + "loss": 1.0512, + "step": 1577 + }, + { + "epoch": 6.388663967611336, + "grad_norm": 6.199263715395586, + "learning_rate": 3.480474212128766e-06, + "loss": 1.1192, + "step": 1578 + }, + { + "epoch": 6.392712550607287, + "grad_norm": 5.68773960369221, + "learning_rate": 3.473743832263778e-06, + "loss": 1.2989, + "step": 1579 + }, + { + "epoch": 6.396761133603239, + "grad_norm": 6.5411566006758886, + "learning_rate": 3.4670165006319236e-06, + "loss": 1.1125, + "step": 1580 + }, + { + "epoch": 6.40080971659919, + "grad_norm": 4.779266992013558, + "learning_rate": 3.4602922306690062e-06, + "loss": 1.1461, + "step": 1581 + }, + { + "epoch": 6.4048582995951415, + "grad_norm": 4.983422698218311, + "learning_rate": 3.453571035804714e-06, + "loss": 1.1805, + "step": 1582 + }, + { + "epoch": 6.4089068825910935, + "grad_norm": 6.281439869347411, + "learning_rate": 3.4468529294625895e-06, + "loss": 1.2865, + "step": 1583 + }, + { + "epoch": 6.412955465587045, + "grad_norm": 5.447638251945489, + "learning_rate": 3.4401379250600124e-06, + "loss": 1.112, + "step": 1584 + }, + { + "epoch": 6.417004048582996, + "grad_norm": 6.031371603465583, + "learning_rate": 3.433426036008163e-06, + "loss": 1.4222, + "step": 1585 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 6.344172383462025, + "learning_rate": 3.4267172757120005e-06, + "loss": 1.4558, + "step": 1586 + }, + { + "epoch": 6.425101214574899, + "grad_norm": 5.253990555737164, + "learning_rate": 3.420011657570238e-06, + "loss": 1.4408, + "step": 1587 + }, + { + "epoch": 6.42914979757085, + "grad_norm": 5.944240629250275, + "learning_rate": 3.413309194975309e-06, + "loss": 1.4281, + "step": 1588 + }, + { + "epoch": 6.433198380566802, + "grad_norm": 4.690048614883703, + "learning_rate": 3.406609901313349e-06, + "loss": 1.6038, + "step": 1589 + }, + { + "epoch": 6.437246963562753, + "grad_norm": 5.538761343018897, + "learning_rate": 3.39991378996416e-06, + "loss": 1.3818, + "step": 1590 + }, + { + "epoch": 6.441295546558704, + "grad_norm": 5.904913245197766, + "learning_rate": 3.393220874301193e-06, + "loss": 1.324, + "step": 1591 + }, + { + "epoch": 6.445344129554655, + "grad_norm": 4.935839021246995, + "learning_rate": 3.386531167691512e-06, + "loss": 1.569, + "step": 1592 + }, + { + "epoch": 6.449392712550607, + "grad_norm": 5.96200793571726, + "learning_rate": 3.379844683495775e-06, + "loss": 1.3697, + "step": 1593 + }, + { + "epoch": 6.4534412955465585, + "grad_norm": 5.74218375449931, + "learning_rate": 3.3731614350682045e-06, + "loss": 1.3591, + "step": 1594 + }, + { + "epoch": 6.4574898785425106, + "grad_norm": 5.819819829923634, + "learning_rate": 3.36648143575656e-06, + "loss": 1.7039, + "step": 1595 + }, + { + "epoch": 6.461538461538462, + "grad_norm": 7.530849687169004, + "learning_rate": 3.3598046989021073e-06, + "loss": 1.8161, + "step": 1596 + }, + { + "epoch": 6.465587044534413, + "grad_norm": 5.773184926893142, + "learning_rate": 3.3531312378396026e-06, + "loss": 1.506, + "step": 1597 + }, + { + "epoch": 6.469635627530364, + "grad_norm": 5.095389257052112, + "learning_rate": 3.3464610658972584e-06, + "loss": 1.5432, + "step": 1598 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 4.864855264853332, + "learning_rate": 3.3397941963967162e-06, + "loss": 1.502, + "step": 1599 + }, + { + "epoch": 6.477732793522267, + "grad_norm": 6.57365780985993, + "learning_rate": 3.333130642653024e-06, + "loss": 1.5104, + "step": 1600 + }, + { + "epoch": 6.481781376518219, + "grad_norm": 4.515682901106996, + "learning_rate": 3.326470417974604e-06, + "loss": 1.4218, + "step": 1601 + }, + { + "epoch": 6.48582995951417, + "grad_norm": 5.044572956084713, + "learning_rate": 3.3198135356632353e-06, + "loss": 1.3685, + "step": 1602 + }, + { + "epoch": 6.489878542510121, + "grad_norm": 6.114856919793026, + "learning_rate": 3.313160009014017e-06, + "loss": 1.3026, + "step": 1603 + }, + { + "epoch": 6.493927125506072, + "grad_norm": 6.169486015477941, + "learning_rate": 3.3065098513153473e-06, + "loss": 1.2931, + "step": 1604 + }, + { + "epoch": 6.497975708502024, + "grad_norm": 4.671907121620305, + "learning_rate": 3.299863075848898e-06, + "loss": 1.203, + "step": 1605 + }, + { + "epoch": 6.502024291497976, + "grad_norm": 5.556963177721959, + "learning_rate": 3.2932196958895816e-06, + "loss": 1.0369, + "step": 1606 + }, + { + "epoch": 6.506072874493928, + "grad_norm": 6.041668515369977, + "learning_rate": 3.2865797247055354e-06, + "loss": 1.4057, + "step": 1607 + }, + { + "epoch": 6.510121457489879, + "grad_norm": 5.622532023329238, + "learning_rate": 3.2799431755580814e-06, + "loss": 1.3496, + "step": 1608 + }, + { + "epoch": 6.51417004048583, + "grad_norm": 4.164381858883872, + "learning_rate": 3.2733100617017126e-06, + "loss": 1.3227, + "step": 1609 + }, + { + "epoch": 6.518218623481781, + "grad_norm": 5.565945707547888, + "learning_rate": 3.266680396384061e-06, + "loss": 1.3552, + "step": 1610 + }, + { + "epoch": 6.522267206477733, + "grad_norm": 6.1834705735871855, + "learning_rate": 3.2600541928458664e-06, + "loss": 1.2943, + "step": 1611 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 6.088692550743796, + "learning_rate": 3.2534314643209597e-06, + "loss": 1.132, + "step": 1612 + }, + { + "epoch": 6.530364372469636, + "grad_norm": 5.618439646445004, + "learning_rate": 3.2468122240362287e-06, + "loss": 1.2075, + "step": 1613 + }, + { + "epoch": 6.534412955465587, + "grad_norm": 6.117262117177891, + "learning_rate": 3.2401964852115954e-06, + "loss": 1.2648, + "step": 1614 + }, + { + "epoch": 6.538461538461538, + "grad_norm": 5.488938699999532, + "learning_rate": 3.233584261059991e-06, + "loss": 1.5484, + "step": 1615 + }, + { + "epoch": 6.5425101214574894, + "grad_norm": 4.965386729846099, + "learning_rate": 3.226975564787322e-06, + "loss": 1.486, + "step": 1616 + }, + { + "epoch": 6.5465587044534415, + "grad_norm": 18.62707478890267, + "learning_rate": 3.2203704095924536e-06, + "loss": 2.0005, + "step": 1617 + }, + { + "epoch": 6.550607287449393, + "grad_norm": 9.55782070389464, + "learning_rate": 3.213768808667177e-06, + "loss": 1.7957, + "step": 1618 + }, + { + "epoch": 6.554655870445345, + "grad_norm": 9.720812117855125, + "learning_rate": 3.2071707751961838e-06, + "loss": 2.144, + "step": 1619 + }, + { + "epoch": 6.558704453441296, + "grad_norm": 5.342719089296339, + "learning_rate": 3.200576322357044e-06, + "loss": 1.3436, + "step": 1620 + }, + { + "epoch": 6.562753036437247, + "grad_norm": 4.64296304030207, + "learning_rate": 3.1939854633201727e-06, + "loss": 1.2129, + "step": 1621 + }, + { + "epoch": 6.566801619433198, + "grad_norm": 4.806685098084674, + "learning_rate": 3.187398211248811e-06, + "loss": 1.5973, + "step": 1622 + }, + { + "epoch": 6.57085020242915, + "grad_norm": 5.159929877257071, + "learning_rate": 3.1808145792989914e-06, + "loss": 1.2471, + "step": 1623 + }, + { + "epoch": 6.574898785425101, + "grad_norm": 4.881818219879603, + "learning_rate": 3.1742345806195196e-06, + "loss": 1.4285, + "step": 1624 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 4.079931587528226, + "learning_rate": 3.1676582283519454e-06, + "loss": 1.2586, + "step": 1625 + }, + { + "epoch": 6.582995951417004, + "grad_norm": 5.067504014062879, + "learning_rate": 3.1610855356305354e-06, + "loss": 1.3673, + "step": 1626 + }, + { + "epoch": 6.587044534412955, + "grad_norm": 4.954367681109359, + "learning_rate": 3.1545165155822453e-06, + "loss": 1.3681, + "step": 1627 + }, + { + "epoch": 6.5910931174089065, + "grad_norm": 5.605429782413848, + "learning_rate": 3.1479511813267006e-06, + "loss": 1.3636, + "step": 1628 + }, + { + "epoch": 6.5951417004048585, + "grad_norm": 4.958815188693233, + "learning_rate": 3.141389545976159e-06, + "loss": 1.0862, + "step": 1629 + }, + { + "epoch": 6.59919028340081, + "grad_norm": 4.427052082332069, + "learning_rate": 3.134831622635496e-06, + "loss": 1.1727, + "step": 1630 + }, + { + "epoch": 6.603238866396762, + "grad_norm": 4.453414798921641, + "learning_rate": 3.1282774244021717e-06, + "loss": 1.2508, + "step": 1631 + }, + { + "epoch": 6.607287449392713, + "grad_norm": 5.086142474437995, + "learning_rate": 3.1217269643662063e-06, + "loss": 1.0497, + "step": 1632 + }, + { + "epoch": 6.611336032388664, + "grad_norm": 5.252726223787453, + "learning_rate": 3.115180255610154e-06, + "loss": 1.352, + "step": 1633 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 4.618158368136601, + "learning_rate": 3.1086373112090762e-06, + "loss": 1.3803, + "step": 1634 + }, + { + "epoch": 6.619433198380567, + "grad_norm": 5.797639722448207, + "learning_rate": 3.1020981442305187e-06, + "loss": 1.1187, + "step": 1635 + }, + { + "epoch": 6.623481781376518, + "grad_norm": 5.892627204449989, + "learning_rate": 3.095562767734481e-06, + "loss": 1.4805, + "step": 1636 + }, + { + "epoch": 6.62753036437247, + "grad_norm": 4.995284041826363, + "learning_rate": 3.089031194773392e-06, + "loss": 1.2999, + "step": 1637 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 5.424221812925032, + "learning_rate": 3.082503438392086e-06, + "loss": 1.5812, + "step": 1638 + }, + { + "epoch": 6.635627530364372, + "grad_norm": 4.773802128035484, + "learning_rate": 3.0759795116277723e-06, + "loss": 1.1799, + "step": 1639 + }, + { + "epoch": 6.6396761133603235, + "grad_norm": 5.573651737656804, + "learning_rate": 3.069459427510014e-06, + "loss": 1.4498, + "step": 1640 + }, + { + "epoch": 6.6437246963562755, + "grad_norm": 4.742522853775909, + "learning_rate": 3.0629431990607e-06, + "loss": 1.3417, + "step": 1641 + }, + { + "epoch": 6.647773279352227, + "grad_norm": 5.292712065001537, + "learning_rate": 3.056430839294015e-06, + "loss": 1.45, + "step": 1642 + }, + { + "epoch": 6.651821862348179, + "grad_norm": 4.5550435224065335, + "learning_rate": 3.049922361216422e-06, + "loss": 1.2275, + "step": 1643 + }, + { + "epoch": 6.65587044534413, + "grad_norm": 5.633966620000232, + "learning_rate": 3.043417777826627e-06, + "loss": 1.4383, + "step": 1644 + }, + { + "epoch": 6.659919028340081, + "grad_norm": 5.977264180838899, + "learning_rate": 3.036917102115561e-06, + "loss": 1.2502, + "step": 1645 + }, + { + "epoch": 6.663967611336032, + "grad_norm": 5.050359221231472, + "learning_rate": 3.0304203470663507e-06, + "loss": 1.4135, + "step": 1646 + }, + { + "epoch": 6.668016194331984, + "grad_norm": 5.3518078778159435, + "learning_rate": 3.023927525654288e-06, + "loss": 1.4064, + "step": 1647 + }, + { + "epoch": 6.672064777327935, + "grad_norm": 5.575471681679863, + "learning_rate": 3.017438650846815e-06, + "loss": 1.5635, + "step": 1648 + }, + { + "epoch": 6.676113360323887, + "grad_norm": 4.758858070207382, + "learning_rate": 3.0109537356034856e-06, + "loss": 1.5306, + "step": 1649 + }, + { + "epoch": 6.680161943319838, + "grad_norm": 5.646630068141117, + "learning_rate": 3.0044727928759487e-06, + "loss": 1.3876, + "step": 1650 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 5.245224305674558, + "learning_rate": 2.9979958356079195e-06, + "loss": 1.2497, + "step": 1651 + }, + { + "epoch": 6.6882591093117405, + "grad_norm": 4.976281468525487, + "learning_rate": 2.991522876735154e-06, + "loss": 1.3506, + "step": 1652 + }, + { + "epoch": 6.6923076923076925, + "grad_norm": 5.375432065764104, + "learning_rate": 2.98505392918542e-06, + "loss": 1.3676, + "step": 1653 + }, + { + "epoch": 6.696356275303644, + "grad_norm": 4.849539565202561, + "learning_rate": 2.978589005878476e-06, + "loss": 1.2348, + "step": 1654 + }, + { + "epoch": 6.700404858299595, + "grad_norm": 6.373782199327902, + "learning_rate": 2.9721281197260427e-06, + "loss": 1.6916, + "step": 1655 + }, + { + "epoch": 6.704453441295547, + "grad_norm": 5.797065404713431, + "learning_rate": 2.965671283631778e-06, + "loss": 1.4917, + "step": 1656 + }, + { + "epoch": 6.708502024291498, + "grad_norm": 5.561054188837486, + "learning_rate": 2.959218510491252e-06, + "loss": 1.1089, + "step": 1657 + }, + { + "epoch": 6.712550607287449, + "grad_norm": 4.841361841602314, + "learning_rate": 2.9527698131919156e-06, + "loss": 1.2314, + "step": 1658 + }, + { + "epoch": 6.716599190283401, + "grad_norm": 4.961647413029597, + "learning_rate": 2.9463252046130884e-06, + "loss": 1.3488, + "step": 1659 + }, + { + "epoch": 6.720647773279352, + "grad_norm": 6.030520417168003, + "learning_rate": 2.9398846976259136e-06, + "loss": 1.1124, + "step": 1660 + }, + { + "epoch": 6.724696356275303, + "grad_norm": 5.376150681226648, + "learning_rate": 2.9334483050933506e-06, + "loss": 1.3305, + "step": 1661 + }, + { + "epoch": 6.728744939271255, + "grad_norm": 4.997899902629033, + "learning_rate": 2.9270160398701387e-06, + "loss": 1.4987, + "step": 1662 + }, + { + "epoch": 6.732793522267206, + "grad_norm": 5.003930672267123, + "learning_rate": 2.920587914802772e-06, + "loss": 1.2143, + "step": 1663 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 5.099065318842715, + "learning_rate": 2.91416394272948e-06, + "loss": 1.3239, + "step": 1664 + }, + { + "epoch": 6.7408906882591095, + "grad_norm": 5.065783888856437, + "learning_rate": 2.907744136480194e-06, + "loss": 1.9473, + "step": 1665 + }, + { + "epoch": 6.744939271255061, + "grad_norm": 4.828636889161134, + "learning_rate": 2.901328508876531e-06, + "loss": 1.4691, + "step": 1666 + }, + { + "epoch": 6.748987854251012, + "grad_norm": 5.887659634670204, + "learning_rate": 2.894917072731753e-06, + "loss": 1.2826, + "step": 1667 + }, + { + "epoch": 6.753036437246964, + "grad_norm": 5.421606621102472, + "learning_rate": 2.88850984085076e-06, + "loss": 1.1948, + "step": 1668 + }, + { + "epoch": 6.757085020242915, + "grad_norm": 5.2144985221753615, + "learning_rate": 2.8821068260300505e-06, + "loss": 1.3159, + "step": 1669 + }, + { + "epoch": 6.761133603238866, + "grad_norm": 6.35388499196324, + "learning_rate": 2.8757080410577042e-06, + "loss": 2.064, + "step": 1670 + }, + { + "epoch": 6.765182186234818, + "grad_norm": 6.533956411029131, + "learning_rate": 2.8693134987133464e-06, + "loss": 1.8202, + "step": 1671 + }, + { + "epoch": 6.769230769230769, + "grad_norm": 7.388143224357747, + "learning_rate": 2.8629232117681354e-06, + "loss": 1.7417, + "step": 1672 + }, + { + "epoch": 6.77327935222672, + "grad_norm": 4.928577825497661, + "learning_rate": 2.8565371929847286e-06, + "loss": 1.2534, + "step": 1673 + }, + { + "epoch": 6.777327935222672, + "grad_norm": 5.033866214652084, + "learning_rate": 2.8501554551172613e-06, + "loss": 1.5421, + "step": 1674 + }, + { + "epoch": 6.781376518218623, + "grad_norm": 4.739685237811317, + "learning_rate": 2.843778010911311e-06, + "loss": 1.5263, + "step": 1675 + }, + { + "epoch": 6.7854251012145745, + "grad_norm": 5.136372890884333, + "learning_rate": 2.83740487310389e-06, + "loss": 1.3327, + "step": 1676 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 4.941908173697463, + "learning_rate": 2.8310360544234057e-06, + "loss": 1.2674, + "step": 1677 + }, + { + "epoch": 6.793522267206478, + "grad_norm": 5.393271110505753, + "learning_rate": 2.8246715675896354e-06, + "loss": 1.2836, + "step": 1678 + }, + { + "epoch": 6.797570850202429, + "grad_norm": 5.454849249006355, + "learning_rate": 2.81831142531371e-06, + "loss": 1.3156, + "step": 1679 + }, + { + "epoch": 6.801619433198381, + "grad_norm": 4.939088394387297, + "learning_rate": 2.811955640298083e-06, + "loss": 1.2068, + "step": 1680 + }, + { + "epoch": 6.805668016194332, + "grad_norm": 4.809916773128364, + "learning_rate": 2.8056042252365046e-06, + "loss": 1.0997, + "step": 1681 + }, + { + "epoch": 6.809716599190283, + "grad_norm": 5.329896547784682, + "learning_rate": 2.7992571928139984e-06, + "loss": 1.4471, + "step": 1682 + }, + { + "epoch": 6.813765182186235, + "grad_norm": 6.511906878209839, + "learning_rate": 2.7929145557068303e-06, + "loss": 1.2595, + "step": 1683 + }, + { + "epoch": 6.817813765182186, + "grad_norm": 5.372364570471038, + "learning_rate": 2.786576326582493e-06, + "loss": 1.1699, + "step": 1684 + }, + { + "epoch": 6.821862348178137, + "grad_norm": 13.8652581579135, + "learning_rate": 2.780242518099675e-06, + "loss": 2.2106, + "step": 1685 + }, + { + "epoch": 6.825910931174089, + "grad_norm": 25.171093577196388, + "learning_rate": 2.7739131429082373e-06, + "loss": 3.2586, + "step": 1686 + }, + { + "epoch": 6.82995951417004, + "grad_norm": 5.726221697590718, + "learning_rate": 2.7675882136491795e-06, + "loss": 1.1889, + "step": 1687 + }, + { + "epoch": 6.834008097165992, + "grad_norm": 5.969801910273205, + "learning_rate": 2.761267742954629e-06, + "loss": 1.1408, + "step": 1688 + }, + { + "epoch": 6.838056680161944, + "grad_norm": 5.061214863990714, + "learning_rate": 2.7549517434478063e-06, + "loss": 1.1687, + "step": 1689 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 4.867474293725249, + "learning_rate": 2.7486402277430026e-06, + "loss": 1.2449, + "step": 1690 + }, + { + "epoch": 6.846153846153846, + "grad_norm": 5.1018055774076645, + "learning_rate": 2.7423332084455543e-06, + "loss": 1.0478, + "step": 1691 + }, + { + "epoch": 6.850202429149798, + "grad_norm": 6.018705752891283, + "learning_rate": 2.736030698151815e-06, + "loss": 1.2496, + "step": 1692 + }, + { + "epoch": 6.854251012145749, + "grad_norm": 6.104939352615399, + "learning_rate": 2.7297327094491344e-06, + "loss": 1.287, + "step": 1693 + }, + { + "epoch": 6.8582995951417, + "grad_norm": 4.340656711987505, + "learning_rate": 2.723439254915834e-06, + "loss": 1.2266, + "step": 1694 + }, + { + "epoch": 6.862348178137652, + "grad_norm": 5.698807470646283, + "learning_rate": 2.717150347121177e-06, + "loss": 1.2273, + "step": 1695 + }, + { + "epoch": 6.866396761133603, + "grad_norm": 5.5042411488110154, + "learning_rate": 2.710865998625348e-06, + "loss": 1.2081, + "step": 1696 + }, + { + "epoch": 6.870445344129554, + "grad_norm": 6.8240067723829405, + "learning_rate": 2.704586221979422e-06, + "loss": 1.6486, + "step": 1697 + }, + { + "epoch": 6.874493927125506, + "grad_norm": 5.905111755452213, + "learning_rate": 2.698311029725346e-06, + "loss": 1.5976, + "step": 1698 + }, + { + "epoch": 6.8785425101214575, + "grad_norm": 6.1571466759316, + "learning_rate": 2.6920404343959106e-06, + "loss": 1.3605, + "step": 1699 + }, + { + "epoch": 6.882591093117409, + "grad_norm": 5.716713309024074, + "learning_rate": 2.6857744485147286e-06, + "loss": 1.2964, + "step": 1700 + }, + { + "epoch": 6.886639676113361, + "grad_norm": 5.42925803199323, + "learning_rate": 2.6795130845961993e-06, + "loss": 0.9267, + "step": 1701 + }, + { + "epoch": 6.890688259109312, + "grad_norm": 4.919365319165041, + "learning_rate": 2.673256355145499e-06, + "loss": 1.4449, + "step": 1702 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 4.863542774795551, + "learning_rate": 2.667004272658541e-06, + "loss": 1.4657, + "step": 1703 + }, + { + "epoch": 6.898785425101215, + "grad_norm": 4.299136007306504, + "learning_rate": 2.660756849621962e-06, + "loss": 1.2369, + "step": 1704 + }, + { + "epoch": 6.902834008097166, + "grad_norm": 5.213129071990759, + "learning_rate": 2.6545140985130934e-06, + "loss": 1.2244, + "step": 1705 + }, + { + "epoch": 6.906882591093117, + "grad_norm": 5.578872418777055, + "learning_rate": 2.6482760317999338e-06, + "loss": 1.2811, + "step": 1706 + }, + { + "epoch": 6.910931174089069, + "grad_norm": 4.626194423109011, + "learning_rate": 2.642042661941129e-06, + "loss": 1.0198, + "step": 1707 + }, + { + "epoch": 6.91497975708502, + "grad_norm": 5.352887557319016, + "learning_rate": 2.635814001385938e-06, + "loss": 1.1012, + "step": 1708 + }, + { + "epoch": 6.919028340080971, + "grad_norm": 5.579613506703107, + "learning_rate": 2.629590062574221e-06, + "loss": 1.1085, + "step": 1709 + }, + { + "epoch": 6.923076923076923, + "grad_norm": 4.252011072382573, + "learning_rate": 2.623370857936404e-06, + "loss": 1.431, + "step": 1710 + }, + { + "epoch": 6.9271255060728745, + "grad_norm": 5.916388957924838, + "learning_rate": 2.6171563998934605e-06, + "loss": 1.2774, + "step": 1711 + }, + { + "epoch": 6.931174089068826, + "grad_norm": 5.953432162823518, + "learning_rate": 2.610946700856885e-06, + "loss": 1.2618, + "step": 1712 + }, + { + "epoch": 6.935222672064778, + "grad_norm": 6.19929364838639, + "learning_rate": 2.604741773228661e-06, + "loss": 1.0577, + "step": 1713 + }, + { + "epoch": 6.939271255060729, + "grad_norm": 5.789164804068839, + "learning_rate": 2.5985416294012487e-06, + "loss": 1.0688, + "step": 1714 + }, + { + "epoch": 6.94331983805668, + "grad_norm": 6.659571736165462, + "learning_rate": 2.592346281757552e-06, + "loss": 1.3636, + "step": 1715 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 5.314697446259228, + "learning_rate": 2.586155742670897e-06, + "loss": 1.0952, + "step": 1716 + }, + { + "epoch": 6.951417004048583, + "grad_norm": 6.659337503952005, + "learning_rate": 2.5799700245050074e-06, + "loss": 1.0229, + "step": 1717 + }, + { + "epoch": 6.955465587044534, + "grad_norm": 6.65312440022192, + "learning_rate": 2.5737891396139713e-06, + "loss": 1.3201, + "step": 1718 + }, + { + "epoch": 6.959514170040486, + "grad_norm": 5.938881485697329, + "learning_rate": 2.5676131003422317e-06, + "loss": 1.3962, + "step": 1719 + }, + { + "epoch": 6.963562753036437, + "grad_norm": 5.4389936951171025, + "learning_rate": 2.561441919024551e-06, + "loss": 1.346, + "step": 1720 + }, + { + "epoch": 6.967611336032388, + "grad_norm": 6.814603646499591, + "learning_rate": 2.5552756079859904e-06, + "loss": 1.3755, + "step": 1721 + }, + { + "epoch": 6.97165991902834, + "grad_norm": 6.557034047725967, + "learning_rate": 2.549114179541884e-06, + "loss": 1.2917, + "step": 1722 + }, + { + "epoch": 6.9757085020242915, + "grad_norm": 4.666089006915814, + "learning_rate": 2.542957645997811e-06, + "loss": 1.3178, + "step": 1723 + }, + { + "epoch": 6.979757085020243, + "grad_norm": 5.4101007526641, + "learning_rate": 2.5368060196495785e-06, + "loss": 1.3848, + "step": 1724 + }, + { + "epoch": 6.983805668016195, + "grad_norm": 5.003638917729553, + "learning_rate": 2.530659312783192e-06, + "loss": 1.4391, + "step": 1725 + }, + { + "epoch": 6.987854251012146, + "grad_norm": 4.982884862825928, + "learning_rate": 2.5245175376748334e-06, + "loss": 1.2329, + "step": 1726 + }, + { + "epoch": 6.991902834008097, + "grad_norm": 4.383040697186735, + "learning_rate": 2.5183807065908296e-06, + "loss": 1.2466, + "step": 1727 + }, + { + "epoch": 6.995951417004049, + "grad_norm": 4.833585025134396, + "learning_rate": 2.512248831787639e-06, + "loss": 1.5637, + "step": 1728 + }, + { + "epoch": 7.0, + "grad_norm": 4.848560799578388, + "learning_rate": 2.5061219255118186e-06, + "loss": 1.2677, + "step": 1729 + }, + { + "epoch": 7.004048582995951, + "grad_norm": 4.901375359150507, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.3023, + "step": 1730 + }, + { + "epoch": 7.008097165991903, + "grad_norm": 6.545083705424055, + "learning_rate": 2.4938830674788756e-06, + "loss": 1.4651, + "step": 1731 + }, + { + "epoch": 7.012145748987854, + "grad_norm": 6.141277943301318, + "learning_rate": 2.4877711401651562e-06, + "loss": 1.2554, + "step": 1732 + }, + { + "epoch": 7.016194331983805, + "grad_norm": 6.544269798324027, + "learning_rate": 2.4816642302655634e-06, + "loss": 1.479, + "step": 1733 + }, + { + "epoch": 7.020242914979757, + "grad_norm": 5.746379418360751, + "learning_rate": 2.475562349976791e-06, + "loss": 1.656, + "step": 1734 + }, + { + "epoch": 7.0242914979757085, + "grad_norm": 6.035436258524213, + "learning_rate": 2.4694655114854936e-06, + "loss": 1.5592, + "step": 1735 + }, + { + "epoch": 7.02834008097166, + "grad_norm": 5.223633858026752, + "learning_rate": 2.4633737269682546e-06, + "loss": 1.2619, + "step": 1736 + }, + { + "epoch": 7.032388663967612, + "grad_norm": 5.890887028411126, + "learning_rate": 2.4572870085915628e-06, + "loss": 1.2686, + "step": 1737 + }, + { + "epoch": 7.036437246963563, + "grad_norm": 5.4867419263331785, + "learning_rate": 2.4512053685117916e-06, + "loss": 1.4711, + "step": 1738 + }, + { + "epoch": 7.040485829959514, + "grad_norm": 5.856066296731616, + "learning_rate": 2.445128818875166e-06, + "loss": 1.2784, + "step": 1739 + }, + { + "epoch": 7.044534412955466, + "grad_norm": 5.685747261263775, + "learning_rate": 2.4390573718177507e-06, + "loss": 1.4178, + "step": 1740 + }, + { + "epoch": 7.048582995951417, + "grad_norm": 5.580589694434444, + "learning_rate": 2.4329910394654167e-06, + "loss": 1.2819, + "step": 1741 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 6.1734653161832345, + "learning_rate": 2.4269298339338205e-06, + "loss": 1.3334, + "step": 1742 + }, + { + "epoch": 7.05668016194332, + "grad_norm": 5.647156467107709, + "learning_rate": 2.4208737673283818e-06, + "loss": 1.1932, + "step": 1743 + }, + { + "epoch": 7.060728744939271, + "grad_norm": 5.571147412614646, + "learning_rate": 2.414822851744249e-06, + "loss": 1.3354, + "step": 1744 + }, + { + "epoch": 7.064777327935222, + "grad_norm": 6.222421117643815, + "learning_rate": 2.408777099266291e-06, + "loss": 1.2747, + "step": 1745 + }, + { + "epoch": 7.068825910931174, + "grad_norm": 6.251859136759403, + "learning_rate": 2.4027365219690617e-06, + "loss": 1.444, + "step": 1746 + }, + { + "epoch": 7.0728744939271255, + "grad_norm": 5.555376265690771, + "learning_rate": 2.3967011319167804e-06, + "loss": 1.3478, + "step": 1747 + }, + { + "epoch": 7.076923076923077, + "grad_norm": 6.222350987405198, + "learning_rate": 2.3906709411633073e-06, + "loss": 1.3069, + "step": 1748 + }, + { + "epoch": 7.080971659919029, + "grad_norm": 5.290175219718593, + "learning_rate": 2.384645961752113e-06, + "loss": 1.4103, + "step": 1749 + }, + { + "epoch": 7.08502024291498, + "grad_norm": 4.882921637643386, + "learning_rate": 2.378626205716265e-06, + "loss": 1.3698, + "step": 1750 + }, + { + "epoch": 7.089068825910931, + "grad_norm": 5.893035167375215, + "learning_rate": 2.3726116850783987e-06, + "loss": 1.3153, + "step": 1751 + }, + { + "epoch": 7.093117408906883, + "grad_norm": 5.440462022348463, + "learning_rate": 2.3666024118506937e-06, + "loss": 1.3918, + "step": 1752 + }, + { + "epoch": 7.097165991902834, + "grad_norm": 5.298541554798929, + "learning_rate": 2.3605983980348446e-06, + "loss": 1.1493, + "step": 1753 + }, + { + "epoch": 7.101214574898785, + "grad_norm": 5.873912109321258, + "learning_rate": 2.354599655622049e-06, + "loss": 1.3419, + "step": 1754 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 6.515086572176515, + "learning_rate": 2.3486061965929695e-06, + "loss": 1.2658, + "step": 1755 + }, + { + "epoch": 7.109311740890688, + "grad_norm": 5.640239544492155, + "learning_rate": 2.3426180329177217e-06, + "loss": 1.2778, + "step": 1756 + }, + { + "epoch": 7.113360323886639, + "grad_norm": 6.602620889096045, + "learning_rate": 2.3366351765558437e-06, + "loss": 1.2168, + "step": 1757 + }, + { + "epoch": 7.117408906882591, + "grad_norm": 6.23335605433251, + "learning_rate": 2.3306576394562748e-06, + "loss": 1.1279, + "step": 1758 + }, + { + "epoch": 7.1214574898785425, + "grad_norm": 5.812741962332591, + "learning_rate": 2.3246854335573303e-06, + "loss": 1.2, + "step": 1759 + }, + { + "epoch": 7.125506072874494, + "grad_norm": 5.7653076766991465, + "learning_rate": 2.318718570786675e-06, + "loss": 1.2204, + "step": 1760 + }, + { + "epoch": 7.129554655870446, + "grad_norm": 6.592268657435819, + "learning_rate": 2.3127570630613064e-06, + "loss": 1.0923, + "step": 1761 + }, + { + "epoch": 7.133603238866397, + "grad_norm": 5.105109462079527, + "learning_rate": 2.3068009222875256e-06, + "loss": 1.4491, + "step": 1762 + }, + { + "epoch": 7.137651821862348, + "grad_norm": 6.139171319338175, + "learning_rate": 2.3008501603609147e-06, + "loss": 1.2557, + "step": 1763 + }, + { + "epoch": 7.1417004048583, + "grad_norm": 4.871725004057816, + "learning_rate": 2.294904789166315e-06, + "loss": 1.023, + "step": 1764 + }, + { + "epoch": 7.145748987854251, + "grad_norm": 6.491293356249618, + "learning_rate": 2.288964820577797e-06, + "loss": 1.3439, + "step": 1765 + }, + { + "epoch": 7.149797570850202, + "grad_norm": 5.837952957007555, + "learning_rate": 2.283030266458644e-06, + "loss": 1.182, + "step": 1766 + }, + { + "epoch": 7.153846153846154, + "grad_norm": 5.104308775866129, + "learning_rate": 2.2771011386613268e-06, + "loss": 1.4117, + "step": 1767 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 6.518827958790034, + "learning_rate": 2.2711774490274767e-06, + "loss": 1.4173, + "step": 1768 + }, + { + "epoch": 7.161943319838056, + "grad_norm": 4.94266123667569, + "learning_rate": 2.265259209387867e-06, + "loss": 1.2429, + "step": 1769 + }, + { + "epoch": 7.165991902834008, + "grad_norm": 5.473631523594278, + "learning_rate": 2.259346431562379e-06, + "loss": 1.3316, + "step": 1770 + }, + { + "epoch": 7.17004048582996, + "grad_norm": 5.001369544056481, + "learning_rate": 2.2534391273599937e-06, + "loss": 1.9136, + "step": 1771 + }, + { + "epoch": 7.174089068825911, + "grad_norm": 5.913295650699435, + "learning_rate": 2.2475373085787568e-06, + "loss": 1.1497, + "step": 1772 + }, + { + "epoch": 7.178137651821863, + "grad_norm": 6.952533318275522, + "learning_rate": 2.2416409870057577e-06, + "loss": 1.353, + "step": 1773 + }, + { + "epoch": 7.182186234817814, + "grad_norm": 4.723432595191292, + "learning_rate": 2.2357501744171105e-06, + "loss": 1.1492, + "step": 1774 + }, + { + "epoch": 7.186234817813765, + "grad_norm": 6.058020017509188, + "learning_rate": 2.229864882577921e-06, + "loss": 1.3322, + "step": 1775 + }, + { + "epoch": 7.190283400809717, + "grad_norm": 5.788151410477542, + "learning_rate": 2.2239851232422736e-06, + "loss": 1.3631, + "step": 1776 + }, + { + "epoch": 7.194331983805668, + "grad_norm": 6.262252651618726, + "learning_rate": 2.218110908153202e-06, + "loss": 1.5276, + "step": 1777 + }, + { + "epoch": 7.198380566801619, + "grad_norm": 5.208163192867401, + "learning_rate": 2.2122422490426676e-06, + "loss": 1.5831, + "step": 1778 + }, + { + "epoch": 7.202429149797571, + "grad_norm": 5.390523496529594, + "learning_rate": 2.206379157631532e-06, + "loss": 1.2908, + "step": 1779 + }, + { + "epoch": 7.206477732793522, + "grad_norm": 5.162249120166779, + "learning_rate": 2.200521645629542e-06, + "loss": 1.6171, + "step": 1780 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 5.391588507251084, + "learning_rate": 2.194669724735296e-06, + "loss": 1.6111, + "step": 1781 + }, + { + "epoch": 7.2145748987854255, + "grad_norm": 6.1034967557731665, + "learning_rate": 2.1888234066362303e-06, + "loss": 1.3854, + "step": 1782 + }, + { + "epoch": 7.218623481781377, + "grad_norm": 6.167454760308808, + "learning_rate": 2.18298270300859e-06, + "loss": 1.2693, + "step": 1783 + }, + { + "epoch": 7.222672064777328, + "grad_norm": 5.69770152013801, + "learning_rate": 2.1771476255174056e-06, + "loss": 1.2078, + "step": 1784 + }, + { + "epoch": 7.22672064777328, + "grad_norm": 5.460410860926906, + "learning_rate": 2.1713181858164746e-06, + "loss": 1.413, + "step": 1785 + }, + { + "epoch": 7.230769230769231, + "grad_norm": 5.566118830424516, + "learning_rate": 2.165494395548329e-06, + "loss": 1.1968, + "step": 1786 + }, + { + "epoch": 7.234817813765182, + "grad_norm": 6.43649848295101, + "learning_rate": 2.159676266344222e-06, + "loss": 1.4229, + "step": 1787 + }, + { + "epoch": 7.238866396761134, + "grad_norm": 6.290508191897902, + "learning_rate": 2.1538638098241e-06, + "loss": 1.3623, + "step": 1788 + }, + { + "epoch": 7.242914979757085, + "grad_norm": 5.730502481155649, + "learning_rate": 2.14805703759658e-06, + "loss": 1.396, + "step": 1789 + }, + { + "epoch": 7.246963562753036, + "grad_norm": 5.437978852325137, + "learning_rate": 2.1422559612589266e-06, + "loss": 1.252, + "step": 1790 + }, + { + "epoch": 7.251012145748988, + "grad_norm": 5.7552412936402435, + "learning_rate": 2.136460592397025e-06, + "loss": 1.344, + "step": 1791 + }, + { + "epoch": 7.255060728744939, + "grad_norm": 5.804592913810575, + "learning_rate": 2.1306709425853663e-06, + "loss": 1.291, + "step": 1792 + }, + { + "epoch": 7.2591093117408905, + "grad_norm": 5.304611515686778, + "learning_rate": 2.124887023387017e-06, + "loss": 1.25, + "step": 1793 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 5.579310956319717, + "learning_rate": 2.1191088463535997e-06, + "loss": 1.0352, + "step": 1794 + }, + { + "epoch": 7.267206477732794, + "grad_norm": 5.280713442914896, + "learning_rate": 2.113336423025269e-06, + "loss": 1.3293, + "step": 1795 + }, + { + "epoch": 7.271255060728745, + "grad_norm": 5.695843923044428, + "learning_rate": 2.1075697649306838e-06, + "loss": 1.3279, + "step": 1796 + }, + { + "epoch": 7.275303643724697, + "grad_norm": 5.537225853611836, + "learning_rate": 2.1018088835869943e-06, + "loss": 1.4052, + "step": 1797 + }, + { + "epoch": 7.279352226720648, + "grad_norm": 7.310804417037736, + "learning_rate": 2.0960537904998113e-06, + "loss": 1.3052, + "step": 1798 + }, + { + "epoch": 7.283400809716599, + "grad_norm": 6.5207473345683455, + "learning_rate": 2.0903044971631854e-06, + "loss": 0.9953, + "step": 1799 + }, + { + "epoch": 7.287449392712551, + "grad_norm": 6.891390925467454, + "learning_rate": 2.084561015059585e-06, + "loss": 1.1524, + "step": 1800 + }, + { + "epoch": 7.291497975708502, + "grad_norm": 6.511458265596788, + "learning_rate": 2.0788233556598688e-06, + "loss": 1.019, + "step": 1801 + }, + { + "epoch": 7.295546558704453, + "grad_norm": 6.525945460785431, + "learning_rate": 2.0730915304232692e-06, + "loss": 1.2347, + "step": 1802 + }, + { + "epoch": 7.299595141700405, + "grad_norm": 5.806148576127675, + "learning_rate": 2.067365550797367e-06, + "loss": 1.4674, + "step": 1803 + }, + { + "epoch": 7.303643724696356, + "grad_norm": 6.6525694728213685, + "learning_rate": 2.061645428218067e-06, + "loss": 1.0762, + "step": 1804 + }, + { + "epoch": 7.3076923076923075, + "grad_norm": 6.212203279710177, + "learning_rate": 2.055931174109579e-06, + "loss": 1.1289, + "step": 1805 + }, + { + "epoch": 7.3117408906882595, + "grad_norm": 5.666269345071883, + "learning_rate": 2.050222799884387e-06, + "loss": 1.1799, + "step": 1806 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 7.0629439288873, + "learning_rate": 2.044520316943235e-06, + "loss": 1.0631, + "step": 1807 + }, + { + "epoch": 7.319838056680162, + "grad_norm": 6.059126520843265, + "learning_rate": 2.0388237366751005e-06, + "loss": 1.03, + "step": 1808 + }, + { + "epoch": 7.323886639676114, + "grad_norm": 6.3174918869462635, + "learning_rate": 2.0331330704571746e-06, + "loss": 1.0775, + "step": 1809 + }, + { + "epoch": 7.327935222672065, + "grad_norm": 6.098595972628923, + "learning_rate": 2.027448329654832e-06, + "loss": 1.0956, + "step": 1810 + }, + { + "epoch": 7.331983805668016, + "grad_norm": 6.07010789176819, + "learning_rate": 2.02176952562162e-06, + "loss": 1.132, + "step": 1811 + }, + { + "epoch": 7.336032388663968, + "grad_norm": 5.673793373139681, + "learning_rate": 2.0160966696992195e-06, + "loss": 1.235, + "step": 1812 + }, + { + "epoch": 7.340080971659919, + "grad_norm": 5.42325757234182, + "learning_rate": 2.0104297732174403e-06, + "loss": 1.1607, + "step": 1813 + }, + { + "epoch": 7.34412955465587, + "grad_norm": 5.845384796389491, + "learning_rate": 2.004768847494186e-06, + "loss": 1.069, + "step": 1814 + }, + { + "epoch": 7.348178137651822, + "grad_norm": 6.716611305618001, + "learning_rate": 1.999113903835438e-06, + "loss": 1.2088, + "step": 1815 + }, + { + "epoch": 7.352226720647773, + "grad_norm": 6.335024142337415, + "learning_rate": 1.9934649535352286e-06, + "loss": 1.215, + "step": 1816 + }, + { + "epoch": 7.3562753036437245, + "grad_norm": 6.074016020941024, + "learning_rate": 1.987822007875617e-06, + "loss": 0.8957, + "step": 1817 + }, + { + "epoch": 7.3603238866396765, + "grad_norm": 6.669356187358129, + "learning_rate": 1.982185078126676e-06, + "loss": 1.2878, + "step": 1818 + }, + { + "epoch": 7.364372469635628, + "grad_norm": 5.5205879930863055, + "learning_rate": 1.9765541755464605e-06, + "loss": 1.3594, + "step": 1819 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 5.791173021479898, + "learning_rate": 1.9709293113809876e-06, + "loss": 1.2518, + "step": 1820 + }, + { + "epoch": 7.372469635627531, + "grad_norm": 7.085668027134953, + "learning_rate": 1.965310496864217e-06, + "loss": 1.3044, + "step": 1821 + }, + { + "epoch": 7.376518218623482, + "grad_norm": 6.30070905341863, + "learning_rate": 1.9596977432180212e-06, + "loss": 1.0096, + "step": 1822 + }, + { + "epoch": 7.380566801619433, + "grad_norm": 6.668544077573982, + "learning_rate": 1.954091061652172e-06, + "loss": 1.1521, + "step": 1823 + }, + { + "epoch": 7.384615384615385, + "grad_norm": 5.685627571377497, + "learning_rate": 1.948490463364313e-06, + "loss": 0.9629, + "step": 1824 + }, + { + "epoch": 7.388663967611336, + "grad_norm": 7.099232364097355, + "learning_rate": 1.942895959539939e-06, + "loss": 1.0332, + "step": 1825 + }, + { + "epoch": 7.392712550607287, + "grad_norm": 6.449023103797025, + "learning_rate": 1.9373075613523728e-06, + "loss": 1.219, + "step": 1826 + }, + { + "epoch": 7.396761133603239, + "grad_norm": 7.603243728006548, + "learning_rate": 1.9317252799627393e-06, + "loss": 1.0144, + "step": 1827 + }, + { + "epoch": 7.40080971659919, + "grad_norm": 5.630823437903324, + "learning_rate": 1.9261491265199526e-06, + "loss": 1.0604, + "step": 1828 + }, + { + "epoch": 7.4048582995951415, + "grad_norm": 5.804060941623419, + "learning_rate": 1.920579112160685e-06, + "loss": 1.0906, + "step": 1829 + }, + { + "epoch": 7.4089068825910935, + "grad_norm": 7.107387654645546, + "learning_rate": 1.915015248009348e-06, + "loss": 1.1866, + "step": 1830 + }, + { + "epoch": 7.412955465587045, + "grad_norm": 6.216151169357513, + "learning_rate": 1.9094575451780727e-06, + "loss": 1.0234, + "step": 1831 + }, + { + "epoch": 7.417004048582996, + "grad_norm": 7.173346243896998, + "learning_rate": 1.903906014766681e-06, + "loss": 1.3152, + "step": 1832 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 7.353654026214847, + "learning_rate": 1.8983606678626665e-06, + "loss": 1.3466, + "step": 1833 + }, + { + "epoch": 7.425101214574899, + "grad_norm": 6.168388032585026, + "learning_rate": 1.8928215155411773e-06, + "loss": 1.3615, + "step": 1834 + }, + { + "epoch": 7.42914979757085, + "grad_norm": 7.177909922740221, + "learning_rate": 1.8872885688649879e-06, + "loss": 1.3325, + "step": 1835 + }, + { + "epoch": 7.433198380566802, + "grad_norm": 5.5067246147195315, + "learning_rate": 1.8817618388844783e-06, + "loss": 1.5126, + "step": 1836 + }, + { + "epoch": 7.437246963562753, + "grad_norm": 6.480398605143195, + "learning_rate": 1.8762413366376159e-06, + "loss": 1.2967, + "step": 1837 + }, + { + "epoch": 7.441295546558704, + "grad_norm": 7.239184730466869, + "learning_rate": 1.8707270731499223e-06, + "loss": 1.2391, + "step": 1838 + }, + { + "epoch": 7.445344129554655, + "grad_norm": 5.881764731806458, + "learning_rate": 1.865219059434467e-06, + "loss": 1.4892, + "step": 1839 + }, + { + "epoch": 7.449392712550607, + "grad_norm": 7.287338664223354, + "learning_rate": 1.8597173064918333e-06, + "loss": 1.2865, + "step": 1840 + }, + { + "epoch": 7.4534412955465585, + "grad_norm": 6.989877908949274, + "learning_rate": 1.854221825310103e-06, + "loss": 1.2753, + "step": 1841 + }, + { + "epoch": 7.4574898785425106, + "grad_norm": 6.967142936381031, + "learning_rate": 1.8487326268648314e-06, + "loss": 1.6209, + "step": 1842 + }, + { + "epoch": 7.461538461538462, + "grad_norm": 9.165493801033026, + "learning_rate": 1.8432497221190227e-06, + "loss": 1.7021, + "step": 1843 + }, + { + "epoch": 7.465587044534413, + "grad_norm": 7.201939055537971, + "learning_rate": 1.8377731220231144e-06, + "loss": 1.4113, + "step": 1844 + }, + { + "epoch": 7.469635627530364, + "grad_norm": 6.447673122675899, + "learning_rate": 1.832302837514952e-06, + "loss": 1.4683, + "step": 1845 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 5.915439909033562, + "learning_rate": 1.8268388795197683e-06, + "loss": 1.4386, + "step": 1846 + }, + { + "epoch": 7.477732793522267, + "grad_norm": 7.791713816072655, + "learning_rate": 1.8213812589501611e-06, + "loss": 1.4409, + "step": 1847 + }, + { + "epoch": 7.481781376518219, + "grad_norm": 5.76907536016399, + "learning_rate": 1.815929986706066e-06, + "loss": 1.357, + "step": 1848 + }, + { + "epoch": 7.48582995951417, + "grad_norm": 6.324576322221301, + "learning_rate": 1.8104850736747458e-06, + "loss": 1.3014, + "step": 1849 + }, + { + "epoch": 7.489878542510121, + "grad_norm": 7.955436278806627, + "learning_rate": 1.8050465307307602e-06, + "loss": 1.2541, + "step": 1850 + }, + { + "epoch": 7.493927125506072, + "grad_norm": 8.3800061367103, + "learning_rate": 1.7996143687359475e-06, + "loss": 1.2069, + "step": 1851 + }, + { + "epoch": 7.497975708502024, + "grad_norm": 5.859852613078974, + "learning_rate": 1.7941885985394025e-06, + "loss": 1.1389, + "step": 1852 + }, + { + "epoch": 7.502024291497976, + "grad_norm": 6.714230939191411, + "learning_rate": 1.78876923097745e-06, + "loss": 0.96, + "step": 1853 + }, + { + "epoch": 7.506072874493928, + "grad_norm": 7.478771265211495, + "learning_rate": 1.783356276873633e-06, + "loss": 1.3238, + "step": 1854 + }, + { + "epoch": 7.510121457489879, + "grad_norm": 6.964602737040841, + "learning_rate": 1.7779497470386826e-06, + "loss": 1.2515, + "step": 1855 + }, + { + "epoch": 7.51417004048583, + "grad_norm": 5.135869484791375, + "learning_rate": 1.7725496522704998e-06, + "loss": 1.2487, + "step": 1856 + }, + { + "epoch": 7.518218623481781, + "grad_norm": 6.736233605627823, + "learning_rate": 1.7671560033541364e-06, + "loss": 1.2647, + "step": 1857 + }, + { + "epoch": 7.522267206477733, + "grad_norm": 7.4340596808517585, + "learning_rate": 1.7617688110617653e-06, + "loss": 1.1983, + "step": 1858 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 7.142575001524021, + "learning_rate": 1.7563880861526656e-06, + "loss": 1.037, + "step": 1859 + }, + { + "epoch": 7.530364372469636, + "grad_norm": 6.461217060280809, + "learning_rate": 1.7510138393732029e-06, + "loss": 1.125, + "step": 1860 + }, + { + "epoch": 7.534412955465587, + "grad_norm": 7.120411669751328, + "learning_rate": 1.7456460814568032e-06, + "loss": 1.1532, + "step": 1861 + }, + { + "epoch": 7.538461538461538, + "grad_norm": 6.677578923600314, + "learning_rate": 1.7402848231239317e-06, + "loss": 1.447, + "step": 1862 + }, + { + "epoch": 7.5425101214574894, + "grad_norm": 5.995680414752151, + "learning_rate": 1.7349300750820758e-06, + "loss": 1.414, + "step": 1863 + }, + { + "epoch": 7.5465587044534415, + "grad_norm": 70.49787838581857, + "learning_rate": 1.7295818480257148e-06, + "loss": 1.9394, + "step": 1864 + }, + { + "epoch": 7.550607287449393, + "grad_norm": 11.227616663799225, + "learning_rate": 1.7242401526363095e-06, + "loss": 1.6974, + "step": 1865 + }, + { + "epoch": 7.554655870445345, + "grad_norm": 15.917128296917474, + "learning_rate": 1.7189049995822748e-06, + "loss": 2.0666, + "step": 1866 + }, + { + "epoch": 7.558704453441296, + "grad_norm": 6.5545578057982254, + "learning_rate": 1.7135763995189574e-06, + "loss": 1.2566, + "step": 1867 + }, + { + "epoch": 7.562753036437247, + "grad_norm": 5.608919892200609, + "learning_rate": 1.70825436308862e-06, + "loss": 1.1258, + "step": 1868 + }, + { + "epoch": 7.566801619433198, + "grad_norm": 5.78898827199352, + "learning_rate": 1.70293890092041e-06, + "loss": 1.511, + "step": 1869 + }, + { + "epoch": 7.57085020242915, + "grad_norm": 6.1957471468572605, + "learning_rate": 1.6976300236303505e-06, + "loss": 1.1713, + "step": 1870 + }, + { + "epoch": 7.574898785425101, + "grad_norm": 5.919353556112893, + "learning_rate": 1.692327741821312e-06, + "loss": 1.3418, + "step": 1871 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 4.818508692645506, + "learning_rate": 1.6870320660829908e-06, + "loss": 1.1787, + "step": 1872 + }, + { + "epoch": 7.582995951417004, + "grad_norm": 6.074378707133634, + "learning_rate": 1.6817430069918939e-06, + "loss": 1.2772, + "step": 1873 + }, + { + "epoch": 7.587044534412955, + "grad_norm": 6.043486629250494, + "learning_rate": 1.676460575111306e-06, + "loss": 1.2858, + "step": 1874 + }, + { + "epoch": 7.5910931174089065, + "grad_norm": 6.824574202718084, + "learning_rate": 1.671184780991283e-06, + "loss": 1.2792, + "step": 1875 + }, + { + "epoch": 7.5951417004048585, + "grad_norm": 6.003146333113679, + "learning_rate": 1.6659156351686202e-06, + "loss": 0.9987, + "step": 1876 + }, + { + "epoch": 7.59919028340081, + "grad_norm": 5.257435712843031, + "learning_rate": 1.6606531481668364e-06, + "loss": 1.1001, + "step": 1877 + }, + { + "epoch": 7.603238866396762, + "grad_norm": 5.19698994619142, + "learning_rate": 1.6553973304961528e-06, + "loss": 1.1799, + "step": 1878 + }, + { + "epoch": 7.607287449392713, + "grad_norm": 5.841701091792967, + "learning_rate": 1.6501481926534658e-06, + "loss": 0.9594, + "step": 1879 + }, + { + "epoch": 7.611336032388664, + "grad_norm": 6.19240531240544, + "learning_rate": 1.6449057451223354e-06, + "loss": 1.2521, + "step": 1880 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 5.549994801931837, + "learning_rate": 1.639669998372958e-06, + "loss": 1.2949, + "step": 1881 + }, + { + "epoch": 7.619433198380567, + "grad_norm": 6.675501333896787, + "learning_rate": 1.6344409628621482e-06, + "loss": 1.0393, + "step": 1882 + }, + { + "epoch": 7.623481781376518, + "grad_norm": 6.8185578077235025, + "learning_rate": 1.6292186490333172e-06, + "loss": 1.3907, + "step": 1883 + }, + { + "epoch": 7.62753036437247, + "grad_norm": 5.788785194808056, + "learning_rate": 1.6240030673164492e-06, + "loss": 1.2266, + "step": 1884 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 6.240532210004539, + "learning_rate": 1.6187942281280838e-06, + "loss": 1.4968, + "step": 1885 + }, + { + "epoch": 7.635627530364372, + "grad_norm": 5.438972394942183, + "learning_rate": 1.6135921418712959e-06, + "loss": 1.0917, + "step": 1886 + }, + { + "epoch": 7.6396761133603235, + "grad_norm": 6.412673367253676, + "learning_rate": 1.6083968189356724e-06, + "loss": 1.3789, + "step": 1887 + }, + { + "epoch": 7.6437246963562755, + "grad_norm": 5.536347657482411, + "learning_rate": 1.6032082696972945e-06, + "loss": 1.2638, + "step": 1888 + }, + { + "epoch": 7.647773279352227, + "grad_norm": 6.127206089252584, + "learning_rate": 1.5980265045187139e-06, + "loss": 1.3732, + "step": 1889 + }, + { + "epoch": 7.651821862348179, + "grad_norm": 5.193216915475832, + "learning_rate": 1.5928515337489292e-06, + "loss": 1.1536, + "step": 1890 + }, + { + "epoch": 7.65587044534413, + "grad_norm": 6.4405008029321635, + "learning_rate": 1.5876833677233754e-06, + "loss": 1.3585, + "step": 1891 + }, + { + "epoch": 7.659919028340081, + "grad_norm": 6.735596126416384, + "learning_rate": 1.5825220167638945e-06, + "loss": 1.1643, + "step": 1892 + }, + { + "epoch": 7.663967611336032, + "grad_norm": 5.578067115309463, + "learning_rate": 1.5773674911787157e-06, + "loss": 1.3335, + "step": 1893 + }, + { + "epoch": 7.668016194331984, + "grad_norm": 5.847753238206834, + "learning_rate": 1.5722198012624418e-06, + "loss": 1.3156, + "step": 1894 + }, + { + "epoch": 7.672064777327935, + "grad_norm": 6.167981268598202, + "learning_rate": 1.567078957296016e-06, + "loss": 1.4919, + "step": 1895 + }, + { + "epoch": 7.676113360323887, + "grad_norm": 5.209386411212645, + "learning_rate": 1.5619449695467142e-06, + "loss": 1.4698, + "step": 1896 + }, + { + "epoch": 7.680161943319838, + "grad_norm": 6.423491328339259, + "learning_rate": 1.556817848268118e-06, + "loss": 1.3083, + "step": 1897 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 6.099826757015211, + "learning_rate": 1.5516976037000941e-06, + "loss": 1.1861, + "step": 1898 + }, + { + "epoch": 7.6882591093117405, + "grad_norm": 5.753586753644626, + "learning_rate": 1.5465842460687786e-06, + "loss": 1.2721, + "step": 1899 + }, + { + "epoch": 7.6923076923076925, + "grad_norm": 6.272583592648715, + "learning_rate": 1.5414777855865466e-06, + "loss": 1.2911, + "step": 1900 + }, + { + "epoch": 7.696356275303644, + "grad_norm": 5.68165710538138, + "learning_rate": 1.5363782324520033e-06, + "loss": 1.1648, + "step": 1901 + }, + { + "epoch": 7.700404858299595, + "grad_norm": 7.460829794563436, + "learning_rate": 1.5312855968499574e-06, + "loss": 1.6084, + "step": 1902 + }, + { + "epoch": 7.704453441295547, + "grad_norm": 6.5692354666682276, + "learning_rate": 1.5261998889514017e-06, + "loss": 1.4184, + "step": 1903 + }, + { + "epoch": 7.708502024291498, + "grad_norm": 6.3186571601325525, + "learning_rate": 1.5211211189134955e-06, + "loss": 1.0412, + "step": 1904 + }, + { + "epoch": 7.712550607287449, + "grad_norm": 5.682537504028156, + "learning_rate": 1.516049296879535e-06, + "loss": 1.1573, + "step": 1905 + }, + { + "epoch": 7.716599190283401, + "grad_norm": 5.812434487226451, + "learning_rate": 1.510984432978947e-06, + "loss": 1.2783, + "step": 1906 + }, + { + "epoch": 7.720647773279352, + "grad_norm": 7.075156192084278, + "learning_rate": 1.5059265373272574e-06, + "loss": 1.0288, + "step": 1907 + }, + { + "epoch": 7.724696356275303, + "grad_norm": 6.467523066478314, + "learning_rate": 1.5008756200260776e-06, + "loss": 1.2684, + "step": 1908 + }, + { + "epoch": 7.728744939271255, + "grad_norm": 5.838154690826828, + "learning_rate": 1.4958316911630827e-06, + "loss": 1.4278, + "step": 1909 + }, + { + "epoch": 7.732793522267206, + "grad_norm": 5.866932075199195, + "learning_rate": 1.4907947608119866e-06, + "loss": 1.1213, + "step": 1910 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 6.005636196644713, + "learning_rate": 1.4857648390325257e-06, + "loss": 1.2309, + "step": 1911 + }, + { + "epoch": 7.7408906882591095, + "grad_norm": 5.736349178634425, + "learning_rate": 1.4807419358704433e-06, + "loss": 1.8603, + "step": 1912 + }, + { + "epoch": 7.744939271255061, + "grad_norm": 5.608575893991077, + "learning_rate": 1.475726061357463e-06, + "loss": 1.4053, + "step": 1913 + }, + { + "epoch": 7.748987854251012, + "grad_norm": 6.949290018272913, + "learning_rate": 1.47071722551127e-06, + "loss": 1.2025, + "step": 1914 + }, + { + "epoch": 7.753036437246964, + "grad_norm": 6.470859543707123, + "learning_rate": 1.4657154383354948e-06, + "loss": 1.1287, + "step": 1915 + }, + { + "epoch": 7.757085020242915, + "grad_norm": 6.10955142295277, + "learning_rate": 1.4607207098196851e-06, + "loss": 1.2334, + "step": 1916 + }, + { + "epoch": 7.761133603238866, + "grad_norm": 6.5763762413068045, + "learning_rate": 1.4557330499392952e-06, + "loss": 1.9826, + "step": 1917 + }, + { + "epoch": 7.765182186234818, + "grad_norm": 7.723579817578996, + "learning_rate": 1.4507524686556612e-06, + "loss": 1.721, + "step": 1918 + }, + { + "epoch": 7.769230769230769, + "grad_norm": 8.397235796894286, + "learning_rate": 1.4457789759159813e-06, + "loss": 1.6659, + "step": 1919 + }, + { + "epoch": 7.77327935222672, + "grad_norm": 5.642365455166119, + "learning_rate": 1.4408125816532981e-06, + "loss": 1.1808, + "step": 1920 + }, + { + "epoch": 7.777327935222672, + "grad_norm": 5.725043241965928, + "learning_rate": 1.435853295786473e-06, + "loss": 1.4747, + "step": 1921 + }, + { + "epoch": 7.781376518218623, + "grad_norm": 5.394430714546486, + "learning_rate": 1.430901128220174e-06, + "loss": 1.4528, + "step": 1922 + }, + { + "epoch": 7.7854251012145745, + "grad_norm": 5.930712388463373, + "learning_rate": 1.4259560888448526e-06, + "loss": 1.2558, + "step": 1923 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 5.519869867138563, + "learning_rate": 1.4210181875367229e-06, + "loss": 1.1873, + "step": 1924 + }, + { + "epoch": 7.793522267206478, + "grad_norm": 6.265126307081154, + "learning_rate": 1.4160874341577447e-06, + "loss": 1.1916, + "step": 1925 + }, + { + "epoch": 7.797570850202429, + "grad_norm": 6.13894194733797, + "learning_rate": 1.4111638385555965e-06, + "loss": 1.2401, + "step": 1926 + }, + { + "epoch": 7.801619433198381, + "grad_norm": 5.721727948891365, + "learning_rate": 1.406247410563667e-06, + "loss": 1.1375, + "step": 1927 + }, + { + "epoch": 7.805668016194332, + "grad_norm": 5.409329610323807, + "learning_rate": 1.4013381600010278e-06, + "loss": 1.0394, + "step": 1928 + }, + { + "epoch": 7.809716599190283, + "grad_norm": 5.946216975378077, + "learning_rate": 1.396436096672416e-06, + "loss": 1.3717, + "step": 1929 + }, + { + "epoch": 7.813765182186235, + "grad_norm": 7.501336587253134, + "learning_rate": 1.3915412303682162e-06, + "loss": 1.1632, + "step": 1930 + }, + { + "epoch": 7.817813765182186, + "grad_norm": 6.192994323170135, + "learning_rate": 1.3866535708644335e-06, + "loss": 1.095, + "step": 1931 + }, + { + "epoch": 7.821862348178137, + "grad_norm": 14.576419437798382, + "learning_rate": 1.3817731279226843e-06, + "loss": 2.1725, + "step": 1932 + }, + { + "epoch": 7.825910931174089, + "grad_norm": 25.425127776950244, + "learning_rate": 1.376899911290172e-06, + "loss": 3.1191, + "step": 1933 + }, + { + "epoch": 7.82995951417004, + "grad_norm": 6.5130908283906574, + "learning_rate": 1.3720339306996666e-06, + "loss": 1.1065, + "step": 1934 + }, + { + "epoch": 7.834008097165992, + "grad_norm": 6.8625067545378755, + "learning_rate": 1.367175195869488e-06, + "loss": 1.076, + "step": 1935 + }, + { + "epoch": 7.838056680161944, + "grad_norm": 5.862839226770468, + "learning_rate": 1.3623237165034807e-06, + "loss": 1.0877, + "step": 1936 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 5.587464620521552, + "learning_rate": 1.3574795022910014e-06, + "loss": 1.181, + "step": 1937 + }, + { + "epoch": 7.846153846153846, + "grad_norm": 5.741544735607096, + "learning_rate": 1.3526425629068968e-06, + "loss": 0.9695, + "step": 1938 + }, + { + "epoch": 7.850202429149798, + "grad_norm": 7.078793165923023, + "learning_rate": 1.347812908011485e-06, + "loss": 1.1728, + "step": 1939 + }, + { + "epoch": 7.854251012145749, + "grad_norm": 7.029454395604512, + "learning_rate": 1.3429905472505344e-06, + "loss": 1.2049, + "step": 1940 + }, + { + "epoch": 7.8582995951417, + "grad_norm": 4.858460051035453, + "learning_rate": 1.3381754902552474e-06, + "loss": 1.1544, + "step": 1941 + }, + { + "epoch": 7.862348178137652, + "grad_norm": 6.543690353473279, + "learning_rate": 1.3333677466422357e-06, + "loss": 1.1535, + "step": 1942 + }, + { + "epoch": 7.866396761133603, + "grad_norm": 6.2618770897927165, + "learning_rate": 1.3285673260135073e-06, + "loss": 1.1238, + "step": 1943 + }, + { + "epoch": 7.870445344129554, + "grad_norm": 7.787458993836756, + "learning_rate": 1.323774237956445e-06, + "loss": 1.5443, + "step": 1944 + }, + { + "epoch": 7.874493927125506, + "grad_norm": 6.60339760790844, + "learning_rate": 1.3189884920437867e-06, + "loss": 1.4939, + "step": 1945 + }, + { + "epoch": 7.8785425101214575, + "grad_norm": 6.952377816462855, + "learning_rate": 1.314210097833607e-06, + "loss": 1.2695, + "step": 1946 + }, + { + "epoch": 7.882591093117409, + "grad_norm": 6.440482664289205, + "learning_rate": 1.309439064869295e-06, + "loss": 1.2076, + "step": 1947 + }, + { + "epoch": 7.886639676113361, + "grad_norm": 5.96904543777947, + "learning_rate": 1.3046754026795406e-06, + "loss": 0.8564, + "step": 1948 + }, + { + "epoch": 7.890688259109312, + "grad_norm": 5.611903455141828, + "learning_rate": 1.2999191207783129e-06, + "loss": 1.3827, + "step": 1949 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 5.50366242354655, + "learning_rate": 1.2951702286648399e-06, + "loss": 1.3867, + "step": 1950 + }, + { + "epoch": 7.898785425101215, + "grad_norm": 4.771234777762805, + "learning_rate": 1.290428735823593e-06, + "loss": 1.1739, + "step": 1951 + }, + { + "epoch": 7.902834008097166, + "grad_norm": 5.7833279202719075, + "learning_rate": 1.2856946517242608e-06, + "loss": 1.1495, + "step": 1952 + }, + { + "epoch": 7.906882591093117, + "grad_norm": 6.107712126684077, + "learning_rate": 1.28096798582174e-06, + "loss": 1.1842, + "step": 1953 + }, + { + "epoch": 7.910931174089069, + "grad_norm": 5.059953747053966, + "learning_rate": 1.2762487475561109e-06, + "loss": 0.9544, + "step": 1954 + }, + { + "epoch": 7.91497975708502, + "grad_norm": 5.819489630730656, + "learning_rate": 1.2715369463526173e-06, + "loss": 1.0285, + "step": 1955 + }, + { + "epoch": 7.919028340080971, + "grad_norm": 6.14238425845007, + "learning_rate": 1.2668325916216534e-06, + "loss": 1.0359, + "step": 1956 + }, + { + "epoch": 7.923076923076923, + "grad_norm": 4.708687979766823, + "learning_rate": 1.2621356927587353e-06, + "loss": 1.3581, + "step": 1957 + }, + { + "epoch": 7.9271255060728745, + "grad_norm": 6.6570477016899074, + "learning_rate": 1.257446259144494e-06, + "loss": 1.2012, + "step": 1958 + }, + { + "epoch": 7.931174089068826, + "grad_norm": 6.636474405464404, + "learning_rate": 1.2527643001446493e-06, + "loss": 1.181, + "step": 1959 + }, + { + "epoch": 7.935222672064778, + "grad_norm": 6.89647738144804, + "learning_rate": 1.248089825109991e-06, + "loss": 0.9855, + "step": 1960 + }, + { + "epoch": 7.939271255060729, + "grad_norm": 6.54652294560363, + "learning_rate": 1.2434228433763657e-06, + "loss": 1.0055, + "step": 1961 + }, + { + "epoch": 7.94331983805668, + "grad_norm": 7.466794850354919, + "learning_rate": 1.2387633642646501e-06, + "loss": 1.2977, + "step": 1962 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 5.859347969468438, + "learning_rate": 1.2341113970807368e-06, + "loss": 1.0272, + "step": 1963 + }, + { + "epoch": 7.951417004048583, + "grad_norm": 7.526875704374519, + "learning_rate": 1.2294669511155193e-06, + "loss": 0.939, + "step": 1964 + }, + { + "epoch": 7.955465587044534, + "grad_norm": 7.225249295703587, + "learning_rate": 1.224830035644868e-06, + "loss": 1.2616, + "step": 1965 + }, + { + "epoch": 7.959514170040486, + "grad_norm": 6.683599476135708, + "learning_rate": 1.2202006599296122e-06, + "loss": 1.3384, + "step": 1966 + }, + { + "epoch": 7.963562753036437, + "grad_norm": 6.087314726468543, + "learning_rate": 1.215578833215526e-06, + "loss": 1.2777, + "step": 1967 + }, + { + "epoch": 7.967611336032388, + "grad_norm": 7.6203305950770766, + "learning_rate": 1.2109645647333018e-06, + "loss": 1.2766, + "step": 1968 + }, + { + "epoch": 7.97165991902834, + "grad_norm": 7.4075603041461155, + "learning_rate": 1.2063578636985402e-06, + "loss": 1.2, + "step": 1969 + }, + { + "epoch": 7.9757085020242915, + "grad_norm": 5.356896060806783, + "learning_rate": 1.201758739311728e-06, + "loss": 1.2542, + "step": 1970 + }, + { + "epoch": 7.979757085020243, + "grad_norm": 6.6184401008685, + "learning_rate": 1.1971672007582192e-06, + "loss": 1.3138, + "step": 1971 + }, + { + "epoch": 7.983805668016195, + "grad_norm": 5.952389025814739, + "learning_rate": 1.1925832572082184e-06, + "loss": 1.3645, + "step": 1972 + }, + { + "epoch": 7.987854251012146, + "grad_norm": 5.869009321326924, + "learning_rate": 1.1880069178167586e-06, + "loss": 1.1615, + "step": 1973 + }, + { + "epoch": 7.991902834008097, + "grad_norm": 5.240716232576427, + "learning_rate": 1.1834381917236881e-06, + "loss": 1.1793, + "step": 1974 + }, + { + "epoch": 7.995951417004049, + "grad_norm": 6.017014067933477, + "learning_rate": 1.178877088053651e-06, + "loss": 1.5002, + "step": 1975 + }, + { + "epoch": 8.0, + "grad_norm": 5.843845057775898, + "learning_rate": 1.1743236159160654e-06, + "loss": 1.2012, + "step": 1976 + } + ], + "logging_steps": 1, + "max_steps": 2470, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1976, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 600758819225600.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}