{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9975510204081632,
  "eval_steps": 500,
  "global_step": 459,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0326530612244898,
      "grad_norm": 0.3159657120704651,
      "learning_rate": 4.9985361990992455e-05,
      "loss": 0.1654,
      "step": 5
    },
    {
      "epoch": 0.0653061224489796,
      "grad_norm": 0.32706642150878906,
      "learning_rate": 4.9941465105674435e-05,
      "loss": 0.1369,
      "step": 10
    },
    {
      "epoch": 0.09795918367346938,
      "grad_norm": 0.33407843112945557,
      "learning_rate": 4.986836074908616e-05,
      "loss": 0.1259,
      "step": 15
    },
    {
      "epoch": 0.1306122448979592,
      "grad_norm": 0.3189881443977356,
      "learning_rate": 4.976613452940604e-05,
      "loss": 0.1041,
      "step": 20
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.3424989581108093,
      "learning_rate": 4.9634906157700036e-05,
      "loss": 0.1004,
      "step": 25
    },
    {
      "epoch": 0.19591836734693877,
      "grad_norm": 0.3253389298915863,
      "learning_rate": 4.9474829307735115e-05,
      "loss": 0.0941,
      "step": 30
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.26078635454177856,
      "learning_rate": 4.9286091436021015e-05,
      "loss": 0.0867,
      "step": 35
    },
    {
      "epoch": 0.2612244897959184,
      "grad_norm": 0.252139687538147,
      "learning_rate": 4.906891356229103e-05,
      "loss": 0.0853,
      "step": 40
    },
    {
      "epoch": 0.2938775510204082,
      "grad_norm": 0.3403972387313843,
      "learning_rate": 4.882355001067892e-05,
      "loss": 0.0863,
      "step": 45
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.4710679352283478,
      "learning_rate": 4.855028811189496e-05,
      "loss": 0.0874,
      "step": 50
    },
    {
      "epoch": 0.35918367346938773,
      "grad_norm": 0.3147217929363251,
      "learning_rate": 4.8249447866750025e-05,
      "loss": 0.0733,
      "step": 55
    },
    {
      "epoch": 0.39183673469387753,
      "grad_norm": 0.3265310823917389,
      "learning_rate": 4.792138157142158e-05,
      "loss": 0.0719,
      "step": 60
    },
    {
      "epoch": 0.42448979591836733,
      "grad_norm": 0.35432252287864685,
      "learning_rate": 4.75664734049005e-05,
      "loss": 0.0824,
      "step": 65
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.3701626658439636,
      "learning_rate": 4.7185138979101864e-05,
      "loss": 0.0731,
      "step": 70
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 0.35868266224861145,
      "learning_rate": 4.677782485216644e-05,
      "loss": 0.0725,
      "step": 75
    },
    {
      "epoch": 0.5224489795918368,
      "grad_norm": 0.32440632581710815,
      "learning_rate": 4.6345008005522966e-05,
      "loss": 0.0694,
      "step": 80
    },
    {
      "epoch": 0.5551020408163265,
      "grad_norm": 0.3003002405166626,
      "learning_rate": 4.588719528532342e-05,
      "loss": 0.072,
      "step": 85
    },
    {
      "epoch": 0.5877551020408164,
      "grad_norm": 0.34989920258522034,
      "learning_rate": 4.540492280890555e-05,
      "loss": 0.0646,
      "step": 90
    },
    {
      "epoch": 0.6204081632653061,
      "grad_norm": 0.473254919052124,
      "learning_rate": 4.4898755336977673e-05,
      "loss": 0.0732,
      "step": 95
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 0.30768489837646484,
      "learning_rate": 4.436928561226087e-05,
      "loss": 0.068,
      "step": 100
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.31777673959732056,
      "learning_rate": 4.381713366536311e-05,
      "loss": 0.0749,
      "step": 105
    },
    {
      "epoch": 0.7183673469387755,
      "grad_norm": 0.40202295780181885,
      "learning_rate": 4.324294608869817e-05,
      "loss": 0.0652,
      "step": 110
    },
    {
      "epoch": 0.7510204081632653,
      "grad_norm": 0.4353463053703308,
      "learning_rate": 4.264739527929959e-05,
      "loss": 0.0562,
      "step": 115
    },
    {
      "epoch": 0.7836734693877551,
      "grad_norm": 0.4079282879829407,
      "learning_rate": 4.203117865141635e-05,
      "loss": 0.0602,
      "step": 120
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.40490931272506714,
      "learning_rate": 4.1395017819812445e-05,
      "loss": 0.0639,
      "step": 125
    },
    {
      "epoch": 0.8489795918367347,
      "grad_norm": 0.423981636762619,
      "learning_rate": 4.07396577547265e-05,
      "loss": 0.0651,
      "step": 130
    },
    {
      "epoch": 0.8816326530612245,
      "grad_norm": 0.3315489590167999,
      "learning_rate": 4.0065865909481417e-05,
      "loss": 0.0676,
      "step": 135
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.560667097568512,
      "learning_rate": 3.937443132176517e-05,
      "loss": 0.0669,
      "step": 140
    },
    {
      "epoch": 0.9469387755102041,
      "grad_norm": 0.4202517569065094,
      "learning_rate": 3.8666163689635616e-05,
      "loss": 0.0631,
      "step": 145
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.4563729465007782,
      "learning_rate": 3.794189242333106e-05,
      "loss": 0.0649,
      "step": 150
    },
    {
      "epoch": 1.0122448979591836,
      "grad_norm": 0.3522864580154419,
      "learning_rate": 3.720246567399712e-05,
      "loss": 0.059,
      "step": 155
    },
    {
      "epoch": 1.0448979591836736,
      "grad_norm": 0.38459908962249756,
      "learning_rate": 3.644874934046716e-05,
      "loss": 0.062,
      "step": 160
    },
    {
      "epoch": 1.0775510204081633,
      "grad_norm": 0.37406954169273376,
      "learning_rate": 3.568162605525953e-05,
      "loss": 0.0565,
      "step": 165
    },
    {
      "epoch": 1.110204081632653,
      "grad_norm": 0.3395706117153168,
      "learning_rate": 3.490199415097892e-05,
      "loss": 0.0575,
      "step": 170
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.31273558735847473,
      "learning_rate": 3.4110766608332347e-05,
      "loss": 0.0589,
      "step": 175
    },
    {
      "epoch": 1.1755102040816325,
      "grad_norm": 0.44675061106681824,
      "learning_rate": 3.330886998699149e-05,
      "loss": 0.0611,
      "step": 180
    },
    {
      "epoch": 1.2081632653061225,
      "grad_norm": 0.34769827127456665,
      "learning_rate": 3.249724334055367e-05,
      "loss": 0.062,
      "step": 185
    },
    {
      "epoch": 1.2408163265306122,
      "grad_norm": 0.31832146644592285,
      "learning_rate": 3.167683711687179e-05,
      "loss": 0.0616,
      "step": 190
    },
    {
      "epoch": 1.273469387755102,
      "grad_norm": 0.4153653085231781,
      "learning_rate": 3.084861204504122e-05,
      "loss": 0.0586,
      "step": 195
    },
    {
      "epoch": 1.306122448979592,
      "grad_norm": 0.36249643564224243,
      "learning_rate": 3.001353801034688e-05,
      "loss": 0.0578,
      "step": 200
    },
    {
      "epoch": 1.3387755102040817,
      "grad_norm": 0.43447113037109375,
      "learning_rate": 2.917259291848814e-05,
      "loss": 0.0547,
      "step": 205
    },
    {
      "epoch": 1.3714285714285714,
      "grad_norm": 0.3350294530391693,
      "learning_rate": 2.8326761550411345e-05,
      "loss": 0.0581,
      "step": 210
    },
    {
      "epoch": 1.4040816326530612,
      "grad_norm": 0.34255513548851013,
      "learning_rate": 2.747703440909128e-05,
      "loss": 0.0535,
      "step": 215
    },
    {
      "epoch": 1.436734693877551,
      "grad_norm": 0.6323994398117065,
      "learning_rate": 2.662440655961183e-05,
      "loss": 0.0666,
      "step": 220
    },
    {
      "epoch": 1.469387755102041,
      "grad_norm": 0.3421262502670288,
      "learning_rate": 2.5769876463904265e-05,
      "loss": 0.053,
      "step": 225
    },
    {
      "epoch": 1.5020408163265306,
      "grad_norm": 0.4133753478527069,
      "learning_rate": 2.491444481150763e-05,
      "loss": 0.0602,
      "step": 230
    },
    {
      "epoch": 1.5346938775510204,
      "grad_norm": 0.37677302956581116,
      "learning_rate": 2.4059113347720574e-05,
      "loss": 0.0573,
      "step": 235
    },
    {
      "epoch": 1.5673469387755103,
      "grad_norm": 0.3683429956436157,
      "learning_rate": 2.3204883700516812e-05,
      "loss": 0.0544,
      "step": 240
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.34340280294418335,
      "learning_rate": 2.235275620759797e-05,
      "loss": 0.0552,
      "step": 245
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 0.39541885256767273,
      "learning_rate": 2.150372874495739e-05,
      "loss": 0.0543,
      "step": 250
    },
    {
      "epoch": 1.6653061224489796,
      "grad_norm": 0.384330689907074,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.0481,
      "step": 255
    },
    {
      "epoch": 1.6979591836734693,
      "grad_norm": 0.4634612798690796,
      "learning_rate": 1.9818946098873766e-05,
      "loss": 0.052,
      "step": 260
    },
    {
      "epoch": 1.730612244897959,
      "grad_norm": 0.4186592400074005,
      "learning_rate": 1.8985163864514645e-05,
      "loss": 0.0516,
      "step": 265
    },
    {
      "epoch": 1.763265306122449,
      "grad_norm": 0.4726428985595703,
      "learning_rate": 1.815842524819793e-05,
      "loss": 0.0559,
      "step": 270
    },
    {
      "epoch": 1.7959183673469388,
      "grad_norm": 0.38660183548927307,
      "learning_rate": 1.733969839450863e-05,
      "loss": 0.0554,
      "step": 275
    },
    {
      "epoch": 1.8285714285714287,
      "grad_norm": 0.40636590123176575,
      "learning_rate": 1.6529942065931477e-05,
      "loss": 0.0576,
      "step": 280
    },
    {
      "epoch": 1.8612244897959185,
      "grad_norm": 0.3667762875556946,
      "learning_rate": 1.5730104520100982e-05,
      "loss": 0.0546,
      "step": 285
    },
    {
      "epoch": 1.8938775510204082,
      "grad_norm": 0.38867512345314026,
      "learning_rate": 1.4941122399353185e-05,
      "loss": 0.0539,
      "step": 290
    },
    {
      "epoch": 1.926530612244898,
      "grad_norm": 0.37944862246513367,
      "learning_rate": 1.4163919633879324e-05,
      "loss": 0.0537,
      "step": 295
    },
    {
      "epoch": 1.9591836734693877,
      "grad_norm": 0.394191175699234,
      "learning_rate": 1.339940635976592e-05,
      "loss": 0.0493,
      "step": 300
    },
    {
      "epoch": 1.9918367346938775,
      "grad_norm": 0.3522741496562958,
      "learning_rate": 1.2648477853188395e-05,
      "loss": 0.0514,
      "step": 305
    },
    {
      "epoch": 2.024489795918367,
      "grad_norm": 0.3195081949234009,
      "learning_rate": 1.1912013482006243e-05,
      "loss": 0.0526,
      "step": 310
    },
    {
      "epoch": 2.057142857142857,
      "grad_norm": 0.34689584374427795,
      "learning_rate": 1.1190875675987356e-05,
      "loss": 0.0521,
      "step": 315
    },
    {
      "epoch": 2.089795918367347,
      "grad_norm": 0.382920503616333,
      "learning_rate": 1.0485908916867774e-05,
      "loss": 0.0506,
      "step": 320
    },
    {
      "epoch": 2.122448979591837,
      "grad_norm": 0.42133641242980957,
      "learning_rate": 9.797938749429087e-06,
      "loss": 0.0549,
      "step": 325
    },
    {
      "epoch": 2.1551020408163266,
      "grad_norm": 0.3694668114185333,
      "learning_rate": 9.127770814751933e-06,
      "loss": 0.0522,
      "step": 330
    },
    {
      "epoch": 2.1877551020408164,
      "grad_norm": 0.4703994393348694,
      "learning_rate": 8.476189906777458e-06,
      "loss": 0.0499,
      "step": 335
    },
    {
      "epoch": 2.220408163265306,
      "grad_norm": 0.343485563993454,
      "learning_rate": 7.843959053281663e-06,
      "loss": 0.0556,
      "step": 340
    },
    {
      "epoch": 2.253061224489796,
      "grad_norm": 0.45534154772758484,
      "learning_rate": 7.231818622338823e-06,
      "loss": 0.0514,
      "step": 345
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.35300225019454956,
      "learning_rate": 6.6404854553202865e-06,
      "loss": 0.0539,
      "step": 350
    },
    {
      "epoch": 2.3183673469387753,
      "grad_norm": 0.38405096530914307,
      "learning_rate": 6.070652027444102e-06,
      "loss": 0.0506,
      "step": 355
    },
    {
      "epoch": 2.351020408163265,
      "grad_norm": 0.40313202142715454,
      "learning_rate": 5.522985636858239e-06,
      "loss": 0.0494,
      "step": 360
    },
    {
      "epoch": 2.3836734693877553,
      "grad_norm": 0.4369838833808899,
      "learning_rate": 4.998127623207405e-06,
      "loss": 0.0502,
      "step": 365
    },
    {
      "epoch": 2.416326530612245,
      "grad_norm": 0.3443393111228943,
      "learning_rate": 4.496692616598111e-06,
      "loss": 0.0503,
      "step": 370
    },
    {
      "epoch": 2.4489795918367347,
      "grad_norm": 0.43407443165779114,
      "learning_rate": 4.019267817841835e-06,
      "loss": 0.0471,
      "step": 375
    },
    {
      "epoch": 2.4816326530612245,
      "grad_norm": 0.39149248600006104,
      "learning_rate": 3.566412310818945e-06,
      "loss": 0.051,
      "step": 380
    },
    {
      "epoch": 2.5142857142857142,
      "grad_norm": 0.3471136689186096,
      "learning_rate": 3.1386564077687113e-06,
      "loss": 0.0468,
      "step": 385
    },
    {
      "epoch": 2.546938775510204,
      "grad_norm": 0.424050509929657,
      "learning_rate": 2.7365010282720952e-06,
      "loss": 0.048,
      "step": 390
    },
    {
      "epoch": 2.5795918367346937,
      "grad_norm": 0.4177871346473694,
      "learning_rate": 2.360417112654481e-06,
      "loss": 0.0508,
      "step": 395
    },
    {
      "epoch": 2.612244897959184,
      "grad_norm": 0.471732497215271,
      "learning_rate": 2.0108450704954348e-06,
      "loss": 0.0495,
      "step": 400
    },
    {
      "epoch": 2.644897959183673,
      "grad_norm": 0.41895270347595215,
      "learning_rate": 1.6881942648911076e-06,
      "loss": 0.0511,
      "step": 405
    },
    {
      "epoch": 2.6775510204081634,
      "grad_norm": 0.38744378089904785,
      "learning_rate": 1.392842533073388e-06,
      "loss": 0.0486,
      "step": 410
    },
    {
      "epoch": 2.710204081632653,
      "grad_norm": 0.405487984418869,
      "learning_rate": 1.125135743947145e-06,
      "loss": 0.0552,
      "step": 415
    },
    {
      "epoch": 2.742857142857143,
      "grad_norm": 0.42694640159606934,
      "learning_rate": 8.85387393063622e-07,
      "loss": 0.0523,
      "step": 420
    },
    {
      "epoch": 2.7755102040816326,
      "grad_norm": 0.36827659606933594,
      "learning_rate": 6.738782355044049e-07,
      "loss": 0.0517,
      "step": 425
    },
    {
      "epoch": 2.8081632653061224,
      "grad_norm": 0.38273078203201294,
      "learning_rate": 4.908559571057736e-07,
      "loss": 0.0515,
      "step": 430
    },
    {
      "epoch": 2.840816326530612,
      "grad_norm": 0.37218937277793884,
      "learning_rate": 3.3653488440851255e-07,
      "loss": 0.0528,
      "step": 435
    },
    {
      "epoch": 2.873469387755102,
      "grad_norm": 0.38275858759880066,
      "learning_rate": 2.1109573367279479e-07,
      "loss": 0.0459,
      "step": 440
    },
    {
      "epoch": 2.906122448979592,
      "grad_norm": 0.3631555438041687,
      "learning_rate": 1.1468539925209298e-07,
      "loss": 0.0556,
      "step": 445
    },
    {
      "epoch": 2.938775510204082,
      "grad_norm": 0.3471234440803528,
      "learning_rate": 4.741678157389739e-08,
      "loss": 0.052,
      "step": 450
    },
    {
      "epoch": 2.9714285714285715,
      "grad_norm": 0.40650439262390137,
      "learning_rate": 9.368654928731957e-09,
      "loss": 0.0532,
      "step": 455
    },
    {
      "epoch": 2.9975510204081632,
      "step": 459,
      "total_flos": 2.854627336867676e+17,
      "train_loss": 0.0628165303362221,
      "train_runtime": 9275.9124,
      "train_samples_per_second": 0.792,
      "train_steps_per_second": 0.049
    }
  ],
  "logging_steps": 5,
  "max_steps": 459,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 2.854627336867676e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}