Training in progress, step 29000, checkpoint
Browse files- last-checkpoint/optimizer.pt +1 -1
- last-checkpoint/pytorch_model.bin +1 -1
- last-checkpoint/rng_state_0.pth +1 -1
- last-checkpoint/rng_state_1.pth +1 -1
- last-checkpoint/rng_state_2.pth +1 -1
- last-checkpoint/rng_state_3.pth +1 -1
- last-checkpoint/scheduler.pt +1 -1
- last-checkpoint/trainer_state.json +353 -3
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 189211642
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8b38bfb27a9fc22c4b4574f2214176972bfa73ac8d3142646b820761f674ca95
|
| 3 |
size 189211642
|
last-checkpoint/pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 363608098
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea9176a26395979f62437009f3466faa938a707a4323d4e687b5d57a6d6f8f1c
|
| 3 |
size 363608098
|
last-checkpoint/rng_state_0.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:eb28b82c42a9bb5bf55d64e97db9f27987fa81a7c592e21d815d9cffa7c0a5df
|
| 3 |
size 14960
|
last-checkpoint/rng_state_1.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4e1aa25338743d0285d43b88c2600ca688a8287786133cac01ad1f618ed37739
|
| 3 |
size 14960
|
last-checkpoint/rng_state_2.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:884b90160675995812f0bd6498c004f5dca4bb18e90a7252a45f89ada159e10c
|
| 3 |
size 14960
|
last-checkpoint/rng_state_3.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14960
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:89ce837998bb8f91c98f4e51f479825ed96387b3201a4c2cea96eef1bf292192
|
| 3 |
size 14960
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3cebc651d508c09e942cf4f665fc7edfe43526ad316e05d80134d3ca9d4a1b0b
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -9808,6 +9808,356 @@
|
|
| 9808 |
"learning_rate": 4.9320921131134654e-05,
|
| 9809 |
"loss": 39.6667,
|
| 9810 |
"step": 28000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9811 |
}
|
| 9812 |
],
|
| 9813 |
"logging_steps": 20,
|
|
@@ -9827,7 +10177,7 @@
|
|
| 9827 |
"attributes": {}
|
| 9828 |
}
|
| 9829 |
},
|
| 9830 |
-
"total_flos": 1.
|
| 9831 |
"train_batch_size": 48,
|
| 9832 |
"trial_name": null,
|
| 9833 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.042958126196161614,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 29000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 9808 |
"learning_rate": 4.9320921131134654e-05,
|
| 9809 |
"loss": 39.6667,
|
| 9810 |
"step": 28000
|
| 9811 |
+
},
|
| 9812 |
+
{
|
| 9813 |
+
"epoch": 0.041506437793670636,
|
| 9814 |
+
"grad_norm": 79.0,
|
| 9815 |
+
"learning_rate": 4.93204272376339e-05,
|
| 9816 |
+
"loss": 39.7124,
|
| 9817 |
+
"step": 28020
|
| 9818 |
+
},
|
| 9819 |
+
{
|
| 9820 |
+
"epoch": 0.04153606408759902,
|
| 9821 |
+
"grad_norm": 56.0,
|
| 9822 |
+
"learning_rate": 4.931993334413314e-05,
|
| 9823 |
+
"loss": 39.702,
|
| 9824 |
+
"step": 28040
|
| 9825 |
+
},
|
| 9826 |
+
{
|
| 9827 |
+
"epoch": 0.041565690381527416,
|
| 9828 |
+
"grad_norm": 70.0,
|
| 9829 |
+
"learning_rate": 4.9319439450632384e-05,
|
| 9830 |
+
"loss": 39.6675,
|
| 9831 |
+
"step": 28060
|
| 9832 |
+
},
|
| 9833 |
+
{
|
| 9834 |
+
"epoch": 0.0415953166754558,
|
| 9835 |
+
"grad_norm": 58.25,
|
| 9836 |
+
"learning_rate": 4.931894555713163e-05,
|
| 9837 |
+
"loss": 39.603,
|
| 9838 |
+
"step": 28080
|
| 9839 |
+
},
|
| 9840 |
+
{
|
| 9841 |
+
"epoch": 0.04162494296938419,
|
| 9842 |
+
"grad_norm": 58.0,
|
| 9843 |
+
"learning_rate": 4.931845166363087e-05,
|
| 9844 |
+
"loss": 39.6839,
|
| 9845 |
+
"step": 28100
|
| 9846 |
+
},
|
| 9847 |
+
{
|
| 9848 |
+
"epoch": 0.041654569263312576,
|
| 9849 |
+
"grad_norm": 66.0,
|
| 9850 |
+
"learning_rate": 4.9317957770130115e-05,
|
| 9851 |
+
"loss": 39.7011,
|
| 9852 |
+
"step": 28120
|
| 9853 |
+
},
|
| 9854 |
+
{
|
| 9855 |
+
"epoch": 0.04168419555724096,
|
| 9856 |
+
"grad_norm": 63.25,
|
| 9857 |
+
"learning_rate": 4.931746387662936e-05,
|
| 9858 |
+
"loss": 39.702,
|
| 9859 |
+
"step": 28140
|
| 9860 |
+
},
|
| 9861 |
+
{
|
| 9862 |
+
"epoch": 0.04171382185116935,
|
| 9863 |
+
"grad_norm": 66.0,
|
| 9864 |
+
"learning_rate": 4.93169699831286e-05,
|
| 9865 |
+
"loss": 39.6368,
|
| 9866 |
+
"step": 28160
|
| 9867 |
+
},
|
| 9868 |
+
{
|
| 9869 |
+
"epoch": 0.041743448145097735,
|
| 9870 |
+
"grad_norm": 71.0,
|
| 9871 |
+
"learning_rate": 4.9316476089627845e-05,
|
| 9872 |
+
"loss": 39.6087,
|
| 9873 |
+
"step": 28180
|
| 9874 |
+
},
|
| 9875 |
+
{
|
| 9876 |
+
"epoch": 0.04177307443902612,
|
| 9877 |
+
"grad_norm": 76.5,
|
| 9878 |
+
"learning_rate": 4.931598219612709e-05,
|
| 9879 |
+
"loss": 39.6342,
|
| 9880 |
+
"step": 28200
|
| 9881 |
+
},
|
| 9882 |
+
{
|
| 9883 |
+
"epoch": 0.041802700732954515,
|
| 9884 |
+
"grad_norm": 79.5,
|
| 9885 |
+
"learning_rate": 4.9315488302626325e-05,
|
| 9886 |
+
"loss": 39.5706,
|
| 9887 |
+
"step": 28220
|
| 9888 |
+
},
|
| 9889 |
+
{
|
| 9890 |
+
"epoch": 0.0418323270268829,
|
| 9891 |
+
"grad_norm": 66.5,
|
| 9892 |
+
"learning_rate": 4.9314994409125575e-05,
|
| 9893 |
+
"loss": 39.6543,
|
| 9894 |
+
"step": 28240
|
| 9895 |
+
},
|
| 9896 |
+
{
|
| 9897 |
+
"epoch": 0.04186195332081129,
|
| 9898 |
+
"grad_norm": 58.5,
|
| 9899 |
+
"learning_rate": 4.931450051562482e-05,
|
| 9900 |
+
"loss": 39.5878,
|
| 9901 |
+
"step": 28260
|
| 9902 |
+
},
|
| 9903 |
+
{
|
| 9904 |
+
"epoch": 0.041891579614739674,
|
| 9905 |
+
"grad_norm": 59.0,
|
| 9906 |
+
"learning_rate": 4.931400662212406e-05,
|
| 9907 |
+
"loss": 39.6459,
|
| 9908 |
+
"step": 28280
|
| 9909 |
+
},
|
| 9910 |
+
{
|
| 9911 |
+
"epoch": 0.04192120590866806,
|
| 9912 |
+
"grad_norm": 49.25,
|
| 9913 |
+
"learning_rate": 4.93135127286233e-05,
|
| 9914 |
+
"loss": 39.5152,
|
| 9915 |
+
"step": 28300
|
| 9916 |
+
},
|
| 9917 |
+
{
|
| 9918 |
+
"epoch": 0.04195083220259645,
|
| 9919 |
+
"grad_norm": 64.0,
|
| 9920 |
+
"learning_rate": 4.931301883512255e-05,
|
| 9921 |
+
"loss": 39.5766,
|
| 9922 |
+
"step": 28320
|
| 9923 |
+
},
|
| 9924 |
+
{
|
| 9925 |
+
"epoch": 0.04198045849652483,
|
| 9926 |
+
"grad_norm": 58.0,
|
| 9927 |
+
"learning_rate": 4.931252494162179e-05,
|
| 9928 |
+
"loss": 39.5904,
|
| 9929 |
+
"step": 28340
|
| 9930 |
+
},
|
| 9931 |
+
{
|
| 9932 |
+
"epoch": 0.04201008479045322,
|
| 9933 |
+
"grad_norm": 58.25,
|
| 9934 |
+
"learning_rate": 4.9312031048121036e-05,
|
| 9935 |
+
"loss": 39.6196,
|
| 9936 |
+
"step": 28360
|
| 9937 |
+
},
|
| 9938 |
+
{
|
| 9939 |
+
"epoch": 0.04203971108438161,
|
| 9940 |
+
"grad_norm": 61.25,
|
| 9941 |
+
"learning_rate": 4.931153715462028e-05,
|
| 9942 |
+
"loss": 39.5726,
|
| 9943 |
+
"step": 28380
|
| 9944 |
+
},
|
| 9945 |
+
{
|
| 9946 |
+
"epoch": 0.04206933737831,
|
| 9947 |
+
"grad_norm": 61.75,
|
| 9948 |
+
"learning_rate": 4.931104326111952e-05,
|
| 9949 |
+
"loss": 39.5673,
|
| 9950 |
+
"step": 28400
|
| 9951 |
+
},
|
| 9952 |
+
{
|
| 9953 |
+
"epoch": 0.042098963672238386,
|
| 9954 |
+
"grad_norm": 74.0,
|
| 9955 |
+
"learning_rate": 4.9310549367618766e-05,
|
| 9956 |
+
"loss": 39.546,
|
| 9957 |
+
"step": 28420
|
| 9958 |
+
},
|
| 9959 |
+
{
|
| 9960 |
+
"epoch": 0.04212858996616677,
|
| 9961 |
+
"grad_norm": 60.75,
|
| 9962 |
+
"learning_rate": 4.9310055474118e-05,
|
| 9963 |
+
"loss": 39.5487,
|
| 9964 |
+
"step": 28440
|
| 9965 |
+
},
|
| 9966 |
+
{
|
| 9967 |
+
"epoch": 0.04215821626009516,
|
| 9968 |
+
"grad_norm": 52.0,
|
| 9969 |
+
"learning_rate": 4.930956158061725e-05,
|
| 9970 |
+
"loss": 39.5736,
|
| 9971 |
+
"step": 28460
|
| 9972 |
+
},
|
| 9973 |
+
{
|
| 9974 |
+
"epoch": 0.042187842554023545,
|
| 9975 |
+
"grad_norm": 72.5,
|
| 9976 |
+
"learning_rate": 4.9309067687116496e-05,
|
| 9977 |
+
"loss": 39.45,
|
| 9978 |
+
"step": 28480
|
| 9979 |
+
},
|
| 9980 |
+
{
|
| 9981 |
+
"epoch": 0.04221746884795193,
|
| 9982 |
+
"grad_norm": 75.5,
|
| 9983 |
+
"learning_rate": 4.930857379361574e-05,
|
| 9984 |
+
"loss": 39.5851,
|
| 9985 |
+
"step": 28500
|
| 9986 |
+
},
|
| 9987 |
+
{
|
| 9988 |
+
"epoch": 0.04224709514188032,
|
| 9989 |
+
"grad_norm": 98.5,
|
| 9990 |
+
"learning_rate": 4.9308079900114976e-05,
|
| 9991 |
+
"loss": 39.5567,
|
| 9992 |
+
"step": 28520
|
| 9993 |
+
},
|
| 9994 |
+
{
|
| 9995 |
+
"epoch": 0.04227672143580871,
|
| 9996 |
+
"grad_norm": 56.25,
|
| 9997 |
+
"learning_rate": 4.9307586006614227e-05,
|
| 9998 |
+
"loss": 39.4901,
|
| 9999 |
+
"step": 28540
|
| 10000 |
+
},
|
| 10001 |
+
{
|
| 10002 |
+
"epoch": 0.0423063477297371,
|
| 10003 |
+
"grad_norm": 61.0,
|
| 10004 |
+
"learning_rate": 4.930709211311347e-05,
|
| 10005 |
+
"loss": 39.4569,
|
| 10006 |
+
"step": 28560
|
| 10007 |
+
},
|
| 10008 |
+
{
|
| 10009 |
+
"epoch": 0.042335974023665485,
|
| 10010 |
+
"grad_norm": 64.0,
|
| 10011 |
+
"learning_rate": 4.930659821961271e-05,
|
| 10012 |
+
"loss": 39.5612,
|
| 10013 |
+
"step": 28580
|
| 10014 |
+
},
|
| 10015 |
+
{
|
| 10016 |
+
"epoch": 0.04236560031759387,
|
| 10017 |
+
"grad_norm": 58.25,
|
| 10018 |
+
"learning_rate": 4.930610432611195e-05,
|
| 10019 |
+
"loss": 39.4928,
|
| 10020 |
+
"step": 28600
|
| 10021 |
+
},
|
| 10022 |
+
{
|
| 10023 |
+
"epoch": 0.04239522661152226,
|
| 10024 |
+
"grad_norm": 63.75,
|
| 10025 |
+
"learning_rate": 4.93056104326112e-05,
|
| 10026 |
+
"loss": 39.5275,
|
| 10027 |
+
"step": 28620
|
| 10028 |
+
},
|
| 10029 |
+
{
|
| 10030 |
+
"epoch": 0.042424852905450644,
|
| 10031 |
+
"grad_norm": 68.0,
|
| 10032 |
+
"learning_rate": 4.9305116539110444e-05,
|
| 10033 |
+
"loss": 39.4773,
|
| 10034 |
+
"step": 28640
|
| 10035 |
+
},
|
| 10036 |
+
{
|
| 10037 |
+
"epoch": 0.04245447919937903,
|
| 10038 |
+
"grad_norm": 72.0,
|
| 10039 |
+
"learning_rate": 4.930462264560968e-05,
|
| 10040 |
+
"loss": 39.5318,
|
| 10041 |
+
"step": 28660
|
| 10042 |
+
},
|
| 10043 |
+
{
|
| 10044 |
+
"epoch": 0.04248410549330742,
|
| 10045 |
+
"grad_norm": 56.0,
|
| 10046 |
+
"learning_rate": 4.930412875210893e-05,
|
| 10047 |
+
"loss": 39.4474,
|
| 10048 |
+
"step": 28680
|
| 10049 |
+
},
|
| 10050 |
+
{
|
| 10051 |
+
"epoch": 0.04251373178723581,
|
| 10052 |
+
"grad_norm": 66.0,
|
| 10053 |
+
"learning_rate": 4.9303634858608174e-05,
|
| 10054 |
+
"loss": 39.485,
|
| 10055 |
+
"step": 28700
|
| 10056 |
+
},
|
| 10057 |
+
{
|
| 10058 |
+
"epoch": 0.0425433580811642,
|
| 10059 |
+
"grad_norm": 71.5,
|
| 10060 |
+
"learning_rate": 4.930314096510741e-05,
|
| 10061 |
+
"loss": 39.4602,
|
| 10062 |
+
"step": 28720
|
| 10063 |
+
},
|
| 10064 |
+
{
|
| 10065 |
+
"epoch": 0.04257298437509258,
|
| 10066 |
+
"grad_norm": 58.0,
|
| 10067 |
+
"learning_rate": 4.9302647071606654e-05,
|
| 10068 |
+
"loss": 39.4508,
|
| 10069 |
+
"step": 28740
|
| 10070 |
+
},
|
| 10071 |
+
{
|
| 10072 |
+
"epoch": 0.04260261066902097,
|
| 10073 |
+
"grad_norm": 62.5,
|
| 10074 |
+
"learning_rate": 4.9302153178105904e-05,
|
| 10075 |
+
"loss": 39.4066,
|
| 10076 |
+
"step": 28760
|
| 10077 |
+
},
|
| 10078 |
+
{
|
| 10079 |
+
"epoch": 0.042632236962949356,
|
| 10080 |
+
"grad_norm": 61.75,
|
| 10081 |
+
"learning_rate": 4.930165928460515e-05,
|
| 10082 |
+
"loss": 39.4411,
|
| 10083 |
+
"step": 28780
|
| 10084 |
+
},
|
| 10085 |
+
{
|
| 10086 |
+
"epoch": 0.04266186325687774,
|
| 10087 |
+
"grad_norm": 62.0,
|
| 10088 |
+
"learning_rate": 4.9301165391104384e-05,
|
| 10089 |
+
"loss": 39.4268,
|
| 10090 |
+
"step": 28800
|
| 10091 |
+
},
|
| 10092 |
+
{
|
| 10093 |
+
"epoch": 0.04269148955080613,
|
| 10094 |
+
"grad_norm": 54.0,
|
| 10095 |
+
"learning_rate": 4.930067149760363e-05,
|
| 10096 |
+
"loss": 39.3404,
|
| 10097 |
+
"step": 28820
|
| 10098 |
+
},
|
| 10099 |
+
{
|
| 10100 |
+
"epoch": 0.042721115844734515,
|
| 10101 |
+
"grad_norm": 56.75,
|
| 10102 |
+
"learning_rate": 4.930017760410288e-05,
|
| 10103 |
+
"loss": 39.3246,
|
| 10104 |
+
"step": 28840
|
| 10105 |
+
},
|
| 10106 |
+
{
|
| 10107 |
+
"epoch": 0.04275074213866291,
|
| 10108 |
+
"grad_norm": 58.25,
|
| 10109 |
+
"learning_rate": 4.929968371060212e-05,
|
| 10110 |
+
"loss": 39.3779,
|
| 10111 |
+
"step": 28860
|
| 10112 |
+
},
|
| 10113 |
+
{
|
| 10114 |
+
"epoch": 0.042780368432591295,
|
| 10115 |
+
"grad_norm": 64.0,
|
| 10116 |
+
"learning_rate": 4.929918981710136e-05,
|
| 10117 |
+
"loss": 39.4462,
|
| 10118 |
+
"step": 28880
|
| 10119 |
+
},
|
| 10120 |
+
{
|
| 10121 |
+
"epoch": 0.04280999472651968,
|
| 10122 |
+
"grad_norm": 66.0,
|
| 10123 |
+
"learning_rate": 4.92986959236006e-05,
|
| 10124 |
+
"loss": 39.3394,
|
| 10125 |
+
"step": 28900
|
| 10126 |
+
},
|
| 10127 |
+
{
|
| 10128 |
+
"epoch": 0.04283962102044807,
|
| 10129 |
+
"grad_norm": 57.25,
|
| 10130 |
+
"learning_rate": 4.929820203009985e-05,
|
| 10131 |
+
"loss": 39.3868,
|
| 10132 |
+
"step": 28920
|
| 10133 |
+
},
|
| 10134 |
+
{
|
| 10135 |
+
"epoch": 0.042869247314376455,
|
| 10136 |
+
"grad_norm": 78.0,
|
| 10137 |
+
"learning_rate": 4.929770813659909e-05,
|
| 10138 |
+
"loss": 39.3587,
|
| 10139 |
+
"step": 28940
|
| 10140 |
+
},
|
| 10141 |
+
{
|
| 10142 |
+
"epoch": 0.04289887360830484,
|
| 10143 |
+
"grad_norm": 63.25,
|
| 10144 |
+
"learning_rate": 4.929721424309833e-05,
|
| 10145 |
+
"loss": 39.4021,
|
| 10146 |
+
"step": 28960
|
| 10147 |
+
},
|
| 10148 |
+
{
|
| 10149 |
+
"epoch": 0.04292849990223323,
|
| 10150 |
+
"grad_norm": 51.5,
|
| 10151 |
+
"learning_rate": 4.929672034959758e-05,
|
| 10152 |
+
"loss": 39.313,
|
| 10153 |
+
"step": 28980
|
| 10154 |
+
},
|
| 10155 |
+
{
|
| 10156 |
+
"epoch": 0.042958126196161614,
|
| 10157 |
+
"grad_norm": 57.5,
|
| 10158 |
+
"learning_rate": 4.9296226456096825e-05,
|
| 10159 |
+
"loss": 39.3612,
|
| 10160 |
+
"step": 29000
|
| 10161 |
}
|
| 10162 |
],
|
| 10163 |
"logging_steps": 20,
|
|
|
|
| 10177 |
"attributes": {}
|
| 10178 |
}
|
| 10179 |
},
|
| 10180 |
+
"total_flos": 1.869765281936782e+19,
|
| 10181 |
"train_batch_size": 48,
|
| 10182 |
"trial_name": null,
|
| 10183 |
"trial_params": null
|