diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..e8dd95b11055693eb71da2b9bdbcdee95093d961 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,15 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+checkpoint-80000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-90000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-70000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-100000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-110000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-130000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-120000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-150000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-60000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-140000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-160000/trainer_state.json filter=lfs diff=lfs merge=lfs -text
+checkpoint-168144/trainer_state.json filter=lfs diff=lfs merge=lfs -text
diff --git a/checkpoint-10000/rng_state_4.pth b/checkpoint-10000/rng_state_4.pth
new file mode 100644
index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2
--- /dev/null
+++ b/checkpoint-10000/rng_state_4.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f
+size 15984
diff --git a/checkpoint-100000/rng_state_3.pth b/checkpoint-100000/rng_state_3.pth
new file mode 100644
index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9
--- /dev/null
+++ b/checkpoint-100000/rng_state_3.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84
+size 15984
diff --git a/checkpoint-100000/rng_state_6.pth b/checkpoint-100000/rng_state_6.pth
new file mode 100644
index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f
--- /dev/null
+++ b/checkpoint-100000/rng_state_6.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26
+size 15984
diff --git a/checkpoint-100000/trainer_state.json b/checkpoint-100000/trainer_state.json
new file mode 100644
index 0000000000000000000000000000000000000000..cf08ec7042b3da56376bdb9b2ab96e5ab166c2a5
--- /dev/null
+++ b/checkpoint-100000/trainer_state.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:94b3296cfba91b3d85433c52f73f4360316db6e8016670a8319248166d45ce97
+size 17536460
diff --git a/checkpoint-110000/rng_state_7.pth b/checkpoint-110000/rng_state_7.pth
new file mode 100644
index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9
--- /dev/null
+++ b/checkpoint-110000/rng_state_7.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a
+size 15984
diff --git a/checkpoint-110000/trainer_state.json b/checkpoint-110000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3dac6e22d8f4d1f2523b16ab1eb77a83ab76b517 --- /dev/null +++ b/checkpoint-110000/trainer_state.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6cbe4b60888b9a20e749a39c9202095897a81986432139cc12052aaba373c8f5 +size 19298676 diff --git a/checkpoint-120000/rng_state_0.pth b/checkpoint-120000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-120000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-120000/rng_state_1.pth b/checkpoint-120000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-120000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-120000/rng_state_6.pth b/checkpoint-120000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-120000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-120000/rng_state_7.pth b/checkpoint-120000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-120000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-120000/scheduler.pt b/checkpoint-120000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e52026a1de16197826c58b573974d4d04245c5a8 --- /dev/null +++ b/checkpoint-120000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a70af2bb64373017d7032bb8f7147abac42d23ddbd67e3a6a4aaa6a92b7190 +size 1064 diff --git a/checkpoint-120000/trainer_state.json b/checkpoint-120000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..15eab27368bf8dfd741eb56966b97857a632ad8b --- /dev/null +++ b/checkpoint-120000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:993df490285ebdfafcf544990feb3e287af648a1d83d34041aae53d3d802feb1 +size 21060347 diff --git a/checkpoint-130000/rng_state_1.pth b/checkpoint-130000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-130000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-130000/rng_state_3.pth b/checkpoint-130000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-130000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-130000/rng_state_5.pth b/checkpoint-130000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-130000/rng_state_5.pth @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-130000/rng_state_7.pth b/checkpoint-130000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-130000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-130000/trainer_state.json b/checkpoint-130000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d8f298201578b5e75482695178e12721dc0fe77 --- /dev/null +++ b/checkpoint-130000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c7190fec1856b279f80b2acd81b6a56d3f939e97f15ea315a6c55456430680 +size 22814765 diff --git a/checkpoint-140000/rng_state_2.pth b/checkpoint-140000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-140000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-140000/rng_state_4.pth b/checkpoint-140000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-140000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-140000/rng_state_7.pth b/checkpoint-140000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-140000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-140000/scheduler.pt b/checkpoint-140000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..89ffb1508c775d23a46daa67ed0f6bea7440eac7 --- /dev/null +++ b/checkpoint-140000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f253f091177361c8be3f2e930d54158f465c06ecd7559dd797f06d351edf12b7 +size 1064 diff --git a/checkpoint-140000/trainer_state.json b/checkpoint-140000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..592933229b946e6caec13f6703fbf6a2520b67ac --- /dev/null +++ b/checkpoint-140000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afc54e9b48f65fc658b6c50d37e7b5d80e2a52f58f339636428726cb1c94b81 +size 24571198 diff --git a/checkpoint-150000/rng_state_1.pth b/checkpoint-150000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-150000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-150000/rng_state_3.pth b/checkpoint-150000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ 
b/checkpoint-150000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-150000/rng_state_4.pth b/checkpoint-150000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-150000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-150000/scheduler.pt b/checkpoint-150000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3ac53646ec1365189412c6d4ec909c5f13375a1 --- /dev/null +++ b/checkpoint-150000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bea14e25f2a7f84d371ef8fe19b4ffa2c20fbf0a696f18247df48a4aaebe6fc +size 1064 diff --git a/checkpoint-150000/trainer_state.json b/checkpoint-150000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..27bdee179b514a5d1421102d5276ec8438905959 --- /dev/null +++ b/checkpoint-150000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7ead47e80809a72ad6540965942da9094aa2208d0120e9d316011f3fe4f3e4 +size 26332314 diff --git a/checkpoint-160000/rng_state_0.pth b/checkpoint-160000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-160000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-160000/rng_state_1.pth b/checkpoint-160000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-160000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-160000/rng_state_2.pth b/checkpoint-160000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-160000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-160000/rng_state_3.pth b/checkpoint-160000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-160000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-160000/rng_state_4.pth b/checkpoint-160000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-160000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-160000/rng_state_5.pth b/checkpoint-160000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- 
/dev/null +++ b/checkpoint-160000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-160000/rng_state_7.pth b/checkpoint-160000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-160000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-160000/scheduler.pt b/checkpoint-160000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b38d402b4519c807dc5c0d2cd4a4229e3f6f5cab --- /dev/null +++ b/checkpoint-160000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:731f143540afcf9b80a64272108a9c11ac26849d842d12d4ec579b8a47a14c02 +size 1064 diff --git a/checkpoint-160000/trainer_state.json b/checkpoint-160000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..283b8b29ec3192f2f670f4d76c5bafd8678cd772 --- /dev/null +++ b/checkpoint-160000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:216e5da91b5b5ee3f4b82a535cd7f763b93f7c753aa01b60fb0c5bfba8313727 +size 28090096 diff --git a/checkpoint-168144/rng_state_1.pth b/checkpoint-168144/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-168144/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-168144/rng_state_2.pth b/checkpoint-168144/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-168144/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-168144/rng_state_3.pth b/checkpoint-168144/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-168144/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-168144/rng_state_4.pth b/checkpoint-168144/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-168144/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-168144/rng_state_5.pth b/checkpoint-168144/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-168144/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-168144/rng_state_6.pth b/checkpoint-168144/rng_state_6.pth new file mode 100644 index 
0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-168144/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-168144/rng_state_7.pth b/checkpoint-168144/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-168144/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-168144/scheduler.pt b/checkpoint-168144/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f76a1a1aa589e1fe2011615c41e836b685293f37 --- /dev/null +++ b/checkpoint-168144/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04f3abfb86c3a716132002fa3f0d0d10f7b28502b039c62c368f9e817f93511a +size 1064 diff --git a/checkpoint-168144/trainer_state.json b/checkpoint-168144/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..767c2a2e77e6569abc83b0e6deddb00af6ff5faa --- /dev/null +++ b/checkpoint-168144/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d85a901acda4da7aafc3d6baf7f9fd9a3c77f6132a3da2ed2921f9a40b462824 +size 29522959 diff --git a/checkpoint-20000/rng_state_0.pth b/checkpoint-20000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-20000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-20000/rng_state_1.pth b/checkpoint-20000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-20000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-20000/rng_state_3.pth b/checkpoint-20000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-20000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-20000/scheduler.pt b/checkpoint-20000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1a9bd72b74e20c35ea6f869fc9f9e4e2b1ba359e --- /dev/null +++ b/checkpoint-20000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4e5e645588d44ccac291aff2441d3eb20907e8b281244135ce511322173202 +size 1064 diff --git a/checkpoint-30000/rng_state_3.pth b/checkpoint-30000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-30000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-30000/rng_state_7.pth b/checkpoint-30000/rng_state_7.pth new file mode 100644 index 
0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-30000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-30000/scheduler.pt b/checkpoint-30000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f0f4db67f3b19e1d5530cd80b6d17f9ae238047a --- /dev/null +++ b/checkpoint-30000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87ac17e8507155f7414e20152345d2f5db88cecd158e7a02814b19d43988faee +size 1064 diff --git a/checkpoint-40000/rng_state_5.pth b/checkpoint-40000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-40000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-60000/rng_state_3.pth b/checkpoint-60000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-60000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-60000/scheduler.pt b/checkpoint-60000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..df0fc1cbe0c68e4584aa721487452526f0c69b0a --- /dev/null +++ b/checkpoint-60000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcebd3f938e91ea913e2057ecf107b9ad246ba1337158c0c87f5d254795a47c0 +size 1064 diff --git a/checkpoint-60000/trainer_state.json b/checkpoint-60000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..895bc29267793578079b7de8eadccf24db710e1c --- /dev/null +++ b/checkpoint-60000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd50d067cf406c83e0172807c3c1b78af83ca27ce1755fd6edc00f220d7f9bc0 +size 10524148 diff --git a/checkpoint-70000/rng_state_1.pth b/checkpoint-70000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-70000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-70000/trainer_state.json b/checkpoint-70000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e1afae31f123c45a0eeff84f18f86050e06778ca --- /dev/null +++ b/checkpoint-70000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4136876c2e2426601ffe902bfc79528124f4aeb56d48684d8ed3be3d2d23bff8 +size 12276143 diff --git a/checkpoint-80000/scheduler.pt b/checkpoint-80000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..128a22828626d50b218620a136eb10dc1fd3d850 --- /dev/null +++ b/checkpoint-80000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5f3b67ae2794985196af21293251ae5b5b1e1f177e172218e270f7569cdec36 +size 1064 diff --git a/checkpoint-80000/trainer_state.json b/checkpoint-80000/trainer_state.json new file mode 100644 index 
0000000000000000000000000000000000000000..12426daf67e77faf792b94244b56567dc481dd7a --- /dev/null +++ b/checkpoint-80000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9081f1eb392c8332dfbb4d6fcd776585ab97bea9f56f9558db63d6c6b98099c1 +size 14031756 diff --git a/checkpoint-90000/config.json b/checkpoint-90000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-90000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-90000/generation_config.json b/checkpoint-90000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-90000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-90000/model.safetensors.index.json b/checkpoint-90000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-90000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": 
"model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": 
"model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": 
"model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": 
"model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + 
"model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": 
"model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-90000/rng_state_1.pth b/checkpoint-90000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-90000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-90000/rng_state_2.pth b/checkpoint-90000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-90000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-90000/rng_state_3.pth b/checkpoint-90000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-90000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-90000/rng_state_4.pth b/checkpoint-90000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-90000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-90000/rng_state_5.pth b/checkpoint-90000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-90000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-90000/rng_state_6.pth b/checkpoint-90000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-90000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-90000/rng_state_7.pth b/checkpoint-90000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-90000/rng_state_7.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-90000/trainer_state.json b/checkpoint-90000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3754faface84e046577fa0c69dbf61e6abfb11b1 --- /dev/null +++ b/checkpoint-90000/trainer_state.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8d89782af592b189da9aca67572d0b32482e0e18a75d1cddc158497c8195d49 +size 15785048
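The directories added in this diff follow the standard Hugging Face Trainer checkpoint layout: per-rank rng_state_*.pth files, scheduler.pt, trainer_state.json, a Qwen2 config.json and generation_config.json, and sharded safetensors weights described by model.safetensors.index.json, all tracked as Git LFS pointers. As a minimal sketch of how such a checkpoint is typically consumed (assuming the LFS objects have been fetched, e.g. with `git lfs pull`, and that tokenizer files are available alongside the config, which this diff does not show), one of the checkpoint directories can be loaded with transformers:

```python
# Minimal sketch: load one of the Trainer checkpoint directories above as a causal LM.
# Assumptions (not shown in this diff): the LFS-backed safetensors shards have been
# fetched (e.g. `git lfs pull`) and any tokenizer files needed for inference exist
# next to config.json.
from transformers import AutoConfig, AutoModelForCausalLM

ckpt_dir = "checkpoint-90000"  # any checkpoint-* directory with config + weight shards

config = AutoConfig.from_pretrained(ckpt_dir)           # Qwen2 config shown in this diff
model = AutoModelForCausalLM.from_pretrained(ckpt_dir)  # resolves model.safetensors.index.json

print(config.model_type, config.vocab_size)  # "qwen2", 180348
```

The other common use is resuming training: passing the directory to `Trainer.train(resume_from_checkpoint="checkpoint-90000")` restores scheduler.pt, the per-rank rng_state_*.pth files, and trainer_state.json alongside the model weights.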