Duplicate from mwhanna/qwen3-4b-transcoders
Browse filesCo-authored-by: Michael Hanna <[email protected]>
- .gitattributes +35 -0
- README.md +3 -0
- layer_0.safetensors +3 -0
- layer_1.safetensors +3 -0
- layer_10.safetensors +3 -0
- layer_11.safetensors +3 -0
- layer_12.safetensors +3 -0
- layer_13.safetensors +3 -0
- layer_14.safetensors +3 -0
- layer_15.safetensors +3 -0
- layer_16.safetensors +3 -0
- layer_17.safetensors +3 -0
- layer_18.safetensors +3 -0
- layer_19.safetensors +3 -0
- layer_2.safetensors +3 -0
- layer_20.safetensors +3 -0
- layer_21.safetensors +3 -0
- layer_22.safetensors +3 -0
- layer_23.safetensors +3 -0
- layer_24.safetensors +3 -0
- layer_25.safetensors +3 -0
- layer_26.safetensors +3 -0
- layer_27.safetensors +3 -0
- layer_28.safetensors +3 -0
- layer_29.safetensors +3 -0
- layer_3.safetensors +3 -0
- layer_30.safetensors +3 -0
- layer_31.safetensors +3 -0
- layer_32.safetensors +3 -0
- layer_33.safetensors +3 -0
- layer_34.safetensors +3 -0
- layer_35.safetensors +3 -0
- layer_4.safetensors +3 -0
- layer_5.safetensors +3 -0
- layer_6.safetensors +3 -0
- layer_7.safetensors +3 -0
- layer_8.safetensors +3 -0
- layer_9.safetensors +3 -0
- wanb-config.yaml +83 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
---
|
layer_0.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f57f1cc3292492a73733192adfdb122d7e361a5e797327f363646a352a95ff5
|
3 |
+
size 1678054736
|
layer_1.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aafc41c7486aa31a700707b6a9bef3c9a2dfb158ccd8f69cd63c9d5ef0a94396
|
3 |
+
size 1678054736
|
layer_10.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a0a8f99900efe0c7e5310fcea60a74655172e0cd3c5d29a8b78ca0468ea85bf8
|
3 |
+
size 1678054736
|
layer_11.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a82abfcbc1e915bf90303171590487e2e7f97bc110165c3aef5a6a64e31b50d
|
3 |
+
size 1678054736
|
layer_12.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4ae9f81f1f1985ca3a0b2b7341db2e686988a407fca04d3795c92975efeb584
|
3 |
+
size 1678054736
|
layer_13.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d632107b61559ded3306214b55bc6e77afc5fa570d8c8c31421b75ddde275a9b
|
3 |
+
size 1678054736
|
layer_14.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0655c96cf7b556398214c08cf18a3e72d3cb7cee3bcc24d43de6208e5433ec1
|
3 |
+
size 1678054736
|
layer_15.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e2da70628fba42012d86df97714560957d03ca6ed40e053f1a728500a36a30ea
|
3 |
+
size 1678054736
|
layer_16.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78285e291b0cc3df7a7eff18f2a32cca64630d8b5210318ac8e4774d694b7dd7
|
3 |
+
size 1678054736
|
layer_17.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ce85f9315a0ef68285d44ee5581527651cf358a97abc5e33c8d5148933929bb
|
3 |
+
size 1678054736
|
layer_18.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8d970cd59bafcbace2efbcc5cb95e06b1bf5e57b906bd17d84a04ef7ad872db0
|
3 |
+
size 1678054736
|
layer_19.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8dc12824ae21b2106d00e50b450bcf590fd75857498acff18813d98f947c6db9
|
3 |
+
size 1678054736
|
layer_2.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bdc5818da3b1e855707b7de8430467e05662885f40fd8b895dbed8fdf28aaa3e
|
3 |
+
size 1678054736
|
layer_20.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:576119742b830846ec9cb7c53311f7dfc0f58adffb9dbcbeb0a501d2b1d8c2dd
|
3 |
+
size 1678054736
|
layer_21.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d013a31692a263e6beb7c574e79e8169b27b365a622d1efbb4082d0467353c93
|
3 |
+
size 1678054736
|
layer_22.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed8102a91559b5334b08b7b4cb84dadbfea108a73890f78c0fdd7f95285b4f99
|
3 |
+
size 1678054736
|
layer_23.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1747f36f31a956f31e6ea03c70b1b4a9ac7efbee52643e87095dc92521ad217
|
3 |
+
size 1678054736
|
layer_24.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:91a84bc9d36f6e3b21d596a98e30fab83dbb9011d736bc6a3206a942f4cc3289
|
3 |
+
size 1678054736
|
layer_25.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:371ba57f9e04bdc42f5e24a0840cdce8791562626028af49bf917f2ef868f2e3
|
3 |
+
size 1678054736
|
layer_26.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f51d80b017591d1c6b5b3c24e535888ab05c2524d911dc0a43fc69bfb6fc3667
|
3 |
+
size 1678054736
|
layer_27.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6229944269113980e8a4a5c58182ed81a601463591d9519d2fcb7cd5e44dc02
|
3 |
+
size 1678054736
|
layer_28.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:786dda24ee88348833e264de9a2109d4bed676b9184a39d01b0f402a76409418
|
3 |
+
size 1678054736
|
layer_29.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:90818b782d6c49623889423fe0a5856b585cc577f31fabee13ff5ec77740b2f9
|
3 |
+
size 1678054736
|
layer_3.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78ceddf0e5524ecb11b73fbee9f831aaf8a1cca86f5c19ee874b8e0638404f9e
|
3 |
+
size 1678054736
|
layer_30.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bd1d2801420e644323a79d36ab900e14912e44db0493b9ba34d69992f09d597a
|
3 |
+
size 1678054736
|
layer_31.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7ff6fcf9ae067206049726a798ef16a29939f97843e95d465a654a0e1ba5b63e
|
3 |
+
size 1678054736
|
layer_32.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:367531b9152ed6ae75fc14f3414005f6a682f4d06bd9b4ca7fda338b60eb22e7
|
3 |
+
size 1678054736
|
layer_33.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:542899ee0098a278a2b8bb16a1dd6b45d2d65b688d2f505f9bb1e1eba5d026b5
|
3 |
+
size 1678054736
|
layer_34.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c66263fbfbc6eb447bada4311cf1b5cb3389f990894ae69a5ba8ba332c003c85
|
3 |
+
size 1678054736
|
layer_35.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72b36d6232c2e8816fd97a4f404faf77e9f393d55a4da882cfcf37bc59c3ca2f
|
3 |
+
size 1678054736
|
layer_4.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac83291a8bc07516288dbe2ec356dd99e79b0622e7ecc8f3df0c7d337b9f850e
|
3 |
+
size 1678054736
|
layer_5.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2ba78a2c4633e91755bb247517476a2d578edd75021ca95d844063a575a7ec47
|
3 |
+
size 1678054736
|
layer_6.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:73077054c45dad48872c1b84f3b148a64fc0a1be05d12e78c4b1ad118722f29a
|
3 |
+
size 1678054736
|
layer_7.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9d2e1f2f348db31ed5b1684cd7399d1dc69be25c0c107fcd4a488906eeef600
|
3 |
+
size 1678054736
|
layer_8.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:18acd2e45e52e9616b518eeea4db986718868daec496f40619e1cd9118af63c4
|
3 |
+
size 1678054736
|
layer_9.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f29f08ee4835c28e907ca75ee92c3502bde8e5a829dcada8e51921642c5328fb
|
3 |
+
size 1678054736
|
wanb-config.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_wandb:
|
2 |
+
value:
|
3 |
+
cli_version: 0.20.1
|
4 |
+
m:
|
5 |
+
- "1": gpu/memory_allocated_gb
|
6 |
+
"6":
|
7 |
+
- 3
|
8 |
+
"7": []
|
9 |
+
- "1": gpu/memory_reserved_gb
|
10 |
+
"6":
|
11 |
+
- 3
|
12 |
+
"7": []
|
13 |
+
- "1": gpu/max_memory_allocated_gb
|
14 |
+
"6":
|
15 |
+
- 3
|
16 |
+
"7": []
|
17 |
+
python_version: 3.11.10
|
18 |
+
t:
|
19 |
+
"1":
|
20 |
+
- 1
|
21 |
+
- 11
|
22 |
+
- 49
|
23 |
+
- 71
|
24 |
+
"2":
|
25 |
+
- 1
|
26 |
+
- 11
|
27 |
+
- 49
|
28 |
+
- 71
|
29 |
+
"3":
|
30 |
+
- 7
|
31 |
+
- 13
|
32 |
+
- 16
|
33 |
+
- 55
|
34 |
+
- 61
|
35 |
+
"4": 3.11.10
|
36 |
+
"5": 0.20.1
|
37 |
+
"6": 4.52.4
|
38 |
+
"12": 0.20.1
|
39 |
+
"13": linux-x86_64
|
40 |
+
act_fn:
|
41 |
+
value: relu
|
42 |
+
batch_size:
|
43 |
+
value: 8192
|
44 |
+
before_ln:
|
45 |
+
value: false
|
46 |
+
c_coeff:
|
47 |
+
value: 4
|
48 |
+
cooldown_start_frac:
|
49 |
+
value: 0.8
|
50 |
+
d_feature:
|
51 |
+
value: 163840
|
52 |
+
d_model:
|
53 |
+
value: 2560
|
54 |
+
device:
|
55 |
+
value: cuda:0
|
56 |
+
initial_lr:
|
57 |
+
value: 0.0002
|
58 |
+
layer_idx:
|
59 |
+
value: 0
|
60 |
+
lr:
|
61 |
+
value: 0.0002
|
62 |
+
min_lr_ratio:
|
63 |
+
value: 0
|
64 |
+
model_name:
|
65 |
+
value: Qwen/Qwen3-4B
|
66 |
+
model_type:
|
67 |
+
value: qwen
|
68 |
+
n_batches:
|
69 |
+
value: 152
|
70 |
+
n_grad_steps:
|
71 |
+
value: 4
|
72 |
+
n_steps:
|
73 |
+
value: 122070
|
74 |
+
preact_coeff:
|
75 |
+
value: 6e-05
|
76 |
+
skip_connections:
|
77 |
+
value: false
|
78 |
+
sparsity_coeff_final:
|
79 |
+
value: 8
|
80 |
+
x_scale:
|
81 |
+
value: 1
|
82 |
+
y_scale:
|
83 |
+
value: 1
|