---
quantized_by: anikifoss
pipeline_tag: text-generation
base_model: Qwen/Qwen3-235B-A22B-Thinking-2507
license: apache-2.0
license_link: LICENSE
base_model_relation: quantized
tags:
- conversational
---

# Model Card

High-quality quantization of **Qwen3-235B-A22B-Thinking-2507** without using an imatrix.
# Run

## ik_llama.cpp

```bash
./build/bin/llama-server \
    --alias anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K-00001-of-00003.gguf \
    --no-mmap -rtr \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 101000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    -fmoe \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
```
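Once the server is up, any OpenAI-compatible client can talk to it. A minimal smoke test with `curl` (the endpoint path and JSON shape follow llama-server's OpenAI-compatible API; the `model` value here just echoes the `--alias` set above):

```bash
curl -s http://127.0.0.1:8090/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K",
          "messages": [{"role": "user", "content": "Briefly explain what a GGUF file is."}],
          "temperature": 0.5
        }'
```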
## llama.cpp

Same command, minus the ik_llama.cpp-specific `-rtr` and `-fmoe` flags:

```bash
./build/bin/llama-server \
    --alias anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K-00001-of-00003.gguf \
    --no-mmap \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 101000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
```
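Loading a model this size takes a while, so it can help to poll llama-server's `/health` endpoint (assuming the fork keeps it) before sending work; it returns HTTP 200 once the model has finished loading:

```bash
# Poll until the server reports that the model is loaded.
until curl -sf http://127.0.0.1:8090/health > /dev/null; do
    sleep 5
done
echo "Server is ready."
```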

# Quantization Recipe

Quantized with [ik_llama](https://github.com/ikawrakow/ik_llama.cpp), but the output should work with any GGUF-compatible inference framework.

```bash
#!/usr/bin/env bash

# Per-tensor overrides: "regex=type", matched against tensor names.
custom="
# Token embedding and output tensors
output\.weight=bf16
output_norm\.weight=f32
token_embd\.weight=bf16

# Attention (all blocks)
blk\.[0-9]+\.attn_k\.weight=q8_0
blk\.[0-9]+\.attn_k_norm\.weight=f32
blk\.[0-9]+\.attn_norm\.weight=f32
blk\.[0-9]+\.attn_output\.weight=q8_0
blk\.[0-9]+\.attn_q\.weight=q8_0
blk\.[0-9]+\.attn_q_norm\.weight=f32
blk\.[0-9]+\.attn_v\.weight=q8_0

# MoE
blk\.[0-9]+\.ffn_down_exps\.weight=q6_K
blk\.[0-9]+\.ffn_gate_exps\.weight=q4_K
blk\.[0-9]+\.ffn_up_exps\.weight=q4_K

# Other
blk\.[0-9]+\.ffn_gate_inp\.weight=f32
blk\.[0-9]+\.ffn_norm\.weight=f32
"

# Strip the comment lines and join the remaining rules into the
# comma-separated list that --custom-q expects.
custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

echo "Running with: --custom-q $custom"

mkdir -p /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K
# Positional args: input BF16 GGUF, output GGUF,
# fallback type for unmatched tensors (Q4_K), thread count (32).
./build/bin/llama-quantize \
    --custom-q "$custom" \
    /mnt/data/Models/Qwen/Qwen3-235B-A22B-Thinking-2507-GGUF/Qwen3-235B-A22B-Thinking-2507-BF16-00001-of-00010.gguf \
    /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K.gguf \
    Q4_K \
    32
```
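To spot-check that the overrides took effect, the tensor listing from `gguf-dump` (shipped with the `gguf` Python package; output format may vary between versions) can be grepped for the attention weights, which per the recipe above should report `q8_0`:

```bash
pip install gguf  # provides the gguf-dump console script
gguf-dump /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K.gguf \
    | grep -E 'attn_(q|k|v|output)\.weight'
```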