Model Card

High-quality quantization of Qwen3-235B-A22B-Thinking-2507 without using an importance matrix (imatrix).

Run

ik_llama.cpp

./build/bin/llama-server \
    --alias anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K-00001-of-00003.gguf \
    --no-mmap -rtr \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 101000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    -fmoe \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
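
Once the server is up, you can smoke-test it through the OpenAI-compatible chat endpoint that llama-server exposes. A minimal sketch; the prompt is arbitrary, and the model field only needs to match the --alias above:

curl http://127.0.0.1:8090/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K",
        "messages": [{"role": "user", "content": "What is 17 * 24?"}]
    }'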

llama.cpp

./build/bin/llama-server \
    --alias anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K \
    --model /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K-00001-of-00003.gguf \
    --no-mmap \
    --temp 0.5 --top-k 0 --top-p 1.0 --min-p 0.1 --repeat-penalty 1.0 \
    --ctx-size 101000 \
    -ctk f16 -ctv f16 \
    -fa \
    -b 1024 -ub 1024 \
    --n-gpu-layers 99 \
    --override-tensor exps=CPU \
    --parallel 1 \
    --threads 32 \
    --threads-batch 64 \
    --host 127.0.0.1 \
    --port 8090
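
The llama.cpp command is identical except that it drops -rtr (run-time repacking) and -fmoe (fused MoE), which are ik_llama.cpp-only flags. In both commands, --override-tensor exps=CPU keeps the MoE expert tensors in system RAM while --n-gpu-layers 99 offloads everything else to the GPU. With VRAM to spare, you can additionally pin the experts of the first ten layers to the GPU by listing a more specific override before the catch-all, since earlier overrides take precedence. A hedged sketch, where the layer range and the CUDA0 device name are assumptions to adapt to your hardware:

    --n-gpu-layers 99 \
    --override-tensor "blk\.[0-9]\.ffn_.*_exps\.=CUDA0" \
    --override-tensor exps=CPU \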

Quantization Recipe

Quantized with ik_llama.cpp, but the output should work with any GGUF-compatible inference framework.
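
To double-check the per-tensor types in the finished files, the gguf Python package ships a gguf-dump utility; a quick sanity check, not part of the recipe:

pip install gguf
gguf-dump /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K-00001-of-00003.gguf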

#!/usr/bin/env bash

custom="
# Token embedding and output tensors
output\.weight=bf16
output_norm\.weight=f32
token_embd\.weight=bf16

# Attention (all blocks, 0-93)
blk\.[0-9]+\.attn_k\.weight=q8_0
blk\.[0-9]+\.attn_k_norm\.weight=f32
blk\.[0-9]+\.attn_norm\.weight=f32
blk\.[0-9]+\.attn_output\.weight=q8_0
blk\.[0-9]+\.attn_q\.weight=q8_0
blk\.[0-9]+\.attn_q_norm\.weight=f32
blk\.[0-9]+\.attn_v\.weight=q8_0

# MoE
blk\.[0-9]+\.ffn_down_exps\.weight=q6_K
blk\.[0-9]+\.ffn_gate_exps\.weight=q4_K
blk\.[0-9]+\.ffn_up_exps\.weight=q4_K

# Other
blk\.[0-9]+\.ffn_gate_inp\.weight=f32
blk\.[0-9]+\.ffn_norm\.weight=f32
"

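# Drop the comment lines and collapse the remaining rules into the
# single comma-separated list that --custom-q expects.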
custom=$(
  echo "$custom" | grep -v '^#' | \
  sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
)

echo "Running with: --custom-q $custom"

mkdir -p /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K
./build/bin/llama-quantize \
    --custom-q "$custom" \
    /mnt/data/Models/Qwen/Qwen3-235B-A22B-Thinking-2507-GGUF/Qwen3-235B-A22B-Thinking-2507-BF16-00001-of-00010.gguf \
    /mnt/data/Models/anikifoss/Qwen3-235B-A22B-Thinking-2507-DQ4_K/Qwen3-235B-A22B-Thinking-2507-DQ4_K.gguf \
    Q4_K \
    32
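
The trailing Q4_K is the base type applied to any tensor the custom rules do not match, and 32 is the thread count. After the sed pass, the rules collapse into a single comma-separated argument, so llama-quantize effectively receives something like:

--custom-q "output\.weight=bf16,output_norm\.weight=f32,token_embd\.weight=bf16,blk\.[0-9]+\.attn_k\.weight=q8_0,..."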