#!/usr/bin/env python3
"""
GLM-4.6 AWQ Quantization Script
Quantizes GLM-4.6 (357B MoE) to 4-bit AWQ for efficient inference with vLLM.
Requirements:
- 1× GPU with 48GB+ VRAM (single GPU is optimal)
- 768GB+ system RAM (DDR4/DDR5)
- 300GB+ swap space (will be actively used)
- PyTorch with CUDA support
- llm-compressor
- transformers
- datasets
Hardware Notes:
- Multi-GPU provides NO quantization speedup (process is RAM-bound, not GPU-bound)
- The full BF16 model (~714GB) will be offloaded to system RAM/swap
- Quantized using: 1× RTX PRO 6000 Blackwell Max-Q (96GB) + 768GB RAM
- Quantization time: ~5 hours (includes calibration, smoothing, compression, and saving)
Usage:
python quantize_glm46_awq.py --model zai-org/GLM-4.6 --output ./GLM-4.6-AWQ
Advanced options:
python quantize_glm46_awq.py \
--model zai-org/GLM-4.6 \
--output ./GLM-4.6-AWQ \
--device-map sequential \
--max-cpu-memory 750GiB \
--cal-samples 512
"""
import os
import argparse
import json
import shutil
import pathlib
from typing import List
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
def add_no_split(cfg: AutoConfig, classes: List[str]) -> AutoConfig:
"""Prevent splitting specific module classes across devices."""
ns = set(getattr(cfg, "no_split_module_classes", []) or [])
ns.update(classes)
cfg.no_split_module_classes = list(ns)
return cfg
def compute_batch_size(seq_len: int, target_tokens: int) -> int:
"""Calculate batch size to achieve target tokens per calibration step."""
return max(1, target_tokens // seq_len)
def clone_and_fix_index(src_dir: str) -> str:
"""
Clone model directory and fix empty-string key in weight_map if present.
This prevents device_map='auto' errors with some sharded checkpoints.
"""
src = pathlib.Path(src_dir)
dst = src.parent / (src.name + "_fixed_index")
if dst.exists():
shutil.rmtree(dst)
shutil.copytree(src, dst)
candidates = ["model.safetensors.index.json", "pytorch_model.bin.index.json"]
found = None
for c in candidates:
p = dst / c
if p.exists():
found = p
break
if not found:
return str(dst)
with open(found, "r") as f:
idx = json.load(f)
wm = idx.get("weight_map", {})
if "" in wm:
del wm[""]
idx["weight_map"] = wm
with open(found, "w") as f:
json.dump(idx, f)
return str(dst)
def main():
parser = argparse.ArgumentParser(description="Quantize GLM-4.6 to 4-bit AWQ")
parser.add_argument("--model", required=True, help="Path or HF ID of GLM-4.6 model (e.g., zai-org/GLM-4.6)")
parser.add_argument("--output", required=True, help="Output directory for quantized model")
parser.add_argument("--cal-samples", type=int, default=512, help="Number of calibration samples (default: 512)")
parser.add_argument("--cal-seq-len", type=int, default=2048, help="Calibration sequence length (default: 2048)")
parser.add_argument("--batch-tokens", type=int, default=131072, help="Tokens per calibration step (default: 131072)")
parser.add_argument("--dataset", default="neuralmagic/LLM_compression_calibration", help="Calibration dataset")
parser.add_argument("--dataset-split", default="train", help="Dataset split to use")
parser.add_argument("--device-map", choices=["auto", "sequential"], default="auto",
help="Device placement strategy: 'auto' (recommended) or 'sequential' (robust)")
parser.add_argument("--max-memory-per-gpu", type=str, default="92GiB",
help="Max memory per GPU (default: 92GiB for 96GB GPUs)")
parser.add_argument("--max-cpu-memory", type=str, default="500GiB",
help="Max CPU memory for offloading (default: 500GiB)")
args = parser.parse_args()
# Environment setup
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("PYTORCH_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:512")
# Use only GPU 0 for quantization (multi-GPU provides no benefit - process is RAM-bound)
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
    # Enable TF32 for faster matmul/conv on Ampere+ GPUs.
    # Prefer the newer fp32_precision API; fall back to allow_tf32 on older PyTorch.
    try:
        torch.backends.cuda.matmul.fp32_precision = "tf32"
        torch.backends.cudnn.conv.fp32_precision = "tf32"
    except Exception:
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True
torch.set_num_threads(8)
# Verify CUDA availability
if not torch.cuda.is_available():
raise RuntimeError("CUDA is not available. This script requires GPU(s).")
num_gpus = torch.cuda.device_count()
print(f"✓ Found {num_gpus} CUDA device(s)")
print(f"✓ Using GPU 0 for quantization (CUDA_VISIBLE_DEVICES={os.environ.get('CUDA_VISIBLE_DEVICES', 'all')})")
print(f"\nNote: Multi-GPU provides NO speedup for quantization - the process is RAM-bound.")
print(f" The full BF16 model (~714GB) will be offloaded to system RAM/swap.")
# Load configuration
print(f"Loading config from: {args.model}")
cfg = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
# Prevent splitting merged linear layers across devices
cfg = add_no_split(cfg, ["MergedColumnParallelLinear"])
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True, use_fast=True)
# Load model with device placement
print(f"Loading model weights from: {args.model}")
load_dir = args.model
if args.device_map == "auto":
try:
load_dir = clone_and_fix_index(args.model)
except Exception as e:
print(f"Index sanitization skipped: {e}")
# Configure memory allocation
max_mem = {i: args.max_memory_per_gpu for i in range(num_gpus)}
max_mem["cpu"] = args.max_cpu_memory
try:
model = AutoModelForCausalLM.from_pretrained(
load_dir,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True,
device_map=args.device_map,
config=cfg,
max_memory=max_mem,
offload_folder=None,
offload_state_dict=False,
)
except KeyError as e:
if args.device_map == "auto":
print(f"Auto device_map failed with {e}; falling back to sequential...")
model = AutoModelForCausalLM.from_pretrained(
load_dir,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
trust_remote_code=True,
device_map="sequential",
config=cfg,
max_memory=max_mem,
)
else:
raise
print("✓ Model loaded successfully")
# Print GPU memory usage
print("\nGPU Memory Usage:")
for i in range(num_gpus):
allocated = torch.cuda.memory_allocated(i) / 1e9
peak = torch.cuda.max_memory_allocated(i) / 1e9
print(f" GPU {i}: {allocated:.2f} GB allocated / {peak:.2f} GB peak")
# Load calibration dataset
print(f"\nLoading calibration dataset: {args.dataset}")
ds = load_dataset(args.dataset, split=args.dataset_split)
    ds = ds.shuffle(seed=42).select(range(min(args.cal_samples, len(ds))))
print(f"✓ Selected {len(ds)} calibration samples")
seq_len = args.cal_seq_len
batch_size = compute_batch_size(seq_len, args.batch_tokens)
print(f"Calibration config: seq_len={seq_len}, batch_size={batch_size}")
# AWQ quantization recipe
# Keep critical layers at full precision for quality
ignore_patterns = [
"lm_head",
"model.embed_tokens",
"re:.*input_layernorm$",
"re:.*post_attention_layernorm$",
"model.norm",
"re:.*q_norm$",
"re:.*k_norm$",
"re:.*shared_experts.*", # Always-active experts
"re:.*mlp\\.gate\\.weight$", # MoE router
"re:.*mlp\\.gate\\..*bias$", # MoE router bias
"re:model.layers.[0-2]\\.", # First 3 layers for quality
]
# Target patterns for quantization
targets = [
"re:.*gate_proj.*", # MLP projections
"re:.*up_proj.*",
"re:.*down_proj.*",
"re:.*k_proj.*", # Attention projections
"re:.*q_proj.*",
"re:.*v_proj.*",
"re:.*o_proj.*",
]
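    # Together, these patterns quantize the attention and MLP/expert projections while the
    # MoE router (mlp.gate), shared experts, norms, embeddings, lm_head, and the first three
    # decoder layers stay in BF16.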
recipe = [
AWQModifier(
ignore=ignore_patterns,
config_groups={
"group_0": {
"targets": targets,
"weights": {
"num_bits": 4,
"type": "int",
"symmetric": True,
"group_size": 128,
"strategy": "group",
"dynamic": False,
},
"input_activations": None,
"output_activations": None,
"format": None,
}
},
)
]
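    # The scheme above is weight-only int4 (W4A16): symmetric, group size 128, with
    # activations left unquantized, as specified in the config_groups entry.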
# Run AWQ quantization
print("\n" + "="*80)
print("Starting AWQ quantization...")
print("="*80)
with torch.inference_mode():
oneshot_args = {
"model": model,
"dataset": ds,
"recipe": recipe,
"max_seq_length": seq_len,
"num_calibration_samples": len(ds),
}
# Add batch_size if supported
try:
from inspect import signature
if "batch_size" in signature(oneshot).parameters:
oneshot_args["batch_size"] = batch_size
except Exception:
pass
oneshot(**oneshot_args)
print("\n✓ AWQ quantization completed successfully")
# Save quantized model
print(f"\nSaving quantized model to: {args.output}")
os.makedirs(args.output, exist_ok=True)
model.save_pretrained(args.output, save_compressed=True)
tokenizer.save_pretrained(args.output)
print("\n" + "="*80)
print("QUANTIZATION COMPLETE")
print("="*80)
print(f"Quantized model saved to: {args.output}")
print(f"\nModel size on disk: ~176 GB (39 safetensors files)")
print(f"\nTo use with vLLM:")
print(f" vllm serve {args.output} \\")
print(f" --tensor-parallel-size 4 \\")
print(f" --enable-expert-parallel \\")
print(f" --trust-remote-code")
print("="*80)
if __name__ == "__main__":
main()