4-bit Quantization of GLM-4.1V-9B-Thinking

Quantization Method
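The base model is quantized to 4-bit NF4 with nested (double) quantization via bitsandbytes, using bfloat16 as the compute dtype, and the result is saved to disk for serving.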

from transformers import AutoProcessor, Glm4vForConditionalGeneration
import torch
from transformers import BitsAndBytesConfig

MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
SAVE_PATH = "/define/the/path/to/save/model"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = Glm4vForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto",
)

model.save_pretrained(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)
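
The quantized weights and the BitsAndBytes settings are stored with the checkpoint (4-bit serialization requires a recent bitsandbytes release), so the saved model can be reloaded without repeating the quantization config. A minimal sanity check, assuming the same SAVE_PATH placeholder as above:

from transformers import AutoProcessor, Glm4vForConditionalGeneration

SAVE_PATH = "/define/the/path/to/save/model"  # same placeholder as above

# The 4-bit settings saved in config.json are picked up automatically,
# so no quantization_config needs to be passed here.
model = Glm4vForConditionalGeneration.from_pretrained(SAVE_PATH, device_map="auto")
processor = AutoProcessor.from_pretrained(SAVE_PATH, use_fast=True)

# Approximate on-device size of the quantized weights, in GiB.
print(f"Model footprint: {model.get_memory_footprint() / 1024**3:.2f} GiB")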

Serve the model with Transformers (FastAPI)
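The script below wraps the quantized model in a minimal FastAPI server that exposes an OpenAI-style /v1/completions endpoint accepting a text prompt and an optional base64-encoded image.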

# Install the necessary libraries
pip install fastapi uvicorn pydantic
# Run the following script to set up the server
from transformers import AutoProcessor, Glm4vForConditionalGeneration
from fastapi import FastAPI
from pydantic import BaseModel
import torch
import base64
from PIL import Image
import io

app = FastAPI()
MODEL_PATH = "/path/to/the/saved/quantized/model"  # the SAVE_PATH used in the quantization step

model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    quantization_config={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16}
)
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)

class CompletionRequest(BaseModel):
    prompt: str
    image: str | None = None
    max_tokens: int = 50

@app.post("/v1/completions")
async def generate_completion(request: CompletionRequest):
    messages = [{"role": "user", "content": [{"type": "text", "text": request.prompt}]}]    
    if request.image:
        try:
            image_data = base64.b64decode(request.image)
            image = Image.open(io.BytesIO(image_data))

            messages[0]["content"].append({"type": "image", "image": image})
        except Exception as e:
            return {"error": f"Failed to process image: {str(e)}"}
    
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)
    
    inputs = {k: v.to(torch.bfloat16) if v.dtype.is_floating_point else v for k, v in inputs.items()}
    
    generated_ids = model.generate(**inputs, max_new_tokens=request.max_tokens)
    output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
    
    return {"choices": [{"text": output_text}]}

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
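
To start the server, run the script directly (the file name server.py below is only an example), or launch it with uvicorn:

python server.py
# or equivalently
uvicorn server:app --host 0.0.0.0 --port 8000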

Script to test the vision model
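The client encodes a local image to base64 and posts it to the running server together with a prompt.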

import base64
import requests
import json
import os

def encode_image_to_base64(image_path):
    """Encode an image file to base64 string."""
    try:
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("utf-8")
            return encoded
    except FileNotFoundError:
        print(f"Error: Image file '{image_path}' not found.")
        return None
    except Exception as e:
        print(f"Error encoding image: {str(e)}")
        return None

def send_request(base64_image):
    """Send base64 image to vision model server using requests."""
    try:
        payload = {
            "prompt": "Describe this image in detail.",
            "image": base64_image,
            "max_tokens": 300
        }
        
        response = requests.post(
            "http://localhost:8000/v1/completions",
            headers={"Content-Type": "application/json"},
            json=payload
        )
        
        if response.status_code == 200:
            print("Server response:")
            print(response.text)
        else:
            print(f"Request failed with status code {response.status_code}:")
            print(response.text)
            
    except requests.RequestException as e:
        print(f"Error sending request: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")

def main():
    image_path = "/path/to/sample.jpeg"
    
    base64_image = encode_image_to_base64(image_path)
    if base64_image:
        send_request(base64_image)

if __name__ == "__main__":
    main()
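
Since the image field is optional, the endpoint can also be smoke-tested with a text-only prompt. A minimal sketch using requests (the prompt and URL are placeholders):

import requests

# Text-only request: the server only attaches an image when the field is present.
payload = {"prompt": "Briefly introduce yourself.", "max_tokens": 100}
response = requests.post("http://localhost:8000/v1/completions", json=payload)
print(response.json()["choices"][0]["text"])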

The quantized model can be loaded on a single GPU with more than 12 GB of VRAM (tested on a Tesla T10).

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T10                      Off |   00000000:01:00.0 Off |                  Off |
| N/A   46C    P0             39W /  150W |    7594MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A            6824      G   /usr/lib/xorg/Xorg                        4MiB |
|    0   N/A  N/A           12956      C   python                                 7586MiB |
+-----------------------------------------------------------------------------------------+