# 4-bit Quantization of GLM-4.1V-9B

## Quantization Methods

The script below loads GLM-4.1V-9B-Thinking in 4-bit NF4 with bitsandbytes and saves the quantized weights and the processor to disk:
```python
import torch
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration

MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
SAVE_PATH = "/define/the/path/to/save/model"

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
model = Glm4vForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    quantization_config=quant_config,
    device_map="auto",
)

# Persist the quantized weights and the processor
model.save_pretrained(SAVE_PATH)
processor.save_pretrained(SAVE_PATH)
```
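To confirm that the exported checkpoint reloads and generates before wiring it into a server, a quick text-only round trip is usually enough. This is a minimal sketch, assuming the `SAVE_PATH` directory from the script above and an arbitrary prompt; the serialized config already records the 4-bit settings, so no `quantization_config` is passed when reloading.

```python
# Sanity check: reload the saved 4-bit checkpoint and generate a short reply.
# SAVE_PATH is assumed to be the directory used in the quantization script above.
import torch
from transformers import AutoProcessor, Glm4vForConditionalGeneration

SAVE_PATH = "/define/the/path/to/save/model"

processor = AutoProcessor.from_pretrained(SAVE_PATH, use_fast=True)
model = Glm4vForConditionalGeneration.from_pretrained(SAVE_PATH, device_map="auto")

messages = [{"role": "user", "content": [{"type": "text", "text": "Say hello in one sentence."}]}]
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=32)
print(processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```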
## Serve the model with Transformers (FastAPI)

Install the serving dependencies (Pillow is needed to decode the uploaded images):

```bash
pip install fastapi uvicorn pydantic pillow
```

Run the following script to start the server:
```python
import base64
import io

import torch
from fastapi import FastAPI
from PIL import Image
from pydantic import BaseModel
from transformers import AutoProcessor, BitsAndBytesConfig, Glm4vForConditionalGeneration

app = FastAPI()

# Directory containing the quantized checkpoint saved in the quantization step (SAVE_PATH)
MODEL_PATH = "/define/the/path/to/save/model"

model = Glm4vForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
)
processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)


class CompletionRequest(BaseModel):
    prompt: str
    image: str | None = None  # optional base64-encoded image
    max_tokens: int = 50


@app.post("/v1/completions")
async def generate_completion(request: CompletionRequest):
    messages = [{"role": "user", "content": [{"type": "text", "text": request.prompt}]}]
    if request.image:
        try:
            image_data = base64.b64decode(request.image)
            image = Image.open(io.BytesIO(image_data))
            messages[0]["content"].append({"type": "image", "image": image})
        except Exception as e:
            return {"error": f"Failed to process image: {str(e)}"}

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    # Cast floating-point tensors (e.g. pixel values) to bfloat16 to match the compute dtype
    inputs = {k: v.to(torch.bfloat16) if v.dtype.is_floating_point else v for k, v in inputs.items()}

    generated_ids = model.generate(**inputs, max_new_tokens=request.max_tokens)
    output_text = processor.decode(
        generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False
    )
    return {"choices": [{"text": output_text}]}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
```
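Before sending images, it can help to sanity-check the endpoint with a text-only prompt. This is a minimal sketch, assuming the server above is running locally on port 8000; the prompt and timeout are arbitrary.

```python
# Text-only request against the /v1/completions endpoint defined above.
# Assumes the server is running locally on port 8000.
import requests

resp = requests.post(
    "http://localhost:8000/v1/completions",
    json={"prompt": "What can you do?", "max_tokens": 64},
    timeout=300,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])
```

The script below extends this to an image request.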
## Script to test the vision model
```python
import base64

import requests


def encode_image_to_base64(image_path):
    """Encode an image file to a base64 string."""
    try:
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")
    except FileNotFoundError:
        print(f"Error: Image file '{image_path}' not found.")
        return None
    except Exception as e:
        print(f"Error encoding image: {str(e)}")
        return None


def send_request(base64_image):
    """Send a base64-encoded image to the vision model server."""
    try:
        payload = {
            "prompt": "Describe this image in detail.",
            "image": base64_image,
            "max_tokens": 300,
        }
        response = requests.post(
            "http://localhost:8000/v1/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
        )
        if response.status_code == 200:
            print("Server response:")
            print(response.text)
        else:
            print(f"Request failed with status code {response.status_code}:")
            print(response.text)
    except requests.RequestException as e:
        print(f"Error sending request: {str(e)}")
    except Exception as e:
        print(f"Unexpected error: {str(e)}")


def main():
    image_path = "/path/to/sample.jpeg"
    base64_image = encode_image_to_base64(image_path)
    if base64_image:
        send_request(base64_image)


if __name__ == "__main__":
    main()
```
The quantized model can be loaded on a single GPU with more than 12 GB of VRAM (tested on a Tesla T10, 16 GB); a snippet for checking the footprint from Python follows the nvidia-smi output below.
```
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla T10                      Off |   00000000:01:00.0 Off |                  Off |
| N/A   46C    P0             39W / 150W  |    7594MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes:                                                                               |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                                    Usage |
|=========================================================================================|
|    0   N/A  N/A            6824      G   /usr/lib/xorg/Xorg                        4MiB |
|    0   N/A  N/A           12956      C   python                                 7586MiB |
+-----------------------------------------------------------------------------------------+
```
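If you want to verify the footprint on your own hardware from inside the Python process, `torch.cuda` exposes per-device memory counters. A small sketch, to run after the model has been loaded; note that nvidia-smi also counts the CUDA context and other framework overhead, so its figure is usually somewhat higher than what torch reports.

```python
# Report how much GPU memory the loaded 4-bit model occupies (run after loading the model).
import torch

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(i) / 1024**2
        reserved = torch.cuda.memory_reserved(i) / 1024**2
        print(f"GPU {i}: {allocated:.0f} MiB allocated, {reserved:.0f} MiB reserved")
```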