# Launch a vLLM OpenAI-compatible server for the "medguide-v" model.
#
# Usage:
#   MODEL_PATH=/path/to/checkpoint sh <this script>
# or edit the MODEL_PATH default below.

# Expose four GPUs to the process; this count must match
# --tensor-parallel-size below.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# Permit --max-model-len to be set explicitly even if it differs from the
# length vLLM derives from the model config (see vLLM env-var docs).
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
# NOTE(review): presumably pins the legacy (V0) engine; confirm this is still
# required for the installed vLLM version.
export VLLM_USE_V1=0

# Model checkpoint directory or model identifier. Taken from the environment
# when set; otherwise fill in the default here.
MODEL_PATH="${MODEL_PATH:-}"
# Fail fast with a clear message instead of starting vLLM without a model.
: "${MODEL_PATH:?MODEL_PATH is empty: set it to the model checkpoint path or id}"

# NOTE(review): the API key is hard-coded and visible in the process list;
# consider supplying it from the environment for non-local deployments.
# --limit-mm-per-prompt: up to 6 images and no video per request.
# --generation-config vllm: presumably uses vLLM's default sampling
# parameters rather than the model's own generation config (confirm in docs).
vllm serve "$MODEL_PATH" \
  --served-model-name medguide-v \
  --port 8232 \
  --host 0.0.0.0 \
  --dtype bfloat16 \
  --limit-mm-per-prompt image=6,video=0 \
  --max-model-len 12000 \
  --gpu-memory-utilization 0.9 \
  --tensor-parallel-size 4 \
  --api-key medguide-v \
  --generation-config vllm