I could not get it to run — please check my steps below.

#1
by alexchenyu - opened

:$ mkdir ds31-env
:
$ cd ds31-env
:/ds31-env$ ls
:
/ds31-env$ uv venv dsv31-uv-env
Using CPython 3.10.12 interpreter at: /usr/bin/python3
Creating virtual environment at: dsv31-uv-env
Activate with: source dsv31-uv-env/bin/activate
:/ds31-env$ ^C
:
/ds31-env$ source dsv31-uv-env/bin/activate
(dsv31-uv-env) :~/ds31-env$ uv pip install vllm==0.9.0
Using Python 3.10.12 environment at: dsv31-uv-env
Resolved 149 packages in 455ms
Prepared 8 packages in 13.96s
Installed 149 packages in 326ms

  • aiohappyeyeballs==2.6.1
  • aiohttp==3.12.15
  • aiosignal==1.4.0
  • airportsdata==20250811
  • annotated-types==0.7.0
  • anyio==4.10.0
  • astor==0.8.1
  • async-timeout==5.0.1
  • attrs==25.3.0
  • blake3==1.0.5
  • cachetools==6.2.0
  • certifi==2025.8.3
  • charset-normalizer==3.4.3
  • click==8.2.1
  • cloudpickle==3.1.1
  • compressed-tensors==0.9.4
  • cupy-cuda12x==13.6.0
  • depyf==0.18.0
  • dill==0.4.0
  • diskcache==5.6.3
  • distro==1.9.0
  • dnspython==2.7.0
  • einops==0.8.1
  • email-validator==2.2.0
  • exceptiongroup==1.3.0
  • fastapi==0.116.1
  • fastapi-cli==0.0.8
  • fastapi-cloud-cli==0.1.5
  • fastrlock==0.8.3
  • filelock==3.19.1
  • frozenlist==1.7.0
  • fsspec==2025.7.0
  • gguf==0.17.1
  • googleapis-common-protos==1.70.0
  • grpcio==1.74.0
  • h11==0.16.0
  • hf-xet==1.1.8
  • httpcore==1.0.9
  • httptools==0.6.4
  • httpx==0.28.1
  • huggingface-hub==0.34.4
  • idna==3.10
  • importlib-metadata==8.7.0
  • interegular==0.3.3
  • jinja2==3.1.6
  • jiter==0.10.0
  • jsonschema==4.25.1
  • jsonschema-specifications==2025.4.1
  • lark==1.2.2
  • llguidance==0.7.30
  • llvmlite==0.44.0
  • lm-format-enforcer==0.10.12
  • markdown-it-py==4.0.0
  • markupsafe==3.0.2
  • mdurl==0.1.2
  • mistral-common==1.8.4
  • mpmath==1.3.0
  • msgpack==1.1.1
  • msgspec==0.19.0
  • multidict==6.6.4
  • nest-asyncio==1.6.0
  • networkx==3.4.2
  • ninja==1.13.0
  • numba==0.61.2
  • numpy==2.2.6
  • nvidia-cublas-cu12==12.6.4.1
  • nvidia-cuda-cupti-cu12==12.6.80
  • nvidia-cuda-nvrtc-cu12==12.6.77
  • nvidia-cuda-runtime-cu12==12.6.77
  • nvidia-cudnn-cu12==9.5.1.17
  • nvidia-cufft-cu12==11.3.0.4
  • nvidia-cufile-cu12==1.11.1.6
  • nvidia-curand-cu12==10.3.7.77
  • nvidia-cusolver-cu12==11.7.1.2
  • nvidia-cusparse-cu12==12.5.4.2
  • nvidia-cusparselt-cu12==0.6.3
  • nvidia-nccl-cu12==2.26.2
  • nvidia-nvjitlink-cu12==12.6.85
  • nvidia-nvtx-cu12==12.6.77
  • openai==1.101.0
  • opencv-python-headless==4.12.0.88
  • opentelemetry-api==1.36.0
  • opentelemetry-exporter-otlp==1.36.0
  • opentelemetry-exporter-otlp-proto-common==1.36.0
  • opentelemetry-exporter-otlp-proto-grpc==1.36.0
  • opentelemetry-exporter-otlp-proto-http==1.36.0
  • opentelemetry-proto==1.36.0
  • opentelemetry-sdk==1.36.0
  • opentelemetry-semantic-conventions==0.57b0
  • opentelemetry-semantic-conventions-ai==0.4.13
  • outlines==0.1.11
  • outlines-core==0.1.26
  • packaging==25.0
  • partial-json-parser==0.2.1.1.post6
  • pillow==11.3.0
  • prometheus-client==0.22.1
  • prometheus-fastapi-instrumentator==7.1.0
  • propcache==0.3.2
  • protobuf==6.32.0
  • psutil==7.0.0
  • py-cpuinfo==9.0.0
  • pycountry==24.6.1
  • pydantic==2.11.7
  • pydantic-core==2.33.2
  • pydantic-extra-types==2.10.5
  • pygments==2.19.2
  • python-dotenv==1.1.1
  • python-json-logger==3.3.0
  • python-multipart==0.0.20
  • pyyaml==6.0.2
  • pyzmq==27.0.2
  • ray==2.48.0
  • referencing==0.36.2
  • regex==2025.7.34
  • requests==2.32.5
  • rich==14.1.0
  • rich-toolkit==0.15.0
  • rignore==0.6.4
  • rpds-py==0.27.0
  • safetensors==0.6.2
  • scipy==1.15.3
  • sentencepiece==0.2.1
  • sentry-sdk==2.35.0
  • setuptools==80.9.0
  • shellingham==1.5.4
  • sniffio==1.3.1
  • starlette==0.47.3
  • sympy==1.14.0
  • tiktoken==0.11.0
  • tokenizers==0.21.4
  • torch==2.7.0
  • torchaudio==2.7.0
  • torchvision==0.22.0
  • tqdm==4.67.1
  • transformers==4.55.4
  • triton==3.3.0
  • typer==0.16.1
  • typing-extensions==4.15.0
  • typing-inspection==0.4.1
  • urllib3==2.5.0
  • uvicorn==0.35.0
  • uvloop==0.21.0
  • vllm==0.9.0
  • watchfiles==1.1.0
  • websockets==15.0.1
  • xformers==0.0.30
  • xgrammar==0.1.19
  • yarl==1.20.1
  • zipp==3.23.0
    (dsv31-uv-env) :~/ds31-env$ uv pip install transformers==4.53
    Using Python 3.10.12 environment at: dsv31-uv-env
    Resolved 18 packages in 26ms
    Uninstalled 1 package in 69ms
    Installed 1 package in 192ms
  • transformers==4.55.4
  • transformers==4.53.0
    (dsv31-uv-env) :/ds31-env$ uv pip install huggingface_hub
    Using Python 3.10.12 environment at: dsv31-uv-env
    Audited 1 package in 6ms
    (dsv31-uv-env) :
    /ds31-env$ huggingface-cli download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ --local-dir-use-symlinks False
    /home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/huggingface_hub/commands/download.py:141: FutureWarning: Ignoring --local-dir-use-symlinks. Downloading to a local directory does not use symlinks anymore.
    warnings.warn(
    ⚠️ Warning: 'huggingface-cli download' is deprecated. Use 'hf download' instead.
    Fetching 151 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:00<00:00, 901.00it/s]
    /home/user/models/DeepSeek-V3.1-AWQ
    (dsv31-uv-env) :
    /ds31-env$ SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
    (dsv31-uv-env) :/ds31-env$ cp ~/models/DeepSeek-V3.1-AWQ/awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/utils/awq_marlin.py"
    (dsv31-uv-env) :
    /ds31-env$ hf download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ --local-dir-use-symlinks False
    usage: hf []
    hf: error: unrecognized arguments: --local-dir-use-symlinks False
    (dsv31-uv-env) :
    /ds31-env$ hf download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ
    Fetching 151 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:00<00:00, 967.36it/s]
    /home/user/models/DeepSeek-V3.1-AWQ
    (dsv31-uv-env) :
    /ds31-env$ vllm serve "$HOME/models/DeepSeek-V3.1-AWQ"
    --served-model-name DeepSeek-V3.1-AWQ
    --swap-space 16
    --tensor-parallel-size 8
    --tool-call-parser deepseek_v3
    --enable-expert-parallel
    --enable-auto-tool-choice
    --max-model-len 131072
    --max-seq-len-to-capture 131072
    --enforce-eager
    --max-num-seqs 2
    --trust-remote-code
    --kv-cache-dtype auto
    --gpu-memory-utilization 0.85
    --disable-log-requests
    INFO 08-25 14:16:26 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:16:53 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:16:53 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:16:53 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:16:54 [api_server.py:1289] vLLM API server version 0.9.0
    INFO 08-25 14:16:55 [cli_args.py:300] non-default args: {'api_key': 'eyJhIjoiYmI5ZW', 'enable_auto_tool_choice': True, 'tool_call_parser': 'deepseek_v3', 'trust_remote_code': True, 'max_model_len': 131072, 'enforce_eager': True, 'max_seq_len_to_capture': 131072, 'served_model_name': ['DeepSeek-V3.1-AWQ'], 'tensor_parallel_size': 8, 'enable_expert_parallel': True, 'gpu_memory_utilization': 0.85, 'swap_space': 16.0, 'max_num_seqs': 2, 'disable_log_requests': True}
    INFO 08-25 14:16:55 [config.py:213] Replacing legacy 'type' key with 'rope_type'
    INFO 08-25 14:17:01 [config.py:793] This model supports multiple tasks: {'reward', 'classify', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
    INFO 08-25 14:17:01 [awq_marlin.py:115] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
    INFO 08-25 14:17:01 [config.py:1875] Defaulting to use mp for distributed inference
    INFO 08-25 14:17:01 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=8192.
    WARNING 08-25 14:17:01 [cuda.py:87] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
    INFO 08-25 14:17:01 [cuda.py:151] Forcing kv cache block size to 64 for FlashMLA backend.
    INFO 08-25 14:17:05 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:07 [core.py:438] Waiting for init message from front-end.
    INFO 08-25 14:17:07 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:07 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:07 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:07 [core.py:65] Initializing a V1 LLM engine (v0.9.0) with config: model='/home/user/models/DeepSeek-V3.1-AWQ', speculative_config=None, tokenizer='/home/user/models/DeepSeek-V3.1-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=DeepSeek-V3.1-AWQ, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=False, pooler_config=None, compilation_config={"compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "cudagraph_capture_sizes": [], "max_capture_size": 0}
    WARNING 08-25 14:17:07 [multiproc_worker_utils.py:306] Reducing Torch parallelism from 104 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
    INFO 08-25 14:17:07 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3, 4, 5, 6, 7], buffer_handle=(8, 10485760, 10, 'psm_d25f6886'), local_subscribe_addr='ipc:///tmp/5f995bb2-6804-4fb2-b318-17cea2e87526', remote_subscribe_addr=None, remote_addr_ipv6=False)
    INFO 08-25 14:17:10 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:10 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7eff9bfc3460>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f88c43b5b70>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f118e4afd90>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f4a995c7d60>
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_e02888cd'), local_subscribe_addr='ipc:///tmp/23da0016-24b4-4b72-b285-71f399e7f466', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_eeeed842'), local_subscribe_addr='ipc:///tmp/1ded14f2-5494-41f8-b5d0-f2317c547471', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_ec314e1f'), local_subscribe_addr='ipc:///tmp/b703ad44-cfce-44dd-ae43-265b557ad75e', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_36a30e49'), local_subscribe_addr='ipc:///tmp/85ab75c0-35f6-4024-a714-10a5637fe751', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7efdfcc37d30>
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_ba8eef96'), local_subscribe_addr='ipc:///tmp/faefe075-cb3b-4f46-b749-3af1b0b0b908', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f3323ca8880>
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_849b1dcf'), local_subscribe_addr='ipc:///tmp/3a193b53-45fb-4d9f-9893-d00fcdba49bb', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:16 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7fbbb09afd90>
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:16 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_72f861c6'), local_subscribe_addr='ipc:///tmp/9fae716a-c25c-4ed2-ab33-dccfb2397a41', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:16 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f7c7b133d90>
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:16 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_b0f06d40'), local_subscribe_addr='ipc:///tmp/3b9fafee-8558-4040-967f-69ed9b092e5f', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:24 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_0b30b0b1'), local_subscribe_addr='ipc:///tmp/11fd5b6d-549d-453d-b0db-48adaa7c5b8c', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 1 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 1, EP rank 1
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 4 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 4, EP rank 4
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 3 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 3, EP rank 3
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 2 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 2, EP rank 2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 7 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 7, EP rank 7
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 6 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 6, EP rank 6
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 5 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 5, EP rank 5
    (VllmWorker rank=1 pid=28519) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=7 pid=28525) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=5 pid=28523) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=0 pid=28518) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=3 pid=28521) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=4 pid=28522) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=2 pid=28520) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=7 pid=28525) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=1 pid=28519) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=5 pid=28523) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=0 pid=28518) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=3 pid=28521) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=2 pid=28520) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=4 pid=28522) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    Loading safetensors checkpoint shards: 0% Completed | 0/134 [00:00<?, ?it/s]
    Loading safetensors checkpoint shards: 1% Completed | 1/134 [00:00<00:16, 8.08it/s]
    Loading safetensors checkpoint shards: 1% Completed | 2/134 [00:00<00:27, 4.79it/s]
    Loading safetensors checkpoint shards: 2% Completed | 3/134 [00:00<00:32, 4.08it/s]
    Loading safetensors checkpoint shards: 3% Completed | 4/134 [00:00<00:33, 3.88it/s]
    Loading safetensors checkpoint shards: 4% Completed | 5/134 [00:01<00:27, 4.77it/s]
    Loading safetensors checkpoint shards: 4% Completed | 6/134 [00:01<00:23, 5.51it/s]
    Loading safetensors checkpoint shards: 5% Completed | 7/134 [00:01<00:20, 6.18it/s]
    Loading safetensors checkpoint shards: 6% Completed | 8/134 [00:01<00:23, 5.41it/s]
    Loading safetensors checkpoint shards: 7% Completed | 9/134 [00:01<00:26, 4.76it/s]
    Loading safetensors checkpoint shards: 7% Completed | 10/134 [00:01<00:23, 5.35it/s]
    Loading safetensors checkpoint shards: 8% Completed | 11/134 [00:02<00:20, 6.03it/s]
    Loading safetensors checkpoint shards: 9% Completed | 12/134 [00:02<00:23, 5.11it/s]
    Loading safetensors checkpoint shards: 10% Completed | 13/134 [00:02<00:25, 4.66it/s]
    Loading safetensors checkpoint shards: 10% Completed | 14/134 [00:02<00:23, 5.16it/s]
    Loading safetensors checkpoint shards: 12% Completed | 16/134 [00:02<00:15, 7.49it/s]
    Loading safetensors checkpoint shards: 13% Completed | 17/134 [00:03<00:15, 7.65it/s]
    Loading safetensors checkpoint shards: 13% Completed | 18/134 [00:03<00:18, 6.21it/s]
    Loading safetensors checkpoint shards: 15% Completed | 20/134 [00:03<00:13, 8.27it/s]
    Loading safetensors checkpoint shards: 16% Completed | 21/134 [00:03<00:17, 6.52it/s]
    Loading safetensors checkpoint shards: 16% Completed | 22/134 [00:03<00:20, 5.49it/s]
    Loading safetensors checkpoint shards: 18% Completed | 24/134 [00:04<00:15, 6.99it/s]
    Loading safetensors checkpoint shards: 20% Completed | 27/134 [00:04<00:12, 8.58it/s]
    Loading safetensors checkpoint shards: 21% Completed | 28/134 [00:04<00:13, 7.64it/s]
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] WorkerProc failed to start.
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] Traceback (most recent call last):
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 461, in worker_main
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] worker = WorkerProc(*args, **kwargs)
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in __init__
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.worker.load_model()
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 164, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model_runner.load_model()
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1534, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model = get_model(vllm_config=self.vllm_config)
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 58, in get_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return loader.load_model(vllm_config=vllm_config,
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 277, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] loaded_weights = model.load_weights(
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_v2.py", line 800, in load_weights
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] param = params_dict[name]
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] KeyError: 'model.layers.60.mlp.experts.w2_weight'
    Loading safetensors checkpoint shards: 21% Completed | 28/134 [00:04<00:18, 5.61it/s]
    (VllmWorker rank=0 pid=28518)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] WorkerProc failed to start.
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] Traceback (most recent call last):
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 461, in worker_main
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] worker = WorkerProc(*args, **kwargs)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in __init__
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.worker.load_model()
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 164, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model_runner.load_model()
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1534, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model = get_model(vllm_config=self.vllm_config)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 58, in get_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return loader.load_model(vllm_config=vllm_config,
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 277, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] loaded_weights = model.load_weights(
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_v2.py", line 758, in load_weights
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] for name, loaded_weight in weights:
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 252, in get_all_weights
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] yield from self._get_weights_iterator(primary_weights)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 235, in
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return ((source.prefix + name, tensor)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/weight_utils.py", line 470, in safetensors_weights_iterator
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] param = f.get_tensor(name)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'
    [rank0]:[W825 14:17:31.025692350 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
    ERROR 08-25 14:17:33 [core.py:500] EngineCore failed to start.
    ERROR 08-25 14:17:33 [core.py:500] Traceback (most recent call last):
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
    ERROR 08-25 14:17:33 [core.py:500] engine_core = EngineCoreProc(*args, **kwargs)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in init
    ERROR 08-25 14:17:33 [core.py:500] super().init(vllm_config, executor_class, log_stats,
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 71, in init
    ERROR 08-25 14:17:33 [core.py:500] self.model_executor = executor_class(vllm_config)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in init
    ERROR 08-25 14:17:33 [core.py:500] self._init_executor()
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 93, in _init_executor
    ERROR 08-25 14:17:33 [core.py:500] self.workers = WorkerProc.wait_for_ready(unready_workers)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 422, in wait_for_ready
    ERROR 08-25 14:17:33 [core.py:500] raise e from None
    ERROR 08-25 14:17:33 [core.py:500] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
    Process EngineCore_0:
    Traceback (most recent call last):
    File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
    File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 504, in run_engine_core
    raise e
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in init
    super().init(vllm_config, executor_class, log_stats,
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 71, in init
    self.model_executor = executor_class(vllm_config)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in init
    self._init_executor()
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 93, in _init_executor
    self.workers = WorkerProc.wait_for_ready(unready_workers)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 422, in wait_for_ready
    raise e from None
    Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
    Traceback (most recent call last):
    File "/home/user/ds31-env/dsv31-uv-env/bin/vllm", line 10, in
    sys.exit(main())
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 56, in main
    args.dispatch_function(args)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 42, in cmd
    uvloop.run(run_server(args))
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/uvloop/init.py", line 82, in run
    return loop.run_until_complete(wrapper())
    File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/uvloop/init.py", line 61, in wrapper
    return await main
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1324, in run_server
    async with build_async_engine_client(args) as engine_client:
    File "/usr/lib/python3.10/contextlib.py", line 199, in aenter
    return await anext(self.gen)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 153, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
    File "/usr/lib/python3.10/contextlib.py", line 199, in aenter
    return await anext(self.gen)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 185, in build_async_engine_client_from_engine_args
    async_llm = AsyncLLM.from_vllm_config(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 157, in from_vllm_config
    return cls(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 123, in init
    self.engine_core = core_client_class(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 734, in init
    super().init(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 418, in init
    self._wait_for_engine_startup(output_address, parallel_config)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 484, in _wait_for_engine_startup
    raise RuntimeError("Engine core initialization failed. "
    RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
    /usr/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
    warnings.warn('resource_tracker: There appear to be %d '
    (dsv31-uv-env) :~/ds31-env$
QuantTrio org
edited Aug 26

Your transformers version is too high; you need to follow the installation steps in the README.md.

# ❗there are glitches with vllm 0.10.1.1, still looking for resolutions❗
# ❗downgrade vllm for now ❗
pip install vllm==0.9.0
pip install transformers==4.53

# ❗patch up AWQ MoE quant config, otherwise some modules cannot be properly loaded❗
SITE_PACKAGES=$(pip -V | awk '{print $4}' | sed 's/\/pip$//')
cp awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/awq_marlin.py"

The patch fixes the AWQ `modules_to_not_convert` issue, which has already been merged into a later version of vLLM.
For now, we need to manually copy the file.

Your transformers version is too high; you need to follow the installation steps in the README.md.

# ❗there are glitches with vllm 0.10.1.1, still looking for resolutions❗
# ❗downgrade vllm for now ❗
pip install vllm==0.9.0
pip install transformers==4.53

# ❗patch up AWQ MoE quant config, otherwise some modules cannot be properly loaded❗
SITE_PACKAGES=$(pip -V | awk '{print $4}' | sed 's/\/pip$//')
cp awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/awq_marlin.py"

The patch fixes the AWQ `modules_to_not_convert` issue, which has already been merged into a later version of vLLM.
For now, we need to manually copy the file.

Thanks for the tip @tclf90 . I had been trying for an hour, playing with different vLLM versions and additional parameters, but without the 'awq_marlin' patch I guess it's unsolvable :-)

I recommend putting these important instructions at the top of the README to make them more noticeable.

image.png

it is working now, thanks

Sign up or log in to comment