I could not get it to run — please check my steps below.

#1
by alexchenyu - opened

:$ mkdir ds31-env
:
$ cd ds31-env
:/ds31-env$ ls
:
/ds31-env$ uv venv dsv31-uv-env
Using CPython 3.10.12 interpreter at: /usr/bin/python3
Creating virtual environment at: dsv31-uv-env
Activate with: source dsv31-uv-env/bin/activate
:/ds31-env$ ^C
:
/ds31-env$ source dsv31-uv-env/bin/activate
(dsv31-uv-env) :~/ds31-env$ uv pip install vllm==0.9.0
Using Python 3.10.12 environment at: dsv31-uv-env
Resolved 149 packages in 455ms
Prepared 8 packages in 13.96s
Installed 149 packages in 326ms

  • aiohappyeyeballs==2.6.1
  • aiohttp==3.12.15
  • aiosignal==1.4.0
  • airportsdata==20250811
  • annotated-types==0.7.0
  • anyio==4.10.0
  • astor==0.8.1
  • async-timeout==5.0.1
  • attrs==25.3.0
  • blake3==1.0.5
  • cachetools==6.2.0
  • certifi==2025.8.3
  • charset-normalizer==3.4.3
  • click==8.2.1
  • cloudpickle==3.1.1
  • compressed-tensors==0.9.4
  • cupy-cuda12x==13.6.0
  • depyf==0.18.0
  • dill==0.4.0
  • diskcache==5.6.3
  • distro==1.9.0
  • dnspython==2.7.0
  • einops==0.8.1
  • email-validator==2.2.0
  • exceptiongroup==1.3.0
  • fastapi==0.116.1
  • fastapi-cli==0.0.8
  • fastapi-cloud-cli==0.1.5
  • fastrlock==0.8.3
  • filelock==3.19.1
  • frozenlist==1.7.0
  • fsspec==2025.7.0
  • gguf==0.17.1
  • googleapis-common-protos==1.70.0
  • grpcio==1.74.0
  • h11==0.16.0
  • hf-xet==1.1.8
  • httpcore==1.0.9
  • httptools==0.6.4
  • httpx==0.28.1
  • huggingface-hub==0.34.4
  • idna==3.10
  • importlib-metadata==8.7.0
  • interegular==0.3.3
  • jinja2==3.1.6
  • jiter==0.10.0
  • jsonschema==4.25.1
  • jsonschema-specifications==2025.4.1
  • lark==1.2.2
  • llguidance==0.7.30
  • llvmlite==0.44.0
  • lm-format-enforcer==0.10.12
  • markdown-it-py==4.0.0
  • markupsafe==3.0.2
  • mdurl==0.1.2
  • mistral-common==1.8.4
  • mpmath==1.3.0
  • msgpack==1.1.1
  • msgspec==0.19.0
  • multidict==6.6.4
  • nest-asyncio==1.6.0
  • networkx==3.4.2
  • ninja==1.13.0
  • numba==0.61.2
  • numpy==2.2.6
  • nvidia-cublas-cu12==12.6.4.1
  • nvidia-cuda-cupti-cu12==12.6.80
  • nvidia-cuda-nvrtc-cu12==12.6.77
  • nvidia-cuda-runtime-cu12==12.6.77
  • nvidia-cudnn-cu12==9.5.1.17
  • nvidia-cufft-cu12==11.3.0.4
  • nvidia-cufile-cu12==1.11.1.6
  • nvidia-curand-cu12==10.3.7.77
  • nvidia-cusolver-cu12==11.7.1.2
  • nvidia-cusparse-cu12==12.5.4.2
  • nvidia-cusparselt-cu12==0.6.3
  • nvidia-nccl-cu12==2.26.2
  • nvidia-nvjitlink-cu12==12.6.85
  • nvidia-nvtx-cu12==12.6.77
  • openai==1.101.0
  • opencv-python-headless==4.12.0.88
  • opentelemetry-api==1.36.0
  • opentelemetry-exporter-otlp==1.36.0
  • opentelemetry-exporter-otlp-proto-common==1.36.0
  • opentelemetry-exporter-otlp-proto-grpc==1.36.0
  • opentelemetry-exporter-otlp-proto-http==1.36.0
  • opentelemetry-proto==1.36.0
  • opentelemetry-sdk==1.36.0
  • opentelemetry-semantic-conventions==0.57b0
  • opentelemetry-semantic-conventions-ai==0.4.13
  • outlines==0.1.11
  • outlines-core==0.1.26
  • packaging==25.0
  • partial-json-parser==0.2.1.1.post6
  • pillow==11.3.0
  • prometheus-client==0.22.1
  • prometheus-fastapi-instrumentator==7.1.0
  • propcache==0.3.2
  • protobuf==6.32.0
  • psutil==7.0.0
  • py-cpuinfo==9.0.0
  • pycountry==24.6.1
  • pydantic==2.11.7
  • pydantic-core==2.33.2
  • pydantic-extra-types==2.10.5
  • pygments==2.19.2
  • python-dotenv==1.1.1
  • python-json-logger==3.3.0
  • python-multipart==0.0.20
  • pyyaml==6.0.2
  • pyzmq==27.0.2
  • ray==2.48.0
  • referencing==0.36.2
  • regex==2025.7.34
  • requests==2.32.5
  • rich==14.1.0
  • rich-toolkit==0.15.0
  • rignore==0.6.4
  • rpds-py==0.27.0
  • safetensors==0.6.2
  • scipy==1.15.3
  • sentencepiece==0.2.1
  • sentry-sdk==2.35.0
  • setuptools==80.9.0
  • shellingham==1.5.4
  • sniffio==1.3.1
  • starlette==0.47.3
  • sympy==1.14.0
  • tiktoken==0.11.0
  • tokenizers==0.21.4
  • torch==2.7.0
  • torchaudio==2.7.0
  • torchvision==0.22.0
  • tqdm==4.67.1
  • transformers==4.55.4
  • triton==3.3.0
  • typer==0.16.1
  • typing-extensions==4.15.0
  • typing-inspection==0.4.1
  • urllib3==2.5.0
  • uvicorn==0.35.0
  • uvloop==0.21.0
  • vllm==0.9.0
  • watchfiles==1.1.0
  • websockets==15.0.1
  • xformers==0.0.30
  • xgrammar==0.1.19
  • yarl==1.20.1
  • zipp==3.23.0
    (dsv31-uv-env) :~/ds31-env$ uv pip install transformers==4.53
    Using Python 3.10.12 environment at: dsv31-uv-env
    Resolved 18 packages in 26ms
    Uninstalled 1 package in 69ms
    Installed 1 package in 192ms
  • transformers==4.55.4
  • transformers==4.53.0
    (dsv31-uv-env) :/ds31-env$ uv pip install huggingface_hub
    Using Python 3.10.12 environment at: dsv31-uv-env
    Audited 1 package in 6ms
    (dsv31-uv-env) :
    /ds31-env$ huggingface-cli download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ --local-dir-use-symlinks False
    /home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/huggingface_hub/commands/download.py:141: FutureWarning: Ignoring --local-dir-use-symlinks. Downloading to a local directory does not use symlinks anymore.
    warnings.warn(
    ⚠️ Warning: 'huggingface-cli download' is deprecated. Use 'hf download' instead.
    Fetching 151 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:00<00:00, 901.00it/s]
    /home/user/models/DeepSeek-V3.1-AWQ
    (dsv31-uv-env) :
    /ds31-env$ SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
    (dsv31-uv-env) :/ds31-env$ cp ~/models/DeepSeek-V3.1-AWQ/awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/utils/awq_marlin.py"
    (dsv31-uv-env) :
    /ds31-env$ hf download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ --local-dir-use-symlinks False
    usage: hf []
    hf: error: unrecognized arguments: --local-dir-use-symlinks False
    (dsv31-uv-env) :
    /ds31-env$ hf download QuantTrio/DeepSeek-V3.1-AWQ --local-dir /models/DeepSeek-V3.1-AWQ
    Fetching 151 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [00:00<00:00, 967.36it/s]
    /home/user/models/DeepSeek-V3.1-AWQ
    (dsv31-uv-env) :
    /ds31-env$ vllm serve "$HOME/models/DeepSeek-V3.1-AWQ"
    --served-model-name DeepSeek-V3.1-AWQ
    --swap-space 16
    --tensor-parallel-size 8
    --tool-call-parser deepseek_v3
    --enable-expert-parallel
    --enable-auto-tool-choice
    --max-model-len 131072
    --max-seq-len-to-capture 131072
    --enforce-eager
    --max-num-seqs 2
    --trust-remote-code
    --kv-cache-dtype auto
    --gpu-memory-utilization 0.85
    --disable-log-requests
    INFO 08-25 14:16:26 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:16:53 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:16:53 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:16:53 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:16:54 [api_server.py:1289] vLLM API server version 0.9.0
    INFO 08-25 14:16:55 [cli_args.py:300] non-default args: {'api_key': 'eyJhIjoiYmI5ZW', 'enable_auto_tool_choice': True, 'tool_call_parser': 'deepseek_v3', 'trust_remote_code': True, 'max_model_len': 131072, 'enforce_eager': True, 'max_seq_len_to_capture': 131072, 'served_model_name': ['DeepSeek-V3.1-AWQ'], 'tensor_parallel_size': 8, 'enable_expert_parallel': True, 'gpu_memory_utilization': 0.85, 'swap_space': 16.0, 'max_num_seqs': 2, 'disable_log_requests': True}
    INFO 08-25 14:16:55 [config.py:213] Replacing legacy 'type' key with 'rope_type'
    INFO 08-25 14:17:01 [config.py:793] This model supports multiple tasks: {'reward', 'classify', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
    INFO 08-25 14:17:01 [awq_marlin.py:115] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
    INFO 08-25 14:17:01 [config.py:1875] Defaulting to use mp for distributed inference
    INFO 08-25 14:17:01 [config.py:2118] Chunked prefill is enabled with max_num_batched_tokens=8192.
    WARNING 08-25 14:17:01 [cuda.py:87] To see benefits of async output processing, enable CUDA graph. Since, enforce-eager is enabled, async output processor cannot be used
    INFO 08-25 14:17:01 [cuda.py:151] Forcing kv cache block size to 64 for FlashMLA backend.
    INFO 08-25 14:17:05 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:07 [core.py:438] Waiting for init message from front-end.
    INFO 08-25 14:17:07 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:07 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:07 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:07 [core.py:65] Initializing a V1 LLM engine (v0.9.0) with config: model='/home/user/models/DeepSeek-V3.1-AWQ', speculative_config=None, tokenizer='/home/user/models/DeepSeek-V3.1-AWQ', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=131072, download_dir=None, load_format=auto, tensor_parallel_size=8, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=DeepSeek-V3.1-AWQ, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=True, use_async_output_proc=False, pooler_config=None, compilation_config={"compile_sizes": [], "inductor_compile_config": {"enable_auto_functionalized_v2": false}, "cudagraph_capture_sizes": [], "max_capture_size": 0}
    WARNING 08-25 14:17:07 [multiproc_worker_utils.py:306] Reducing Torch parallelism from 104 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed.
    INFO 08-25 14:17:07 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3, 4, 5, 6, 7], buffer_handle=(8, 10485760, 10, 'psm_d25f6886'), local_subscribe_addr='ipc:///tmp/5f995bb2-6804-4fb2-b318-17cea2e87526', remote_subscribe_addr=None, remote_addr_ipv6=False)
    INFO 08-25 14:17:10 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:10 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:11 [__init__.py:243] Automatically detected platform cuda.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    INFO 08-25 14:17:15 [__init__.py:31] Available plugins for group vllm.general_plugins:
    INFO 08-25 14:17:15 [__init__.py:33] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver
    INFO 08-25 14:17:15 [__init__.py:36] All plugins in this group will be loaded. Set VLLM_PLUGINS to control which plugins to load.
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7eff9bfc3460>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f88c43b5b70>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f118e4afd90>
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f4a995c7d60>
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_e02888cd'), local_subscribe_addr='ipc:///tmp/23da0016-24b4-4b72-b285-71f399e7f466', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_eeeed842'), local_subscribe_addr='ipc:///tmp/1ded14f2-5494-41f8-b5d0-f2317c547471', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_ec314e1f'), local_subscribe_addr='ipc:///tmp/b703ad44-cfce-44dd-ae43-265b557ad75e', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_36a30e49'), local_subscribe_addr='ipc:///tmp/85ab75c0-35f6-4024-a714-10a5637fe751', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7efdfcc37d30>
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_ba8eef96'), local_subscribe_addr='ipc:///tmp/faefe075-cb3b-4f46-b749-3af1b0b0b908', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:15 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f3323ca8880>
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:15 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_849b1dcf'), local_subscribe_addr='ipc:///tmp/3a193b53-45fb-4d9f-9893-d00fcdba49bb', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:16 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7fbbb09afd90>
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:16 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_72f861c6'), local_subscribe_addr='ipc:///tmp/9fae716a-c25c-4ed2-ab33-dccfb2397a41', remote_subscribe_addr=None, remote_addr_ipv6=False)
    WARNING 08-25 14:17:16 [utils.py:2671] Methods determine_num_available_blocks,device_config,get_cache_block_size_bytes,initialize_cache not implemented in <vllm.v1.worker.gpu_worker.Worker object at 0x7f7c7b133d90>
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:16 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 10485760, 10, 'psm_b0f06d40'), local_subscribe_addr='ipc:///tmp/3b9fafee-8558-4040-967f-69ed9b092e5f', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:22 [utils.py:1077] Found nccl from library libnccl.so.2
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:22 [pynccl.py:69] vLLM is using nccl==2.26.2
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:24 [custom_all_reduce_utils.py:245] reading GPU P2P access cache from /home/user/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:24 [shm_broadcast.py:250] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_0b30b0b1'), local_subscribe_addr='ipc:///tmp/11fd5b6d-549d-453d-b0db-48adaa7c5b8c', remote_subscribe_addr=None, remote_addr_ipv6=False)
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 1 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 1, EP rank 1
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 0 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 4 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 4, EP rank 4
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 3 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 3, EP rank 3
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 2 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 2, EP rank 2
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 7 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 7, EP rank 7
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 6 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 6, EP rank 6
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [parallel_state.py:1064] rank 5 in world size 8 is assigned as DP rank 0, PP rank 0, TP rank 5, EP rank 5
    (VllmWorker rank=1 pid=28519) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=7 pid=28525) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=5 pid=28523) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=0 pid=28518) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=3 pid=28521) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=4 pid=28522) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=2 pid=28520) WARNING 08-25 14:17:25 [topk_topp_sampler.py:58] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
    (VllmWorker rank=6 pid=28524) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=7 pid=28525) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=1 pid=28519) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=5 pid=28523) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=0 pid=28518) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [gpu_model_runner.py:1531] Starting to load model /home/user/models/DeepSeek-V3.1-AWQ...
    (VllmWorker rank=3 pid=28521) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=2 pid=28520) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=4 pid=28522) WARNING 08-25 14:17:25 [utils.py:298] The model class DeepseekV3ForCausalLM has not defined packed_modules_mapping, this may lead to incorrect mapping of quantized or ignored modules
    (VllmWorker rank=6 pid=28524) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=4 pid=28522) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=5 pid=28523) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=7 pid=28525) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=1 pid=28519) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=3 pid=28521) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=0 pid=28518) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    (VllmWorker rank=2 pid=28520) INFO 08-25 14:17:25 [cuda.py:200] Using FlashMLA backend on V1 engine.
    Loading safetensors checkpoint shards: 0% Completed | 0/134 [00:00<?, ?it/s]
    Loading safetensors checkpoint shards: 1% Completed | 1/134 [00:00<00:16, 8.08it/s]
    Loading safetensors checkpoint shards: 1% Completed | 2/134 [00:00<00:27, 4.79it/s]
    Loading safetensors checkpoint shards: 2% Completed | 3/134 [00:00<00:32, 4.08it/s]
    Loading safetensors checkpoint shards: 3% Completed | 4/134 [00:00<00:33, 3.88it/s]
    Loading safetensors checkpoint shards: 4% Completed | 5/134 [00:01<00:27, 4.77it/s]
    Loading safetensors checkpoint shards: 4% Completed | 6/134 [00:01<00:23, 5.51it/s]
    Loading safetensors checkpoint shards: 5% Completed | 7/134 [00:01<00:20, 6.18it/s]
    Loading safetensors checkpoint shards: 6% Completed | 8/134 [00:01<00:23, 5.41it/s]
    Loading safetensors checkpoint shards: 7% Completed | 9/134 [00:01<00:26, 4.76it/s]
    Loading safetensors checkpoint shards: 7% Completed | 10/134 [00:01<00:23, 5.35it/s]
    Loading safetensors checkpoint shards: 8% Completed | 11/134 [00:02<00:20, 6.03it/s]
    Loading safetensors checkpoint shards: 9% Completed | 12/134 [00:02<00:23, 5.11it/s]
    Loading safetensors checkpoint shards: 10% Completed | 13/134 [00:02<00:25, 4.66it/s]
    Loading safetensors checkpoint shards: 10% Completed | 14/134 [00:02<00:23, 5.16it/s]
    Loading safetensors checkpoint shards: 12% Completed | 16/134 [00:02<00:15, 7.49it/s]
    Loading safetensors checkpoint shards: 13% Completed | 17/134 [00:03<00:15, 7.65it/s]
    Loading safetensors checkpoint shards: 13% Completed | 18/134 [00:03<00:18, 6.21it/s]
    Loading safetensors checkpoint shards: 15% Completed | 20/134 [00:03<00:13, 8.27it/s]
    Loading safetensors checkpoint shards: 16% Completed | 21/134 [00:03<00:17, 6.52it/s]
    Loading safetensors checkpoint shards: 16% Completed | 22/134 [00:03<00:20, 5.49it/s]
    Loading safetensors checkpoint shards: 18% Completed | 24/134 [00:04<00:15, 6.99it/s]
    Loading safetensors checkpoint shards: 20% Completed | 27/134 [00:04<00:12, 8.58it/s]
    Loading safetensors checkpoint shards: 21% Completed | 28/134 [00:04<00:13, 7.64it/s]
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] WorkerProc failed to start.
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] Traceback (most recent call last):
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 461, in worker_main
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] worker = WorkerProc(*args, **kwargs)
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in __init__
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.worker.load_model()
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 164, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model_runner.load_model()
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1534, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model = get_model(vllm_config=self.vllm_config)
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 58, in get_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return loader.load_model(vllm_config=vllm_config,
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 277, in load_model
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] loaded_weights = model.load_weights(
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_v2.py", line 800, in load_weights
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] param = params_dict[name]
    (VllmWorker rank=4 pid=28522) ERROR 08-25 14:17:30 [multiproc_executor.py:487] KeyError: 'model.layers.60.mlp.experts.w2_weight'
    Loading safetensors checkpoint shards: 21% Completed | 28/134 [00:04<00:18, 5.61it/s]
    (VllmWorker rank=0 pid=28518)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] WorkerProc failed to start.
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] Traceback (most recent call last):
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 461, in worker_main
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] worker = WorkerProc(*args, **kwargs)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 358, in __init__
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.worker.load_model()
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 164, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model_runner.load_model()
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 1534, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] self.model = get_model(vllm_config=self.vllm_config)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/__init__.py", line 58, in get_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return loader.load_model(vllm_config=vllm_config,
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 277, in load_model
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] loaded_weights = model.load_weights(
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/models/deepseek_v2.py", line 758, in load_weights
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] for name, loaded_weight in weights:
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 252, in get_all_weights
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] yield from self._get_weights_iterator(primary_weights)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/default_loader.py", line 235, in
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] return ((source.prefix + name, tensor)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/model_executor/model_loader/weight_utils.py", line 470, in safetensors_weights_iterator
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] param = f.get_tensor(name)
    (VllmWorker rank=5 pid=28523) ERROR 08-25 14:17:30 [multiproc_executor.py:487] ValueError: could not determine the shape of object type 'torch.storage.UntypedStorage'
    [rank0]:[W825 14:17:31.025692350 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
    ERROR 08-25 14:17:33 [core.py:500] EngineCore failed to start.
    ERROR 08-25 14:17:33 [core.py:500] Traceback (most recent call last):
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
    ERROR 08-25 14:17:33 [core.py:500] engine_core = EngineCoreProc(*args, **kwargs)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in init
    ERROR 08-25 14:17:33 [core.py:500] super().init(vllm_config, executor_class, log_stats,
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 71, in init
    ERROR 08-25 14:17:33 [core.py:500] self.model_executor = executor_class(vllm_config)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in init
    ERROR 08-25 14:17:33 [core.py:500] self._init_executor()
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 93, in _init_executor
    ERROR 08-25 14:17:33 [core.py:500] self.workers = WorkerProc.wait_for_ready(unready_workers)
    ERROR 08-25 14:17:33 [core.py:500] File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 422, in wait_for_ready
    ERROR 08-25 14:17:33 [core.py:500] raise e from None
    ERROR 08-25 14:17:33 [core.py:500] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
    Process EngineCore_0:
    Traceback (most recent call last):
    File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
    File "/usr/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 504, in run_engine_core
    raise e
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 491, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 390, in init
    super().init(vllm_config, executor_class, log_stats,
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 71, in init
    self.model_executor = executor_class(vllm_config)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 52, in init
    self._init_executor()
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 93, in _init_executor
    self.workers = WorkerProc.wait_for_ready(unready_workers)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/executor/multiproc_executor.py", line 422, in wait_for_ready
    raise e from None
    Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause.
    Traceback (most recent call last):
    File "/home/user/ds31-env/dsv31-uv-env/bin/vllm", line 10, in
    sys.exit(main())
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 56, in main
    args.dispatch_function(args)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 42, in cmd
    uvloop.run(run_server(args))
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/uvloop/init.py", line 82, in run
    return loop.run_until_complete(wrapper())
    File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/uvloop/init.py", line 61, in wrapper
    return await main
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1324, in run_server
    async with build_async_engine_client(args) as engine_client:
    File "/usr/lib/python3.10/contextlib.py", line 199, in aenter
    return await anext(self.gen)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 153, in build_async_engine_client
    async with build_async_engine_client_from_engine_args(
    File "/usr/lib/python3.10/contextlib.py", line 199, in aenter
    return await anext(self.gen)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 185, in build_async_engine_client_from_engine_args
    async_llm = AsyncLLM.from_vllm_config(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 157, in from_vllm_config
    return cls(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 123, in init
    self.engine_core = core_client_class(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 734, in init
    super().init(
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 418, in init
    self._wait_for_engine_startup(output_address, parallel_config)
    File "/home/user/ds31-env/dsv31-uv-env/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 484, in _wait_for_engine_startup
    raise RuntimeError("Engine core initialization failed. "
    RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
    /usr/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
    warnings.warn('resource_tracker: There appear to be %d '
    (dsv31-uv-env) :~/ds31-env$
QuantTrio org
edited Aug 26

Your transformers version is too high; you need to follow the installation steps in the README.md.

# ❗there are glitches with vllm 0.10.1.1, still looking for resolutions❗
# ❗downgrade vllm for now ❗
pip install vllm==0.9.0
pip install transformers==4.53

# ❗patch up AWQ MoE quant config, otherwise some modules cannot be properly loaded❗
SITE_PACKAGES=$(pip -V | awk '{print $4}' | sed 's/\/pip$//')
cp awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/awq_marlin.py"

The patch fixes the AWQ `modules_to_not_convert` issue, which has already been merged into a later version of vLLM.
For now, we need to manually copy the file.

Your transformers version is too high; you need to follow the installation steps in the README.md.

# ❗there are glitches with vllm 0.10.1.1, still looking for resolutions❗
# ❗downgrade vllm for now ❗
pip install vllm==0.9.0
pip install transformers==4.53

# ❗patch up AWQ MoE quant config, otherwise some modules cannot be properly loaded❗
SITE_PACKAGES=$(pip -V | awk '{print $4}' | sed 's/\/pip$//')
cp awq_marlin.py "$SITE_PACKAGES/vllm/model_executor/layers/quantization/awq_marlin.py"

The patch fixes the AWQ `modules_to_not_convert` issue, which has already been merged into a later version of vLLM.
For now, we need to manually copy the file.

Thanks for the tip @tclf90 . I had been trying for an hour, playing with different vLLM versions and additional parameters, but without the 'awq_marlin' patch I guess it's unsolvable :-)

I recommend putting these important instructions at the top of the README to make them more noticeable.

image.png

it is working now, thanks

Sign up or log in to comment