-
-
Save cyb70289/89f02826ad96a80e6b99b0a93beb2228 to your computer and use it in GitHub Desktop.
vllm log
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| $ DNNL_VERBOSE=all LD_PRELOAD="/usr/lib/`uname -m`-linux-gnu/libtcmalloc_minimal.so.4:/usr/lib/`uname -m`-linux-gnu/libgomp.so.1" OMP_NUM_THREADS=60 VLLM_CPU_OMP_THREADS_BIND="1-60" VLLM_TARGET_DEVICE=cpu VLLM_CPU_KVCACHE_SPACE=20 vllm serve RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8 --trust-remote-code --max-model-len=2048 --enforce-eager --host 0.0.0.0 --port 8000 | |
| onednn_verbose,v1,info,oneDNN v3.7.1 (commit 8d263e693366ef8db40acc569cc7d8edf644556d) | |
| onednn_verbose,v1,info,cpu,runtime:OpenMP,nthr:60 | |
| onednn_verbose,v1,info,cpu,isa:AArch64 SVE (128 bits) | |
| onednn_verbose,v1,info,gpu,runtime:none | |
| onednn_verbose,v1,info,graph,backend,0:dnnl_backend | |
| onednn_verbose,v1,primitive,info,template:operation,engine,primitive,implementation,prop_kind,memory_descriptors,attributes,auxiliary,problem_desc,exec_time | |
| onednn_verbose,v1,graph,info,template:operation,engine,partition_id,partition_kind,op_names,data_formats,logical_tensors,fpmath_mode,implementation,backend,exec_time | |
| INFO 07-15 10:50:56 [__init__.py:253] Automatically detected platform cpu. | |
| INFO 07-15 10:51:00 [api_server.py:1623] vLLM API server version 0.9.2rc2.dev127+g0f2ff2f65.d20250715 | |
| INFO 07-15 10:51:00 [cli_args.py:325] non-default args: {'host': '0.0.0.0', 'model': 'RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8', 'trust_remote_code': True, 'max_model_len': 2048, 'enforce_eager': True} | |
| INFO 07-15 10:51:09 [config.py:852] This model supports multiple tasks: {'embed', 'classify', 'generate', 'reward'}. Defaulting to 'generate'. | |
| INFO 07-15 10:51:09 [config.py:1489] Using max model len 2048 | |
| INFO 07-15 10:51:09 [importing.py:63] Triton not installed or not compatible; certain GPU-related functions will not be available. | |
| INFO 07-15 10:51:09 [arg_utils.py:1116] Chunked prefill is not supported for ARM and POWER CPUs; disabling it for V1 backend. | |
| onednn_verbose,v1,info,oneDNN v3.7.1 (commit 8d263e693366ef8db40acc569cc7d8edf644556d) | |
| onednn_verbose,v1,info,cpu,runtime:OpenMP,nthr:60 | |
| onednn_verbose,v1,info,cpu,isa:AArch64 SVE (128 bits) | |
| onednn_verbose,v1,info,gpu,runtime:none | |
| onednn_verbose,v1,info,graph,backend,0:dnnl_backend | |
| onednn_verbose,v1,primitive,info,template:operation,engine,primitive,implementation,prop_kind,memory_descriptors,attributes,auxiliary,problem_desc,exec_time | |
| onednn_verbose,v1,graph,info,template:operation,engine,partition_id,partition_kind,op_names,data_formats,logical_tensors,fpmath_mode,implementation,backend,exec_time | |
| INFO 07-15 10:51:14 [__init__.py:253] Automatically detected platform cpu. | |
| INFO 07-15 10:51:17 [importing.py:63] Triton not installed or not compatible; certain GPU-related functions will not be available. | |
| INFO 07-15 10:51:17 [core.py:526] Waiting for init message from front-end. | |
| INFO 07-15 10:51:17 [core.py:69] Initializing a V1 LLM engine (v0.9.2rc2.dev127+g0f2ff2f65.d20250715) with config: model='RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=compressed-tensors, enforce_eager=True, kv_cache_dtype=auto, device_config=cpu, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=False, use_async_output_proc=False, pooler_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":[],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":[],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"max_capture_size":0,"local_cache_dir":null} | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP threads binding of Process 1512068: | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512068, core 1 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512251, core 2 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512252, core 3 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512253, core 4 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512254, core 5 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512255, core 6 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512256, core 7 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512257, core 8 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512258, core 9 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512259, core 10 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512260, core 11 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512261, core 12 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512262, core 13 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512263, core 14 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512264, core 15 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512265, core 16 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512266, core 17 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512267, core 18 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512268, core 19 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512269, core 20 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512270, core 21 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512271, core 22 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512272, core 23 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512273, core 24 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512274, core 25 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512275, core 26 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512276, core 27 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512277, core 28 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512278, core 29 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512279, core 30 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512280, core 31 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512281, core 32 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512282, core 33 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512283, core 34 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512284, core 35 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512285, core 36 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512286, core 37 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512287, core 38 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512288, core 39 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512289, core 40 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512290, core 41 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512291, core 42 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512292, core 43 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512293, core 44 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512294, core 45 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512295, core 46 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512296, core 47 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512297, core 48 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512298, core 49 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512299, core 50 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512300, core 51 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512301, core 52 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512302, core 53 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512303, core 54 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512304, core 55 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512305, core 56 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512306, core 57 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512307, core 58 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512308, core 59 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] OMP tid: 1512309, core 60 | |
| INFO 07-15 10:51:18 [cpu_worker.py:58] | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 | |
| INFO 07-15 10:51:18 [parallel_state.py:1078] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0 | |
| WARNING 07-15 10:51:18 [cpu.py:244] Pin memory is not supported on CPU. | |
| INFO 07-15 10:51:18 [cpu_model_runner.py:54] Starting to load model RedHatAI/Llama-3.2-3B-Instruct-quantized.w8a8... | |
| INFO 07-15 10:51:18 [compressed_tensors_w8a8_int8.py:52] Using CutlassScaledMMLinearKernel for CompressedTensorsW8A8Int8 | |
| INFO 07-15 10:51:18 [cpu.py:69] Using Torch SDPA backend. | |
| INFO 07-15 10:51:19 [weight_utils.py:292] Using model weights format ['*.safetensors'] | |
| INFO 07-15 10:51:20 [weight_utils.py:345] No model.safetensors.index.json found in remote. | |
| Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s] | |
| Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 2.56it/s] | |
| Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00, 2.56it/s] | |
| INFO 07-15 10:51:20 [default_loader.py:272] Loading weights took 0.46 seconds | |
| INFO 07-15 10:51:20 [kv_cache_utils.py:716] GPU KV cache size: 187,232 tokens | |
| INFO 07-15 10:51:20 [kv_cache_utils.py:720] Maximum concurrency for 2,048 tokens per request: 91.42x | |
| INFO 07-15 10:51:20 [cpu.py:69] Using Torch SDPA backend. | |
| INFO 07-15 10:51:22 [cpu_model_runner.py:66] Warming up model for the compilation... | |
| ERROR 07-15 10:51:22 [core.py:586] EngineCore failed to start. | |
| ERROR 07-15 10:51:22 [core.py:586] Traceback (most recent call last): | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core | |
| ERROR 07-15 10:51:22 [core.py:586] engine_core = EngineCoreProc(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 404, in __init__ | |
| ERROR 07-15 10:51:22 [core.py:586] super().__init__(vllm_config, executor_class, log_stats, | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 82, in __init__ | |
| ERROR 07-15 10:51:22 [core.py:586] self._initialize_kv_caches(vllm_config) | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 169, in _initialize_kv_caches | |
| ERROR 07-15 10:51:22 [core.py:586] self.model_executor.initialize_from_config(kv_cache_configs) | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/executor/abstract.py", line 66, in initialize_from_config | |
| ERROR 07-15 10:51:22 [core.py:586] self.collective_rpc("compile_or_warm_up_model") | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc | |
| ERROR 07-15 10:51:22 [core.py:586] answer = run_method(self.driver_worker, method, args, kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/utils/__init__.py", line 2943, in run_method | |
| ERROR 07-15 10:51:22 [core.py:586] return func(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/cpu_worker.py", line 90, in compile_or_warm_up_model | |
| ERROR 07-15 10:51:22 [core.py:586] self.model_runner.warming_up_model() | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/cpu_model_runner.py", line 69, in warming_up_model | |
| ERROR 07-15 10:51:22 [core.py:586] self._dummy_run(max(16, self.max_num_reqs)) | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context | |
| ERROR 07-15 10:51:22 [core.py:586] return func(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2058, in _dummy_run | |
| ERROR 07-15 10:51:22 [core.py:586] outputs = model( | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return self._call_impl(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return forward_call(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 589, in forward | |
| ERROR 07-15 10:51:22 [core.py:586] model_output = self.model(input_ids, positions, intermediate_tensors, | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/compilation/decorators.py", line 173, in __call__ | |
| ERROR 07-15 10:51:22 [core.py:586] return self.forward(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 393, in forward | |
| ERROR 07-15 10:51:22 [core.py:586] hidden_states, residual = layer(positions, hidden_states, residual) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return self._call_impl(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return forward_call(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 306, in forward | |
| ERROR 07-15 10:51:22 [core.py:586] hidden_states = self.self_attn(positions=positions, | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return self._call_impl(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return forward_call(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 201, in forward | |
| ERROR 07-15 10:51:22 [core.py:586] qkv, _ = self.qkv_proj(hidden_states) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return self._call_impl(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| ERROR 07-15 10:51:22 [core.py:586] return forward_call(*args, **kwargs) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py", line 510, in forward | |
| ERROR 07-15 10:51:22 [core.py:586] output_parallel = self.quant_method.apply(self, input_, bias) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 677, in apply | |
| ERROR 07-15 10:51:22 [core.py:586] return scheme.apply_weights(layer, x, bias=bias) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py", line 111, in apply_weights | |
| ERROR 07-15 10:51:22 [core.py:586] return self.kernel.apply_weights(layer, x, bias) | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py", line 115, in apply_weights | |
| ERROR 07-15 10:51:22 [core.py:586] x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(), | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/_custom_ops.py", line 1395, in scaled_int8_quant | |
| ERROR 07-15 10:51:22 [core.py:586] torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(), | |
| ERROR 07-15 10:51:22 [core.py:586] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| ERROR 07-15 10:51:22 [core.py:586] File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/_ops.py", line 1353, in __getattr__ | |
| ERROR 07-15 10:51:22 [core.py:586] raise AttributeError( | |
| ERROR 07-15 10:51:22 [core.py:586] AttributeError: '_OpNamespace' '_C' object has no attribute 'dynamic_scaled_int8_quant' | |
| Process EngineCore_0: | |
| Traceback (most recent call last): | |
| File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap | |
| self.run() | |
| File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run | |
| self._target(*self._args, **self._kwargs) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 590, in run_engine_core | |
| raise e | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core | |
| engine_core = EngineCoreProc(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 404, in __init__ | |
| super().__init__(vllm_config, executor_class, log_stats, | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 82, in __init__ | |
| self._initialize_kv_caches(vllm_config) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core.py", line 169, in _initialize_kv_caches | |
| self.model_executor.initialize_from_config(kv_cache_configs) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/executor/abstract.py", line 66, in initialize_from_config | |
| self.collective_rpc("compile_or_warm_up_model") | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/executor/uniproc_executor.py", line 57, in collective_rpc | |
| answer = run_method(self.driver_worker, method, args, kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/utils/__init__.py", line 2943, in run_method | |
| return func(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/cpu_worker.py", line 90, in compile_or_warm_up_model | |
| self.model_runner.warming_up_model() | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/cpu_model_runner.py", line 69, in warming_up_model | |
| self._dummy_run(max(16, self.max_num_reqs)) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 120, in decorate_context | |
| return func(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2058, in _dummy_run | |
| outputs = model( | |
| ^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| return self._call_impl(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| return forward_call(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 589, in forward | |
| model_output = self.model(input_ids, positions, intermediate_tensors, | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/compilation/decorators.py", line 173, in __call__ | |
| return self.forward(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 393, in forward | |
| hidden_states, residual = layer(positions, hidden_states, residual) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| return self._call_impl(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| return forward_call(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 306, in forward | |
| hidden_states = self.self_attn(positions=positions, | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| return self._call_impl(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| return forward_call(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/models/llama.py", line 201, in forward | |
| qkv, _ = self.qkv_proj(hidden_states) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl | |
| return self._call_impl(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl | |
| return forward_call(*args, **kwargs) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/linear.py", line 510, in forward | |
| output_parallel = self.quant_method.apply(self, input_, bias) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py", line 677, in apply | |
| return scheme.apply_weights(layer, x, bias=bias) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py", line 111, in apply_weights | |
| return self.kernel.apply_weights(layer, x, bias) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/model_executor/layers/quantization/kernels/scaled_mm/cutlass.py", line 115, in apply_weights | |
| x_q, x_s, x_zp = ops.scaled_int8_quant(x.contiguous(), | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/_custom_ops.py", line 1395, in scaled_int8_quant | |
| torch.ops._C.dynamic_scaled_int8_quant(output, input.contiguous(), | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/torch/_ops.py", line 1353, in __getattr__ | |
| raise AttributeError( | |
| AttributeError: '_OpNamespace' '_C' object has no attribute 'dynamic_scaled_int8_quant' | |
| Traceback (most recent call last): | |
| File "/home/cyb/vllm/venv/bin/vllm", line 8, in <module> | |
| sys.exit(main()) | |
| ^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/cli/main.py", line 65, in main | |
| args.dispatch_function(args) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/cli/serve.py", line 57, in cmd | |
| uvloop.run(run_server(args)) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/uvloop/__init__.py", line 109, in run | |
| return __asyncio.run( | |
| ^^^^^^^^^^^^^^ | |
| File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run | |
| return runner.run(main) | |
| ^^^^^^^^^^^^^^^^ | |
| File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run | |
| return self._loop.run_until_complete(task) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/uvloop/__init__.py", line 61, in wrapper | |
| return await main | |
| ^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1659, in run_server | |
| await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 1679, in run_server_worker | |
| async with build_async_engine_client(args, client_config) as engine_client: | |
| File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
| return await anext(self.gen) | |
| ^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 161, in build_async_engine_client | |
| async with build_async_engine_client_from_engine_args( | |
| File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
| return await anext(self.gen) | |
| ^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/entrypoints/openai/api_server.py", line 197, in build_async_engine_client_from_engine_args | |
| async_llm = AsyncLLM.from_vllm_config( | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 162, in from_vllm_config | |
| return cls( | |
| ^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/async_llm.py", line 124, in __init__ | |
| self.engine_core = EngineCoreClient.make_async_mp_client( | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 96, in make_async_mp_client | |
| return AsyncMPClient(*client_args) | |
| ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 666, in __init__ | |
| super().__init__( | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/core_client.py", line 403, in __init__ | |
| with launch_core_engines(vllm_config, executor_class, | |
| File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ | |
| next(self.gen) | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 444, in launch_core_engines | |
| wait_for_engine_startup( | |
| File "/home/cyb/vllm/venv/lib/python3.12/site-packages/vllm/v1/engine/utils.py", line 494, in wait_for_engine_startup | |
| raise RuntimeError("Engine core initialization failed. " | |
| RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment