vLLM Validated Logs
podman run --rm -it --device nvidia.com/gpu=all -p 8000:8000 --ipc=host \
  --env "HF_TOKEN=xxxxgdCcZlrQoK" --env "HF_HUB_OFFLINE=0" \
  -v ~/.cache/vllm:/home/vllm/.cache --name=vllm \
  quay.io/redhat-user-workloads/octo-edge-tenant/vllm-validated-for-nvidia-jetsons:vllm-validated-for-nvidia-jetsons-on-push-jf6fd-build-images \
  --gpu-memory-utilization 0.7 --enforce-eager --model RedHatAI/gemma-3-1b-it-quantized.w8a8
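Once the container is up, the server still needs a minute or two to download and load the model before it accepts requests. A minimal readiness loop, assuming the port mapping above and vLLM's standard /health endpoint (it does not appear in the truncated route list below):

# Poll the health endpoint until the API server is ready to serve requests
until curl -sf http://localhost:8000/health > /dev/null; do
  echo "waiting for vLLM to come up..."
  sleep 5
done
echo "vLLM is ready"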
(APIServer pid=1) INFO 11-04 14:23:48 [api_server.py:1839] vLLM API server version 0.11.0+rhai1
(APIServer pid=1) INFO 11-04 14:23:48 [utils.py:233] non-default args: {'model': 'RedHatAI/gemma-3-1b-it-quantized.w8a8', 'enforce_eager': True, 'gpu_memory_utilization': 0.7}
config.json: 1.95kB [00:00, 5.72MB/s]
(APIServer pid=1) INFO 11-04 14:24:07 [model.py:547] Resolved architecture: Gemma3ForCausalLM
(APIServer pid=1) `torch_dtype` is deprecated! Use `dtype` instead!
(APIServer pid=1) INFO 11-04 14:24:07 [model.py:1510] Using max model len 32768
(APIServer pid=1) INFO 11-04 14:24:08 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=2048.
(APIServer pid=1) INFO 11-04 14:24:08 [__init__.py:381] Cudagraph is disabled under eager mode
tokenizer_config.json: 1.16MB [00:00, 31.3MB/s]
tokenizer.model: 100% 4.69M/4.69M [00:00<00:00, 6.08MB/s]
tokenizer.json: 100% 33.4M/33.4M [00:00<00:00, 48.3MB/s]
added_tokens.json: 100% 35.0/35.0 [00:00<00:00, 283kB/s]
special_tokens_map.json: 100% 662/662 [00:00<00:00, 5.53MB/s]
generation_config.json: 100% 210/210 [00:00<00:00, 1.70MB/s]
/opt/app-root/lib64/python3.12/site-packages/torch/cuda/__init__.py:63: FutureWarning: The pynvml package is deprecated. Please install nvidia-ml-py instead. If you did not install pynvml directly, please report this to the maintainers of the package that installed pynvml for you.
  import pynvml  # type: ignore[import]
......
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/responses/{response_id}, Methods: GET
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/responses/{response_id}/cancel, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/chat/completions, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/completions, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/embeddings, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /pooling, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /classify, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /score, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/score, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/audio/transcriptions, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/audio/translations, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /rerank, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v1/rerank, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /v2/rerank, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /scale_elastic_ep, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /is_scaling_elastic_ep, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /invocations, Methods: POST
(APIServer pid=1) INFO 11-04 14:25:14 [launcher.py:42] Route: /metrics, Methods: GET
(APIServer pid=1) INFO: Started server process [1]
(APIServer pid=1) INFO: Waiting for application startup.
(APIServer pid=1) INFO: Application startup complete.
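With startup complete, a quick sanity check is to list the served models; GET /v1/models is part of vLLM's OpenAI-compatible API (the route listing above is truncated, so it is not shown there). The response should name RedHatAI/gemma-3-1b-it-quantized.w8a8:

curl http://localhost:8000/v1/models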
[root@nvidia-jetson-nx-orin-01 ~]# curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "RedHatAI/gemma-3-1b-it-quantized.w8a8",
"messages": [{"role": "user", "content": "Hello, how are you?"}],
"max_tokens": 100
}'
{"id":"chatcmpl-f7f04c43bb8f457b8244638cc7084dec","object":"chat.completion","created":1762266644,"model":"RedHatAI/gemma-3-1b-it-quantized.w8a8","cho
ices":[{"index":0,"message":{"role":"assistant","content":"Hello there! I'm doing well, thank you for asking. As an AI, I don't really experience feel
ings the way humans do, but I'm functioning perfectly and ready to help you out. 😊 \n\nHow are *you* doing today? Is there anything you'd like to cha
t about or need help with?","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":n
ull,"finish_reason":"stop","stop_reason":106,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":15,"total_token
s":85,"completion_tokens":70,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}[root@nvidia-jetso
n-nx-orin-01 ~]#
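The same endpoint also supports token-by-token streaming via the standard OpenAI "stream" flag. A sketch of the request above with streaming enabled (-N disables curl's output buffering so chunks print as they arrive):

curl -N -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "RedHatAI/gemma-3-1b-it-quantized.w8a8",
  "messages": [{"role": "user", "content": "Hello, how are you?"}],
  "max_tokens": 100,
  "stream": true
}'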