@curtishall
Last active October 1, 2025 01:04
docker-compose.yml
version: "3.8"
services:
  big-coder:
    image: vllm/vllm-openai:latest
    container_name: big-coder
    restart: unless-stopped
    runtime: nvidia
    environment:
      - CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1}
      # Pass the Hugging Face token through so gated models can be pulled
      - HF_TOKEN=${HF_TOKEN:-}
    ports:
      - "8000:8000"
    ipc: host
    shm_size: "16gb"
    volumes:
      - ${HF_CACHE_DIR:-~/.cache/huggingface}:/root/.cache/huggingface
    command: >
      --model ${MODEL_REPO}
      --tensor-parallel-size ${TP_SIZE:-2}
      --max-model-len ${MAX_LEN:-16384}
      --gpu-memory-utilization ${GPU_UTIL:-0.92}
      --trust-remote-code
----------------------------
.env
MODEL_REPO=Qwen/Qwen2.5-Coder-32B-Instruct
# Use two GPUs (0,1); for a single GPU, set this to 0 and TP_SIZE=1
CUDA_VISIBLE_DEVICES=0,1
TP_SIZE=2
MAX_LEN=16000
GPU_UTIL=0.92
# Needed only for gated models
HF_TOKEN=
HF_CACHE_DIR=/opt/hf-cache
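With both files in the same directory, a typical bring-up might look like the following (a sketch assuming Compose v2 and the NVIDIA container runtime; the prompt in the request body is illustrative — vLLM exposes an OpenAI-compatible API on port 8000):

```shell
# Start the service in the background
docker compose up -d

# Tail logs until the model finishes loading (a 32B model can take a while)
docker compose logs -f big-coder

# Smoke-test the OpenAI-compatible chat endpoint
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "Qwen/Qwen2.5-Coder-32B-Instruct",
        "messages": [{"role": "user", "content": "Write a Python hello world."}],
        "max_tokens": 64
      }'
```

`docker compose config` is also handy before the first start: it prints the file with the `.env` substitutions applied, so you can confirm the `--tensor-parallel-size` and cache-mount values resolved as intended.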