@Andrewpk
Last active March 11, 2026 13:40
Simple docker-compose I use for local LLM work: llama.cpp with qwen3.5, NVIDIA GPU support, Open WebUI, and ollama.
  • You may need to go into Open WebUI and add an external connection of type OpenAI pointing to http://llama-cpp:8080/v1
    • You can set the Bearer token to sk-no-key-required
  • I run llama.cpp with qwen3.5, and ollama only for the embedding model
  • If you run on macOS, consider running only Open WebUI in Docker and running the others natively, so you can take advantage of the Metal support in these frameworks.
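The external connection described above can be sanity-checked from the host before wiring it into Open WebUI. A minimal stdlib-only sketch: it builds the same OpenAI-style chat request that Open WebUI will send. Note the host port is 8081 (mapped from the container's 8080 in the compose file); inside the compose network, Open WebUI uses http://llama-cpp:8080/v1 instead.

```python
import json
import urllib.request

# From the host, the llama.cpp server is reachable on the mapped port 8081.
BASE_URL = "http://localhost:8081/v1"
API_KEY = "sk-no-key-required"  # llama.cpp does not validate the key


def build_chat_request(prompt: str) -> urllib.request.Request:
    """Build an OpenAI-style chat completion request for the llama.cpp server."""
    body = json.dumps({
        "messages": [{"role": "user", "content": prompt}],
    }).encode("utf-8")
    return urllib.request.Request(
        f"{BASE_URL}/chat/completions",
        data=body,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
    )


# With the stack running:
# with urllib.request.urlopen(build_chat_request("Say hi")) as resp:
#     print(json.load(resp)["choices"][0]["message"]["content"])
```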
services:
  llama-cpp:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda13
    container_name: llama-cpp
    restart: unless-stopped
    ports:
      - "8081:8080"
    volumes:
      - ./models:/models
    # Adjust -ngl to the number of layers you want offloaded to the GPU;
    # 999 just offloads everything available.
    # Comment out the last two flags if you want reasoning enabled on qwen3.5.
    command: >
      -hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-IQ4_NL
      --host 0.0.0.0
      --port 8080
      --ctx-size 40960
      --temp 1.0
      --top-p 0.95
      --top-k 20
      --min-p 0.00
      -ngl 999
      --reasoning-budget 0
      --chat-template-kwargs '{"enable_thinking":false}'
    environment:
      - HF_HUB_CACHE=/models/.cache
      - HUGGINGFACE_HUB_CACHE=/models/.cache
      - HF_HOME=/models
      - HF_TOKEN=${HF_TOKEN}
      - HUGGING_FACE_HUB_TOKEN=${HF_TOKEN}
      - LLAMA_CACHE=/models/.cache
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s

  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    restart: unless-stopped
    ports:
      - "11434:11434"
    volumes:
      - ./ollama:/root/.ollama
    environment:
      - OLLAMA_NUM_GPU=1
      - HF_TOKEN=${HF_TOKEN}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:11434/api/version"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s

  open-webui:
    image: ghcr.io/open-webui/open-webui:main
    container_name: open-webui
    restart: unless-stopped
    ports:
      - "5000:8080"
    environment:
      # Backend connections
      - OPENAI_API_BASE_URL=http://llama-cpp:8080/v1
      - OPENAI_API_KEY=sk-no-key-required
      - OLLAMA_BASE_URL=http://ollama:11434
      # Memory management
      - OMP_NUM_THREADS=4
      - MAX_CONTEXT_WINDOW=40960
      # Vector DB & knowledge settings
      - RAG_TOP_K=3
      - RAG_CHUNK_MIN_SIZE_TARGET=500
      - RAG_CHUNK_SIZE=1500
      - RAG_CHUNK_OVERLAP=100
      - RAG_MAX_TOKENS=6000
      # Logging
      - LOG_LEVEL=INFO
      - UPDATE_CHECK=false
    volumes:
      - ./open-webui:/app/backend/data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8080/api/v1/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 40s
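Since ollama is only used for the embedding model here, a quick way to exercise it from the host is its native embeddings endpoint on the mapped port 11434. A minimal stdlib-only sketch; the default model name below is a placeholder, not part of this setup — substitute whichever embedding model you have actually pulled into ollama.

```python
import json
import urllib.request

OLLAMA_URL = "http://localhost:11434"  # host port from the compose file


def build_embedding_request(text: str, model: str = "nomic-embed-text"):
    """Build a request against Ollama's native /api/embeddings endpoint.

    The default model name is illustrative; use whatever embedding model
    you've pulled (e.g. via `docker compose exec ollama ollama pull <model>`).
    """
    body = json.dumps({"model": model, "prompt": text}).encode("utf-8")
    return urllib.request.Request(
        f"{OLLAMA_URL}/api/embeddings",
        data=body,
        headers={"Content-Type": "application/json"},
    )


# With the stack running:
# with urllib.request.urlopen(build_embedding_request("hello world")) as resp:
#     print(len(json.load(resp)["embedding"]))  # embedding dimensionality
```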