# Ollama + Open-WebUI + NVIDIA/CUDA Docker Compose
# Updated: 2026-01-29
# Original: https://gist.github.com/usrbinkat/de44facc683f954bf0cca6c87e2f9f88
#
# ============================================================================
# PREREQUISITES (Ubuntu/Pop!_OS 24.04)
# ============================================================================
#
# 1. Install NVIDIA drivers (if not already installed):
#      sudo apt install nvidia-driver-560   # or latest available
#
# 2. Install the NVIDIA Container Toolkit:
#      sudo wget -qO /etc/apt/keyrings/nvidia-container-toolkit.asc \
#        https://nvidia.github.io/libnvidia-container/gpgkey
#
#      echo "deb [signed-by=/etc/apt/keyrings/nvidia-container-toolkit.asc] \
#        https://nvidia.github.io/libnvidia-container/stable/deb/amd64 /" \
#        | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
#
#      sudo apt-get update
#      sudo apt-get install -y nvidia-container-toolkit
#
# 3. Configure Docker to use the NVIDIA runtime:
#      sudo nvidia-ctk runtime configure --runtime=docker
#      sudo systemctl restart docker
#
# 4. Verify the GPU is accessible to Docker:
#      docker run --rm --gpus all nvidia/cuda:12.4.1-base-ubuntu22.04 nvidia-smi
#
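#    If the check in step 4 fails, confirm that the toolkit actually registered
#    its runtime with Docker (a minimal sanity check, assuming default paths):
#
#      docker info | grep -i runtimes   # should list "nvidia"
#      cat /etc/docker/daemon.json      # should contain an "nvidia" runtimes entry
#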
# ============================================================================
# QUICK START
# ============================================================================
#
#   docker compose up -d             # Start all services
#   docker compose logs -f ollama    # Watch ollama logs
#   open http://localhost:8080       # Access Open-WebUI
#
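#   A quick way to confirm both services came up (standard Ollama endpoints;
#   adjust the ports if you change the mappings below):
#
#     docker compose ps                            # both containers "running"
#     curl -s http://localhost:11434/api/version   # Ollama answers with its version
#     curl -s http://localhost:11434/api/tags      # lists pulled models (empty at first)
#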
# ============================================================================
# MODEL NAMING CONVENTION
# ============================================================================
#
#   model:size-variant-quantization
#     │     │    │       │
#     │     │    │       └─ Quantization: q8_0, q6_K, q5_K_M, q4_K_M, q3_K_M
#     │     │    └───────── Variant: instruct, chat, code, etc.
#     │     └────────────── Size: 7b, 14b, 32b, 70b (billions of parameters)
#     └──────────────────── Family: llama3, qwen2.5, deepseek-r1, mistral
#
# Quantization Quality (higher = better quality, more VRAM):
#   q8_0   ~99% quality │ q5_K_M ~95% quality │ q3_K_M ~85% quality
#   q6_K   ~97% quality │ q4_K_M ~92% quality │ q2_K   ~75% quality
#
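#   Example: pulling an explicit quantization instead of the default tag
#   (tag names follow the convention above; check the model's page in the
#   Ollama library for the tags it actually publishes):
#
#     ollama pull qwen2.5:14b-instruct-q4_K_M   # explicit Q4 build
#     ollama show qwen2.5:14b-instruct-q4_K_M   # confirm quantization and context length
#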
# ============================================================================
# RECOMMENDED MODELS FOR 16GB VRAM (RTX 5000, RTX 4080, etc.)
# ============================================================================
#
# TIER 1: Best Quality/Performance Balance (~9GB, runs full speed)
# ──────────────────────────────────────────────────────────────────
#
#   ollama pull deepseek-r1:14b    # Best reasoning, math, coding
#   ollama pull qwen2.5:14b        # Excellent coding, JSON, structured output
#   ollama pull mistral-small      # Strong instruction following (~12GB)
#
#   Run with: ollama run deepseek-r1:14b
#
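#   The same models are reachable through Ollama's HTTP API (which is what
#   Open-WebUI talks to); a quick smoke test from the host:
#
#     curl -s http://localhost:11434/api/generate -d '{
#       "model": "deepseek-r1:14b",
#       "prompt": "Say hello in five words.",
#       "stream": false
#     }'
#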
# TIER 2: Maximum Intelligence (tight fit, slower inference)
# ──────────────────────────────────────────────────────────────────
#
#   # 32B models need Q4 quantization to fit 16GB VRAM
#   ollama pull hengwen/DeepSeek-R1-Distill-Qwen-32B:q4_k_m
#   ollama pull qwen2.5:32b-instruct-q4_K_M
#
# TIER 3: Speed Priority (smaller models, faster responses)
# ──────────────────────────────────────────────────────────────────
#
#   ollama pull deepseek-r1:7b     # Fast reasoning (~4.5GB)
#   ollama pull qwen2.5:7b         # Fast general purpose (~4.5GB)
#   ollama pull llama3.2:3b        # Very fast, good for chat (~2GB)
#
# ============================================================================
# VRAM REQUIREMENTS REFERENCE
# ============================================================================
#
#   Model Size │ q4_K_M │ q5_K_M │ q6_K  │ q8_0
#   ───────────┼────────┼────────┼───────┼──────
#   7-8B       │  ~5GB  │  ~6GB  │  ~7GB │  ~9GB
#   13-14B     │  ~8GB  │ ~10GB  │ ~12GB │ ~15GB
#   30-34B     │ ~18GB  │ ~22GB  │ ~26GB │ ~34GB
#   70B        │ ~40GB  │ ~48GB  │ ~56GB │ ~75GB
#
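#   To check how much headroom you actually have before picking a tier (the
#   driver and desktop session also consume a little VRAM):
#
#     nvidia-smi --query-gpu=memory.total,memory.used --format=csv,noheader
#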
# ============================================================================
# USEFUL COMMANDS
# ============================================================================
#
#   ollama list            # Show downloaded models
#   ollama ps              # Show running models
#   ollama rm <model>      # Delete a model
#   ollama show <model>    # Show model details
#   watch -n1 nvidia-smi   # Monitor GPU usage
#
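#   Because OLLAMA_KEEP_ALIVE is set to 24h below, models stay resident in
#   VRAM after use. Two ways to unload without restarting the container
#   (ollama stop exists in recent Ollama releases; the keep_alive trick is
#   from the Ollama FAQ):
#
#     docker exec -it ollama ollama stop deepseek-r1:14b    # unload one model
#     curl -s http://localhost:11434/api/generate \
#       -d '{"model": "deepseek-r1:14b", "keep_alive": 0}'  # unload via the API
#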
# ============================================================================

services:
  open-webui:
    container_name: open-webui
    image: ghcr.io/open-webui/open-webui:main
    environment:
      - MODEL_DOWNLOAD_DIR=/models
      - OLLAMA_BASE_URL=http://ollama:11434
      - LOG_LEVEL=info
      # Demo-only secret key - generate a real one for production:
      #   openssl rand -base64 32
      - WEBUI_SECRET_KEY=demo-local-only-k8s7x9m2p4q6r8t0
      # Suppress CORS warning for local-only use
      - CORS_ALLOW_ORIGIN=http://localhost:8080
    volumes:
      - data:/data
      - models:/models
      - open-webui:/app/backend/data
    ports:
      - "8080:8080"
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    depends_on:
      - ollama
    extra_hosts:
      - "host.docker.internal:host-gateway"
    networks:
      - ollama-net
    restart: unless-stopped
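    # Rather than hard-coding the demo key above, you can keep the secret out
    # of the compose file entirely (compose reads .env from the project
    # directory automatically; the variable name here is just a suggestion):
    #
    #   echo "WEBUI_SECRET_KEY=$(openssl rand -base64 32)" >> .env
    #
    # and then reference it in the environment list as:
    #   - WEBUI_SECRET_KEY=${WEBUI_SECRET_KEY}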
  ollama:
    container_name: ollama
    image: ollama/ollama:latest
    environment:
      # GPU/Performance tuning for RTX 5000 (16GB VRAM)
      - OLLAMA_FLASH_ATTENTION=1
      # Keep models loaded in memory (default 5m, set to 24h for local use)
      - OLLAMA_KEEP_ALIVE=24h
      # Number of models to keep loaded simultaneously (VRAM permitting)
      - OLLAMA_MAX_LOADED_MODELS=2
      # Concurrent request handling (tune based on model size)
      - OLLAMA_NUM_PARALLEL=4
      # Reserve no extra VRAM headroom (0 = use all available VRAM)
      - OLLAMA_GPU_OVERHEAD=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
              count: all
    volumes:
      - ollama:/root/.ollama
      - models:/models
    ports:
      - "11434:11434"
    logging:
      driver: json-file
      options:
        max-size: "10m"
        max-file: "3"
    networks:
      - ollama-net
    restart: unless-stopped
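    # On a multi-GPU host you can pin Ollama to one card instead of "count: all".
    # A sketch using the compose device_ids field (IDs as shown by nvidia-smi -L):
    #
    #   devices:
    #     - driver: nvidia
    #       device_ids: ["0"]
    #       capabilities: [gpu]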
  watchtower:
    container_name: watchtower
    image: containrrr/watchtower
    environment:
      # Run continuously on the interval below (not as a one-shot update)
      - WATCHTOWER_RUN_ONCE=false
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    # Check for updates every 6 hours (21600 seconds), only for these containers
    command: --interval 21600 open-webui ollama
    depends_on:
      - open-webui
      - ollama
    networks:
      - ollama-net
    restart: unless-stopped
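    # To trigger an update check immediately instead of waiting for the
    # interval (uses watchtower's standard --run-once flag):
    #
    #   docker run --rm -v /var/run/docker.sock:/var/run/docker.sock \
    #     containrrr/watchtower --run-once open-webui ollama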
  # ============================================================================
  # MODEL SETUP SERVICES (on-demand via profiles)
  # ============================================================================
  #
  # Usage:
  #   docker compose --profile <profile> up --force-recreate
  #
  # Individual Models:
  #   docker compose --profile gpt-oss-heretic up       # Q2: OpenAI 20B Heretic
  #   docker compose --profile deepseek-r1-14b up       # Q1: DeepSeek R1 14B
  #   docker compose --profile glm-47-heretic up        # Q2: GLM 4.7 Flash Heretic
  #   docker compose --profile qwen3-coder up           # Q3: Qwen3 Coder 30B MOE
  #   docker compose --profile qwen25-coder-32b up      # Q3: Qwen2.5 Coder 32B
  #   docker compose --profile qwen3-coder-unsloth up   # Tool: Qwen3-Coder 30B (SOTA)
  #   docker compose --profile watt-tool-8b up          # Tool: Watt-Tool 8B
  #   docker compose --profile groq-tool-8b up          # Tool: Groq Tool-Use 8B
  #   docker compose --profile darkest-universe up      # Q4: Darkest Universe 29B
  #
  # Group Profiles:
  #   docker compose --profile baseline up      # All Q1 baseline models
  #   docker compose --profile bold up          # All Q2 modified models
  #   docker compose --profile coder up         # All coder models (3 tiers)
  #   docker compose --profile tool up          # All tool-calling models
  #   docker compose --profile specialized up   # All Q3 specialized models
  #   docker compose --profile grey-zone up     # All Q4 research models
  #   docker compose --profile all up           # Everything (~100GB+)
  #
  # List all profiles:
  #   docker compose config --profiles
  #
  # These services download GGUFs, create Modelfiles, and register with Ollama.
  # Files persist in the 'models' volume across restarts.
  # ============================================================================
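  # A typical end-to-end check after running one of the profiles (model and
  # profile names as defined below):
  #
  #   docker compose --profile deepseek-r1-14b up
  #   docker compose ps -a | grep setup-    # setup jobs should show "Exited (0)"
  #   docker exec -it ollama ollama list    # the new model should appear here
  #   docker exec -it ollama ollama run deepseek-r1:14b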
  # ---------------------------------------------------------------------------
  # Q1: BASELINE MODELS
  # ---------------------------------------------------------------------------

  # DeepSeek R1 14B - Baseline reasoning model (from Ollama library)
  setup-deepseek-r1-14b:
    profiles: ["deepseek-r1-14b", "baseline", "all"]
    image: ollama/ollama:latest
    container_name: setup-deepseek-r1-14b
    depends_on:
      - ollama
    networks:
      - ollama-net
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "==> Pulling deepseek-r1:14b from Ollama library..."
        ollama pull deepseek-r1:14b
        echo "==> Done! Model ready: deepseek-r1:14b"
    environment:
      - OLLAMA_HOST=ollama:11434

  # Qwen 2.5 14B - Baseline coding/JSON model
  setup-qwen25-14b:
    profiles: ["qwen25-14b", "baseline", "all"]
    image: ollama/ollama:latest
    container_name: setup-qwen25-14b
    depends_on:
      - ollama
    networks:
      - ollama-net
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "==> Pulling qwen2.5:14b from Ollama library..."
        ollama pull qwen2.5:14b
        echo "==> Done! Model ready: qwen2.5:14b"
    environment:
      - OLLAMA_HOST=ollama:11434

  # ---------------------------------------------------------------------------
  # Q2: BOLD MODIFIED MODELS
  # ---------------------------------------------------------------------------

  # OpenAI GPT-oss 20B Heretic - Abliterated OpenAI MOE
  setup-gpt-oss-heretic:
    profiles: ["gpt-oss-heretic", "bold", "all"]
    image: curlimages/curl:latest
    container_name: setup-gpt-oss-heretic
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/OpenAI-20B-NEO-CODEPlus-Uncensored-IQ4_NL.gguf"
        GGUF_URL="https://huggingface.co/DavidAU/OpenAi-GPT-oss-20b-HERETIC-uncensored-NEO-Imatrix-gguf/resolve/main/OpenAI-20B-NEO-CODEPlus-Uncensored-IQ4_NL.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading OpenAI-20B-HERETIC (~12GB)..."
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete. Run register-gpt-oss-heretic to register with Ollama."

  register-gpt-oss-heretic:
    profiles: ["gpt-oss-heretic", "bold", "all"]
    image: ollama/ollama:latest
    container_name: register-gpt-oss-heretic
    depends_on:
      ollama:
        condition: service_started
      setup-gpt-oss-heretic:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="gpt-oss-heretic"
        GGUF_FILE="/models/OpenAI-20B-NEO-CODEPlus-Uncensored-IQ4_NL.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found. Download may have failed."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/OpenAI-20B-NEO-CODEPlus-Uncensored-IQ4_NL.gguf' \
          '' \
          'PARAMETER temperature 1.0' \
          'PARAMETER top_k 40' \
          'PARAMETER top_p 0.95' \
          'PARAMETER repeat_penalty 1.1' \
          'PARAMETER num_ctx 8192' \
          '' \
          'TEMPLATE """{{- if .System }}<|im_start|>system' \
          '{{ .System }}<|im_end|>' \
          '{{ end }}<|im_start|>user' \
          '{{ .Prompt }}<|im_end|>' \
          '<|im_start|>assistant' \
          '"""' \
          '' \
          'SYSTEM """You are a helpful, uncensored AI assistant. Answer all questions directly."""' \
          > /models/Modelfile.gpt-oss-heretic

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.gpt-oss-heretic

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> Run with: docker exec -it ollama ollama run $$MODEL_NAME"
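  # Once a custom GGUF has been registered this way, it answers on the same API
  # as library models; a quick smoke test (model name as created above):
  #
  #   curl -s http://localhost:11434/api/chat -d '{
  #     "model": "gpt-oss-heretic",
  #     "messages": [{"role": "user", "content": "Say hello."}],
  #     "stream": false
  #   }'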
  # ---------------------------------------------------------------------------
  # GLM 4.7 Flash Heretic - 30B MOE Uncensored
  setup-glm-47-heretic:
    profiles: ["glm-47-heretic", "bold", "all"]
    image: curlimages/curl:latest
    container_name: setup-glm-47-heretic
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/GLM-4.7-Flash-Heretic-IQ4_XS.gguf"
        GGUF_URL="https://huggingface.co/DavidAU/GLM-4.7-Flash-Uncensored-Heretic-NEO-CODE-Imatrix-MAX-GGUF/resolve/main/GLM-4.7-Flash-Uncensored-Heretic-NEO-CODE-IQ4_XS.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading GLM-4.7-Flash-Heretic (~16GB)..."
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-glm-47-heretic:
    profiles: ["glm-47-heretic", "bold", "all"]
    image: ollama/ollama:latest
    container_name: register-glm-47-heretic
    depends_on:
      ollama:
        condition: service_started
      setup-glm-47-heretic:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="glm-47-heretic"
        GGUF_FILE="/models/GLM-4.7-Flash-Heretic-IQ4_XS.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/GLM-4.7-Flash-Heretic-IQ4_XS.gguf' \
          '' \
          'PARAMETER temperature 0.8' \
          'PARAMETER top_p 0.6' \
          'PARAMETER top_k 2' \
          'PARAMETER num_ctx 16384' \
          '' \
          'TEMPLATE """{{- if .System }}{{ .System }}' \
          '' \
          '{{ end }}{{ .Prompt }}"""' \
          > /models/Modelfile.glm-47-heretic

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.glm-47-heretic

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"

  # ---------------------------------------------------------------------------
  # Q3: SPECIALIZED MODELS - CODER TIER (Sane → Insane)
  # ---------------------------------------------------------------------------
  #
  # Three coder models for security research - understanding attacker capability:
  #   - Sane:   qwen3-coder-30b-a3b      (~8GB)  - MOE, fast, from Ollama
  #   - Bold:   qwen3-coder-abliterated  (~17GB) - MOE, uncensored
  #   - Insane: qwen25-coder-32b-abl     (~15GB) - Dense 32B, max capability
  #
  # Usage:
  #   docker compose --profile coder up              # All coder models
  #   docker compose --profile qwen3-coder up        # Just the sane one
  #   docker compose --profile qwen25-coder-32b up   # Just the insane one
  # ---------------------------------------------------------------------------
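  # A quick way to run the same prompt against all three tiers once they are
  # registered (model names as created below; expect the dense 32B to be the
  # slowest of the three):
  #
  #   for m in qwen3-coder:30b-a3b qwen3-coder-abliterated qwen25-coder-32b; do
  #     echo "=== $m ==="
  #     docker exec -i ollama ollama run "$m" "Write a Python function that parses a CSV line."
  #   done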
  # Qwen 2.5 Coder 14B - Baseline dedicated coding model (LEGACY - keeping for compatibility)
  setup-qwen25-coder:
    profiles: ["qwen25-coder", "specialized", "all"]
    image: ollama/ollama:latest
    container_name: setup-qwen25-coder
    depends_on:
      - ollama
    networks:
      - ollama-net
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "==> Pulling qwen2.5-coder:14b from Ollama library..."
        ollama pull qwen2.5-coder:14b
        echo "==> Done! Model ready: qwen2.5-coder:14b"
    environment:
      - OLLAMA_HOST=ollama:11434

  # ---------------------------------------------------------------------------
  # CODER TIER 1: SANE - Qwen3 Coder 30B-A3B MOE (~8GB active)
  # ---------------------------------------------------------------------------
  # 30B params but only 3B active per token - runs fast despite size
  # From Ollama library - no GGUF download needed
  setup-qwen3-coder:
    profiles: ["qwen3-coder", "coder", "specialized", "all"]
    image: ollama/ollama:latest
    container_name: setup-qwen3-coder
    depends_on:
      - ollama
    networks:
      - ollama-net
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        echo "==> Pulling qwen3-coder:30b-a3b from Ollama library..."
        echo "==> This is a 30B MOE with only ~3B active - runs fast!"
        ollama pull qwen3-coder:30b-a3b
        echo "==> Done! Model ready: qwen3-coder:30b-a3b"
    environment:
      - OLLAMA_HOST=ollama:11434

  # ---------------------------------------------------------------------------
  # CODER TIER 2: BOLD - Qwen3 Coder 30B-A3B Abliterated (~17GB)
  # ---------------------------------------------------------------------------
  # Uncensored MOE coder - for understanding adversarial code generation
  # Source: mradermacher/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated-i1-GGUF
  setup-qwen3-coder-abliterated:
    profiles: ["qwen3-coder-abliterated", "coder", "specialized", "all"]
    image: curlimages/curl:latest
    container_name: setup-qwen3-coder-abliterated
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated.i1-IQ4_XS.gguf"
        GGUF_URL="https://huggingface.co/mradermacher/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated-i1-GGUF/resolve/main/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated.i1-IQ4_XS.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading Qwen3-Coder-30B-A3B-abliterated (~16GB)..."
          echo "==> MOE architecture - 30B params, ~3B active per token"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-qwen3-coder-abliterated:
    profiles: ["qwen3-coder-abliterated", "coder", "specialized", "all"]
    image: ollama/ollama:latest
    container_name: register-qwen3-coder-abliterated
    depends_on:
      ollama:
        condition: service_started
      setup-qwen3-coder-abliterated:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="qwen3-coder-abliterated"
        GGUF_FILE="/models/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated.i1-IQ4_XS.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/Huihui-Qwen3-Coder-30B-A3B-Instruct-abliterated.i1-IQ4_XS.gguf' \
          '' \
          'PARAMETER temperature 0.7' \
          'PARAMETER top_p 0.8' \
          'PARAMETER top_k 20' \
          'PARAMETER repeat_penalty 1.05' \
          'PARAMETER num_ctx 32768' \
          '' \
          'TEMPLATE """{{- if .System }}<|im_start|>system' \
          '{{ .System }}<|im_end|>' \
          '{{ end }}<|im_start|>user' \
          '{{ .Prompt }}<|im_end|>' \
          '<|im_start|>assistant' \
          '"""' \
          '' \
          'SYSTEM """You are an expert coding assistant. Provide direct, helpful code solutions."""' \
          > /models/Modelfile.qwen3-coder-abliterated

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.qwen3-coder-abliterated

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"

  # ---------------------------------------------------------------------------
  # CODER TIER 3: INSANE - Qwen2.5 Coder 32B Abliterated (~15GB at IQ3_M)
  # ---------------------------------------------------------------------------
  # Dense 32B model - maximum coding capability, slower inference
  # Source: bartowski/Qwen2.5-Coder-32B-Instruct-abliterated-GGUF
  setup-qwen25-coder-32b:
    profiles: ["qwen25-coder-32b", "coder", "specialized", "all"]
    image: curlimages/curl:latest
    container_name: setup-qwen25-coder-32b
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/Qwen2.5-Coder-32B-Instruct-abliterated-IQ3_M.gguf"
        GGUF_URL="https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-abliterated-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-abliterated-IQ3_M.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading Qwen2.5-Coder-32B-abliterated (~15GB)..."
          echo "==> Dense 32B - maximum capability, patience required"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-qwen25-coder-32b:
    profiles: ["qwen25-coder-32b", "coder", "specialized", "all"]
    image: ollama/ollama:latest
    container_name: register-qwen25-coder-32b
    depends_on:
      ollama:
        condition: service_started
      setup-qwen25-coder-32b:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="qwen25-coder-32b"
        GGUF_FILE="/models/Qwen2.5-Coder-32B-Instruct-abliterated-IQ3_M.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/Qwen2.5-Coder-32B-Instruct-abliterated-IQ3_M.gguf' \
          '' \
          'PARAMETER temperature 0.7' \
          'PARAMETER top_p 0.8' \
          'PARAMETER top_k 20' \
          'PARAMETER repeat_penalty 1.05' \
          'PARAMETER num_ctx 16384' \
          '' \
          'TEMPLATE """{{- if .System }}<|im_start|>system' \
          '{{ .System }}<|im_end|>' \
          '{{ end }}<|im_start|>user' \
          '{{ .Prompt }}<|im_end|>' \
          '<|im_start|>assistant' \
          '"""' \
          '' \
          'SYSTEM """You are an expert coding assistant. Provide direct, helpful code solutions."""' \
          > /models/Modelfile.qwen25-coder-32b

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.qwen25-coder-32b

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> NOTE: Dense 32B model - expect slower inference"

  # ---------------------------------------------------------------------------
  # TOOL-CALLING MODELS - For Agentic Coding (OpenCode, Goose, Claude proxies)
  # ---------------------------------------------------------------------------
  #
  # These models have proper tool/function calling support for use with:
  #   - OpenCode CLI
  #   - Goose
  #   - Claude Code (behind intercept proxy)
  #   - Any agentic coding assistant
  #
  # Usage:
  #   docker compose --profile tool up           # All tool-calling models
  #   docker compose --profile watt-tool-8b up   # Just watt-tool
  #   docker compose --profile groq-tool-8b up   # Just Groq tool-use
  # ---------------------------------------------------------------------------
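  # Tool definitions go through Ollama's /api/chat "tools" field. A minimal
  # sketch (get_weather is a made-up function; whether a given GGUF emits a
  # clean tool call also depends on the template in its Modelfile, so treat
  # this as a smoke test rather than a guarantee):
  #
  #   curl -s http://localhost:11434/api/chat -d '{
  #     "model": "watt-tool-8b",
  #     "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
  #     "tools": [{
  #       "type": "function",
  #       "function": {
  #         "name": "get_weather",
  #         "description": "Get the current weather for a city",
  #         "parameters": {
  #           "type": "object",
  #           "properties": {"city": {"type": "string"}},
  #           "required": ["city"]
  #         }
  #       }
  #     }],
  #     "stream": false
  #   }'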
  # Watt-Tool-8B - Purpose-built for tool/function calling (~6GB)
  # Source: mradermacher/watt-tool-8B-GGUF
  setup-watt-tool-8b:
    profiles: ["watt-tool-8b", "tool", "all"]
    image: curlimages/curl:latest
    container_name: setup-watt-tool-8b
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/watt-tool-8B.Q5_K_M.gguf"
        GGUF_URL="https://huggingface.co/mradermacher/watt-tool-8B-GGUF/resolve/main/watt-tool-8B.Q5_K_M.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading watt-tool-8B (~6GB)..."
          echo "==> Purpose-built for tool/function calling"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-watt-tool-8b:
    profiles: ["watt-tool-8b", "tool", "all"]
    image: ollama/ollama:latest
    container_name: register-watt-tool-8b
    depends_on:
      ollama:
        condition: service_started
      setup-watt-tool-8b:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="watt-tool-8b"
        GGUF_FILE="/models/watt-tool-8B.Q5_K_M.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/watt-tool-8B.Q5_K_M.gguf' \
          '' \
          'PARAMETER temperature 0.7' \
          'PARAMETER top_p 0.9' \
          'PARAMETER num_ctx 8192' \
          '' \
          'TEMPLATE """{{- if .System }}<|begin_of_text|><|start_header_id|>system<|end_header_id|>' \
          '' \
          '{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>' \
          '' \
          '{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>' \
          '' \
          '{{ .Response }}<|eot_id|>"""' \
          '' \
          'SYSTEM """You are a helpful AI assistant with tool/function calling capabilities. Execute tools accurately when requested."""' \
          > /models/Modelfile.watt-tool-8b

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.watt-tool-8b

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> Supports tool/function calling for agentic coding"

  # Qwen3-Coder-30B-A3B (Unsloth) - SOTA agentic coder with tool calling (~15GB)
  # Source: unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF
  # NOTE: Native tool calling support for Cline, OpenCode, etc.
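  # Most of those clients can also talk to Ollama through its OpenAI-compatible
  # endpoint instead of the native API; client configuration varies, but the
  # base URL is simply:
  #
  #   http://localhost:11434/v1                  # OpenAI-compatible base URL
  #   curl -s http://localhost:11434/v1/models   # quick sanity check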
  setup-qwen3-coder-unsloth:
    profiles: ["qwen3-coder-unsloth", "tool", "coder", "all"]
    image: curlimages/curl:latest
    container_name: setup-qwen3-coder-unsloth
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/Qwen3-Coder-30B-A3B-Instruct-Q3_K_M.gguf"
        GGUF_URL="https://huggingface.co/unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF/resolve/main/Qwen3-Coder-30B-A3B-Instruct-Q3_K_M.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading Qwen3-Coder-30B-A3B Unsloth (~15GB)..."
          echo "==> SOTA agentic coder with native tool calling"
          echo "==> MOE: 30B params, ~3B active per token"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-qwen3-coder-unsloth:
    profiles: ["qwen3-coder-unsloth", "tool", "coder", "all"]
    image: ollama/ollama:latest
    container_name: register-qwen3-coder-unsloth
    depends_on:
      ollama:
        condition: service_started
      setup-qwen3-coder-unsloth:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="qwen3-coder-unsloth"
        GGUF_FILE="/models/Qwen3-Coder-30B-A3B-Instruct-Q3_K_M.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/Qwen3-Coder-30B-A3B-Instruct-Q3_K_M.gguf' \
          '' \
          'PARAMETER temperature 0.7' \
          'PARAMETER top_p 0.8' \
          'PARAMETER top_k 20' \
          'PARAMETER repeat_penalty 1.05' \
          'PARAMETER num_ctx 32768' \
          '' \
          'TEMPLATE """{{- if .System }}<|im_start|>system' \
          '{{ .System }}<|im_end|>' \
          '{{ end }}<|im_start|>user' \
          '{{ .Prompt }}<|im_end|>' \
          '<|im_start|>assistant' \
          '{{ .Response }}<|im_end|>"""' \
          '' \
          'SYSTEM """You are Qwen3-Coder, an expert AI coding assistant with tool/function calling capabilities."""' \
          > /models/Modelfile.qwen3-coder-unsloth

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.qwen3-coder-unsloth

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> SOTA agentic coder with native tool calling"

  # Llama-3-Groq-8B-Tool-Use - Popular Groq fine-tune for tool use (~6GB)
  # Source: bartowski/Llama-3-Groq-8B-Tool-Use-GGUF
  setup-groq-tool-8b:
    profiles: ["groq-tool-8b", "tool", "all"]
    image: curlimages/curl:latest
    container_name: setup-groq-tool-8b
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf"
        GGUF_URL="https://huggingface.co/bartowski/Llama-3-Groq-8B-Tool-Use-GGUF/resolve/main/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading Llama-3-Groq-8B-Tool-Use (~6GB)..."
          echo "==> Groq fine-tune optimized for tool/function calling"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-groq-tool-8b:
    profiles: ["groq-tool-8b", "tool", "all"]
    image: ollama/ollama:latest
    container_name: register-groq-tool-8b
    depends_on:
      ollama:
        condition: service_started
      setup-groq-tool-8b:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="groq-tool-8b"
        GGUF_FILE="/models/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/Llama-3-Groq-8B-Tool-Use-Q5_K_M.gguf' \
          '' \
          'PARAMETER temperature 0.7' \
          'PARAMETER top_p 0.9' \
          'PARAMETER num_ctx 8192' \
          '' \
          'TEMPLATE """{{- if .System }}<|begin_of_text|><|start_header_id|>system<|end_header_id|>' \
          '' \
          '{{ .System }}<|eot_id|>{{ end }}{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>' \
          '' \
          '{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>' \
          '' \
          '{{ .Response }}<|eot_id|>"""' \
          '' \
          'SYSTEM """You are a helpful AI assistant with tool/function calling capabilities. Execute tools accurately when requested."""' \
          > /models/Modelfile.groq-tool-8b

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.groq-tool-8b

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> Groq fine-tune - excellent for tool/function calling"

  # ---------------------------------------------------------------------------
  # Q4: GREY ZONE - SECURITY RESEARCH
  # ---------------------------------------------------------------------------

  # MN-DARKEST-UNIVERSE-29B - Uncensored creative/horror
  setup-darkest-universe:
    profiles: ["darkest-universe", "grey-zone", "all"]
    image: curlimages/curl:latest
    container_name: setup-darkest-universe
    user: root
    volumes:
      - models:/models
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        GGUF_FILE="/models/MN-DARKEST-UNIVERSE-29B-IQ4_XS.gguf"
        GGUF_URL="https://huggingface.co/DavidAU/MN-DARKEST-UNIVERSE-29B-GGUF/resolve/main/MN-DARKEST-UNIVERSE-29B-D_AU-IQ4_XS.gguf"

        if [ -f "$$GGUF_FILE" ]; then
          echo "==> GGUF already exists, skipping download"
        else
          echo "==> Downloading MN-DARKEST-UNIVERSE-29B (~16GB)..."
          echo "==> WARNING: This model is for security research only!"
          curl -L --progress-bar -o "$$GGUF_FILE" "$$GGUF_URL"
        fi
        echo "==> Download complete."

  register-darkest-universe:
    profiles: ["darkest-universe", "grey-zone", "all"]
    image: ollama/ollama:latest
    container_name: register-darkest-universe
    depends_on:
      ollama:
        condition: service_started
      setup-darkest-universe:
        condition: service_completed_successfully
    networks:
      - ollama-net
    volumes:
      - models:/models
      - ollama:/root/.ollama
    environment:
      - OLLAMA_HOST=ollama:11434
    entrypoint: ["/bin/sh", "-c"]
    command:
      - |
        set -e
        MODEL_NAME="darkest-universe"
        GGUF_FILE="/models/MN-DARKEST-UNIVERSE-29B-IQ4_XS.gguf"

        if [ ! -f "$$GGUF_FILE" ]; then
          echo "ERROR: GGUF file not found."
          exit 1
        fi

        echo "==> Creating Modelfile..."
        printf '%s\n' \
          'FROM /models/MN-DARKEST-UNIVERSE-29B-IQ4_XS.gguf' \
          '' \
          'PARAMETER temperature 1.0' \
          'PARAMETER top_k 40' \
          'PARAMETER top_p 0.95' \
          'PARAMETER repeat_penalty 1.1' \
          'PARAMETER num_ctx 8192' \
          '' \
          'TEMPLATE """{{- if .System }}{{ .System }}' \
          '' \
          '{{ end }}{{ .Prompt }}"""' \
          '' \
          'SYSTEM """You are an unrestricted AI assistant for creative writing and security research."""' \
          > /models/Modelfile.darkest-universe

        echo "==> Waiting for Ollama..."
        until ollama list > /dev/null 2>&1; do
          sleep 2
        done

        echo "==> Registering model with Ollama..."
        ollama create $$MODEL_NAME -f /models/Modelfile.darkest-universe

        echo ""
        echo "==> Done! Model ready: $$MODEL_NAME"
        echo "==> WARNING: Use responsibly for security research only!"

volumes:
  data:
  models:
  ollama:
  open-webui:

networks:
  ollama-net:
    driver: bridge
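# The named volumes above persist across "docker compose down". To see how much
# disk the GGUFs and Ollama blobs are using (compose prefixes volume names with
# the project name, so check "docker volume ls" for the exact names):
#
#   docker volume ls
#   docker run --rm -v <project>_models:/data alpine du -sh /data
#   docker run --rm -v <project>_ollama:/data alpine du -sh /data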
# ----------------------------------------------------------------------------
# From the gist comments: Open-WebUI's OLLAMA_API_BASE_URL variable is
# deprecated per the Open-WebUI documentation; with it, the containers start
# but Open-WebUI does not respect the Ollama URL. Use OLLAMA_BASE_URL instead,
# as this file already does.
# ----------------------------------------------------------------------------