# This configuration is provided AS IS, without ANY guarantees that it will work.
# I highly recommend that you only use it if you understand what it's doing.
# Don't blame me if it doesn't work. Don't ask for support.
# Use it at your own risk.
# You have been warned.
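#
# What follows is a single manifest with three objects: a PersistentVolumeClaim
# for GGUF model storage, a ClusterIP Service on port 8080, and a Deployment
# that downloads the listed models from Hugging Face on startup and then runs
# llama-server (Vulkan build) against an AMD GPU.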
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-cpp-models
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: storage
spec:
  accessModes:
    - ReadWriteOnce
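  # Replace <your-class> with a StorageClass that actually exists in your cluster.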
  storageClassName: <your-class>
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Service
metadata:
  name: llama-cpp
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: service
spec:
  type: ClusterIP
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
      name: http
  selector:
    app.kubernetes.io/name: llama-cpp
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-cpp
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: inference
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-cpp
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-cpp
    spec:
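      # Pin the pod to a node exposing this AMD GPU PCI device ID; the label is
      # assumed to come from the AMD GPU node labeller. Adjust the value (or drop
      # the selector entirely) to match your hardware.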
      nodeSelector:
        amd.com/gpu.device-id: "1586"
      containers:
        - name: llama-cpp
          # renovate: datasource=docker depName=ghcr.io/ggml-org/llama.cpp
          image: ghcr.io/ggml-org/llama.cpp:full-vulkan-b7083
          command:
            - /bin/bash
            - -c
            - |
              # Install huggingface-cli and tools
              echo "Installing dependencies..."
              apt-get update && apt-get install -y python3-pip nvtop
              pip3 install --break-system-packages huggingface-hub
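              # nvtop is presumably only installed for interactive GPU monitoring
              # via `kubectl exec`; nothing in this script calls it.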
              # List of models to download
              # Format: "repo/filename" or "repo" to download all GGUF files
              # Add or remove models here
              MODELS=(
                "bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
              )
              # Download all models
              FIRST_DOWNLOADED=""
              for MODEL_SPEC in "${MODELS[@]}"; do
                # Check if filename is specified (contains 3 parts when split by /)
                if [[ "$MODEL_SPEC" == */*/* ]]; then
                  # Specific file: "repo/filename"
                  REPO="${MODEL_SPEC%/*}"
                  FILENAME="${MODEL_SPEC##*/}"
                  if [ ! -f "/data/$FILENAME" ]; then
                    echo "Downloading $FILENAME from $REPO..."
                    huggingface-cli download \
                      "$REPO" \
                      "$FILENAME" \
                      --local-dir /data \
                      --local-dir-use-symlinks False
                    echo "✓ $FILENAME downloaded successfully!"
                  else
                    echo "✓ $FILENAME already exists"
                  fi
                  # Track first downloaded file for default
                  if [ -z "$FIRST_DOWNLOADED" ]; then
                    FIRST_DOWNLOADED="$FILENAME"
                  fi
                else
                  # Entire repo: download all GGUF files
                  REPO="$MODEL_SPEC"
                  echo "Downloading all GGUF models from $REPO..."
                  huggingface-cli download \
                    "$REPO" \
                    --include "*.gguf" \
                    --local-dir /data \
                    --local-dir-use-symlinks False
                  echo "✓ All models from $REPO downloaded successfully!"
                  # Track first gguf file for default
                  if [ -z "$FIRST_DOWNLOADED" ]; then
                    FIRST_DOWNLOADED=$(ls /data/*.gguf 2>/dev/null | head -1 | xargs basename)
                  fi
                fi
              done
              # Default to first model (set DEFAULT_MODEL env var to override)
              MODEL_NAME="${DEFAULT_MODEL:-$FIRST_DOWNLOADED}"
              MODEL_PATH="/data/$MODEL_NAME"
              echo ""
              echo "Models available in /data:"
              ls -lh /data/*.gguf || echo "No GGUF models found!"
              echo ""
              echo "Starting server with model: $MODEL_NAME"
              # Start llama-server HTTP API
              echo "Starting llama-server on port 8080..."
              echo "Model: $MODEL_PATH"
              echo "Context size: 8192 tokens"
              exec /app/llama-server \
                -m "$MODEL_PATH" \
                --host 0.0.0.0 \
                --port 8080 \
                -c 8192 \
                -ngl 999
          env:
            - name: HSA_OVERRIDE_GFX_VERSION
              value: "11.5.1"
            - name: GGML_CUDA_NO_INIT
              value: "1"
            - name: GGML_HIP_NO_INIT
              value: "1"
            # DEFAULT_MODEL picks which downloaded file the server loads;
            # remove it to fall back to the first model downloaded above.
            - name: DEFAULT_MODEL
              value: "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
          ports:
            - containerPort: 8080
              name: http
              protocol: TCP
          volumeMounts:
            - name: models
              mountPath: /data
            - name: dri
              mountPath: /dev/dri
            - name: kfd
              mountPath: /dev/kfd
          resources:
            requests:
              cpu: 2000m
              memory: 8Gi
            limits:
              cpu: 8000m
              memory: 32Gi
          securityContext:
            privileged: true
            capabilities:
              add:
                - SYS_PTRACE
      hostIPC: true
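      # The hostPath volumes below hand the container the GPU device nodes
      # directly (/dev/dri for the render nodes, /dev/kfd for the ROCm compute
      # interface). Together with privileged: true this sidesteps the AMD GPU
      # device plugin's resource accounting; simple, but the pod gets raw
      # access to every GPU on the node.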
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpp-models
        - name: dri
          hostPath:
            path: /dev/dri
        - name: kfd
          hostPath:
            path: /dev/kfd
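# ---------------------------------------------------------------------------
# Rough usage sketch (assumes your kubectl context points at the right cluster
# and that you saved this manifest as llama-cpp.yaml; that file name is just an
# example). The "llama-cpp" namespace is referenced above but never created,
# so create it first. llama-server speaks an OpenAI-compatible HTTP API, so a
# port-forward plus curl is enough to verify the deployment:
#
#   kubectl create namespace llama-cpp
#   kubectl apply -f llama-cpp.yaml
#   kubectl -n llama-cpp port-forward svc/llama-cpp 8080:8080
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages":[{"role":"user","content":"Hello!"}]}'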