Supporting material for my blog post: https://mteixeira.wordpress.com/2025/12/12/running-ollama-and-llama-cpp-on-talos-linux-on-an-amd-strix-halo-cpu/
# This configuration is provided AS IS, without ANY guarantees that it will work.
# I highly recommend that you only use it if you understand what it's doing.
# Don't blame me if it doesn't work. Don't ask for support.
# Use it at your own risk.
# You have been warned.
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: llama-cpp-models
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: storage
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: <your-class>
  resources:
    requests:
      storage: 50Gi
---
apiVersion: v1
kind: Service
metadata:
  name: llama-cpp
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: service
spec:
  type: ClusterIP
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
      name: http
  selector:
    app.kubernetes.io/name: llama-cpp
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: llama-cpp
  namespace: llama-cpp
  labels:
    app.kubernetes.io/name: llama-cpp
    app.kubernetes.io/component: inference
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: llama-cpp
  template:
    metadata:
      labels:
        app.kubernetes.io/name: llama-cpp
    spec:
      nodeSelector:
        amd.com/gpu.device-id: "1586"
      containers:
        - name: llama-cpp
          # renovate: datasource=docker depName=ghcr.io/ggml-org/llama.cpp
          image: ghcr.io/ggml-org/llama.cpp:full-vulkan-b7083
          command:
            - /bin/bash
            - -c
            - |
              # Install huggingface-cli and tools
              echo "Installing dependencies..."
              apt-get update && apt-get install -y python3-pip nvtop
              pip3 install --break-system-packages huggingface-hub

              # List of models to download
              # Format: "repo/filename" for a single file, or "repo" to download all GGUF files
              # Add or remove models here
              MODELS=(
                "bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf"
              )

              # Download all models
              FIRST_DOWNLOADED=""
              for MODEL_SPEC in "${MODELS[@]}"; do
                # Check whether a filename is specified (spec has at least three /-separated parts)
                if [[ "$MODEL_SPEC" == */*/* ]]; then
                  # Specific file: "repo/filename"
                  REPO="${MODEL_SPEC%/*}"
                  FILENAME="${MODEL_SPEC##*/}"
                  if [ ! -f "/data/$FILENAME" ]; then
                    echo "Downloading $FILENAME from $REPO..."
                    huggingface-cli download \
                      "$REPO" \
                      "$FILENAME" \
                      --local-dir /data \
                      --local-dir-use-symlinks False
                    echo "✓ $FILENAME downloaded successfully!"
                  else
                    echo "✓ $FILENAME already exists"
                  fi
                  # Track the first downloaded file as the default
                  if [ -z "$FIRST_DOWNLOADED" ]; then
                    FIRST_DOWNLOADED="$FILENAME"
                  fi
                else
                  # Entire repo: download all GGUF files
                  REPO="$MODEL_SPEC"
                  echo "Downloading all GGUF models from $REPO..."
                  huggingface-cli download \
                    "$REPO" \
                    --include "*.gguf" \
                    --local-dir /data \
                    --local-dir-use-symlinks False
                  echo "✓ All models from $REPO downloaded successfully!"
                  # Track the first GGUF file as the default
                  if [ -z "$FIRST_DOWNLOADED" ]; then
                    FIRST_DOWNLOADED=$(ls /data/*.gguf 2>/dev/null | head -1 | xargs basename)
                  fi
                fi
              done

              # Default to the first model (set the DEFAULT_MODEL env var to override)
              MODEL_NAME="${DEFAULT_MODEL:-$FIRST_DOWNLOADED}"
              MODEL_PATH="/data/$MODEL_NAME"
              echo ""
              echo "Models available in /data:"
              ls -lh /data/*.gguf || echo "No GGUF models found!"
              echo ""
              echo "Starting server with model: $MODEL_NAME"

              # Start the llama-server HTTP API
              echo "Starting llama-server on port 8080..."
              echo "Model: $MODEL_PATH"
              echo "Context size: 8192 tokens"
              exec /app/llama-server \
                -m "$MODEL_PATH" \
                --host 0.0.0.0 \
                --port 8080 \
                -c 8192 \
                -ngl 999
          env:
            - name: HSA_OVERRIDE_GFX_VERSION
              value: "11.5.1"
            - name: GGML_CUDA_NO_INIT
              value: "1"
            - name: GGML_HIP_NO_INIT
              value: "1"
            # Remove this entry to fall back to the first downloaded model:
            - name: DEFAULT_MODEL
              value: "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
          ports:
            - containerPort: 8080
              name: http
              protocol: TCP
          volumeMounts:
            - name: models
              mountPath: /data
            - name: dri
              mountPath: /dev/dri
            - name: kfd
              mountPath: /dev/kfd
          resources:
            requests:
              cpu: 2000m
              memory: 8Gi
            limits:
              cpu: 8000m
              memory: 32Gi
          securityContext:
            privileged: true
            capabilities:
              add:
                - SYS_PTRACE
      hostIPC: true
      volumes:
        - name: models
          persistentVolumeClaim:
            claimName: llama-cpp-models
        - name: dri
          hostPath:
            path: /dev/dri
        - name: kfd
          hostPath:
            path: /dev/kfd
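
A minimal sketch of how this manifest could be applied and smoke-tested, assuming it is saved as llama-cpp.yaml (a hypothetical filename) and that the llama-cpp namespace is created separately, since the manifest references it but does not define it. The endpoints used are llama-server's standard /health and OpenAI-compatible /v1/chat/completions routes; adjust the prompt and ports to taste.

    # Create the namespace, apply the manifest, and watch the rollout
    kubectl create namespace llama-cpp
    kubectl apply -f llama-cpp.yaml
    kubectl -n llama-cpp rollout status deployment/llama-cpp

    # Follow the logs while the model downloads and llama-server starts
    kubectl -n llama-cpp logs -f deployment/llama-cpp

    # Forward the ClusterIP service locally and exercise the API
    kubectl -n llama-cpp port-forward svc/llama-cpp 8080:8080 &
    curl http://localhost:8080/health
    curl http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}]}'

The first rollout can take a while because the model is downloaded into the PersistentVolumeClaim on startup; subsequent restarts reuse the cached GGUF files in /data.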