Last active
March 13, 2026 16:24
-
-
Save shang-vikas/1adbd287b46a15fbd0e714609bd979b3 to your computer and use it in GitHub Desktop.
build llama.cpp, download coder & reasoning models, set up LiteLLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

########################################
# CONFIG
########################################
# Install layout: sources are cloned under SRC_DIR, the built
# llama-server binary is copied into BIN_DIR. readonly guards the
# constants against accidental reassignment later in the script.
readonly PREFIX="/opt/llama"
readonly SRC_DIR="$PREFIX/src"
readonly BIN_DIR="$PREFIX/bin"

########################################
# LOG
########################################
log() {
  # Print a status line prefixed with a HH:MM:SS timestamp.
  # Fixes: backticks replaced with $(...) (nestable, modern form), and
  # "$1" replaced with "$*" so multi-word unquoted messages are not
  # truncated and zero-arg calls do not trip `set -u` — this also matches
  # the log() helper used by the companion setup-lite-llm.sh script.
  echo "[$(date '+%H:%M:%S')] $*"
}
| ######################################## | |
| # INSTALL PACKAGE IF MISSING | |
| ######################################## | |
ensure_pkg() {
  # Install the given apt package unless dpkg already records it.
  local pkg=$1
  if dpkg -s "$pkg" >/dev/null 2>&1; then
    log "$pkg already installed"
    return
  fi
  log "Installing $pkg"
  apt-get update
  apt-get install -y "$pkg"
}
| ######################################## | |
| # INSTALL BUILD DEPENDENCIES | |
| ######################################## | |
install_build_tools() {
  # Core toolchain required to clone and compile llama.cpp.
  local tool
  for tool in build-essential cmake git pkg-config curl wget; do
    ensure_pkg "$tool"
  done
}
| ######################################## | |
| # DETECT CUDA VERSION | |
| ######################################## | |
detect_cuda_version() {
  # Echo the CUDA major version parsed from `nvcc --version`; return 1
  # when nvcc is absent. The human-readable log line goes to stderr so
  # callers can capture just the major number via $(...).
  # Side effects: sets the CUDA_VERSION and CUDA_MAJOR globals.
  # NOTE(review): not invoked by the main flow below — ensure_cublas
  # re-detects the version itself; kept for external callers.
  if ! command -v nvcc >/dev/null 2>&1; then
    log "CUDA compiler not found"
    return 1
  fi
  CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  CUDA_MAJOR=${CUDA_VERSION%%.*}
  log "Detected CUDA version: $CUDA_VERSION" >&2
  echo "$CUDA_MAJOR"
}
remove_cuda_repo() {
  # Strip NVIDIA apt sources and the cuda-keyring package, then rebuild
  # apt metadata from scratch.
  # NOTE(review): destructive (wipes /var/lib/apt/lists) and not called
  # by the main flow — appears to be a manual recovery helper.
  log "Removing existing CUDA repositories"

  # Drop the repo list files.
  rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/nvidia*.list

  # Drop the keyring package when present.
  if dpkg -l | grep -q cuda-keyring; then
    log "Removing cuda-keyring package"
    apt-get remove -y cuda-keyring
  fi

  # Clear cached metadata and refresh.
  rm -rf /var/lib/apt/lists/*
  apt-get update -y
  log "CUDA repositories removed"
}
| ######################################## | |
| # ADD NVIDIA REPO IF NEEDED | |
| ######################################## | |
ensure_cuda_repo() {
  # Configure NVIDIA's CUDA apt repository (via the cuda-keyring package)
  # if no CUDA repo is present yet. Exits the whole script on an
  # unsupported Ubuntu release.
  if grep -R "developer.download.nvidia.com/compute/cuda/repos" \
       /etc/apt/sources.list /etc/apt/sources.list.d 2>/dev/null | grep -q cuda; then
    log "CUDA repository already configured"
    return
  fi

  log "Adding NVIDIA CUDA repository"

  # Map the Ubuntu release to NVIDIA's repo path component.
  . /etc/os-release
  local repo
  case "$VERSION_ID" in
    "20.04") repo="ubuntu2004" ;;
    "22.04") repo="ubuntu2204" ;;
    "24.04") repo="ubuntu2404" ;;
    *)
      log "Unsupported distro version: $VERSION_ID"
      exit 1
      ;;
  esac

  local keyring="cuda-keyring_1.1-1_all.deb"
  local url="https://developer.download.nvidia.com/compute/cuda/repos/${repo}/x86_64/${keyring}"

  # Fix: download into an unpredictable mktemp dir instead of a fixed
  # /tmp filename (symlink/clobber-safe), and clean it up on every exit
  # path from this function — the old code leaked the .deb if dpkg failed.
  local tmpdir
  tmpdir=$(mktemp -d)
  trap 'rm -rf -- "$tmpdir"' RETURN

  log "Downloading CUDA keyring from $url"
  wget -q "$url" -O "$tmpdir/$keyring"
  dpkg -i "$tmpdir/$keyring"
  apt-get update -y
  log "CUDA repository installed successfully"
}
| ######################################## | |
| # ENSURE CUBLAS DEV LIBS | |
| ######################################## | |
ensure_cublas() {
  # Ensure cuBLAS (via the CUDA toolkit meta-package) is installed.
  #
  # Fix 1: the old early-return ran `ldconfig -p | grep -q cuda-toolkit`,
  # but ldconfig lists library names (libcublas.so, ...), never the
  # string "cuda-toolkit" — the check could never fire, so the repo and
  # install steps re-ran on every invocation. Grep for libcublas instead.
  if ldconfig -p | grep -q libcublas; then
    log "cuBLAS already installed"
    return
  fi

  # Detect the CUDA version: prefer nvcc, fall back to nvidia-smi.
  if command -v nvcc >/dev/null 2>&1; then
    CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  elif command -v nvidia-smi >/dev/null 2>&1; then
    # Fix 2: parse by pattern instead of a fixed column ($9), which broke
    # whenever nvidia-smi's banner layout shifted.
    CUDA_VERSION=$(nvidia-smi | sed -n 's/.*CUDA Version: *\([0-9.]*\).*/\1/p' | head -n1)
  else
    log "ERROR: Could not detect CUDA version"
    exit 1
  fi
  CUDA_MAJOR=${CUDA_VERSION%%.*}
  log "Detected CUDA version: $CUDA_VERSION"

  ensure_cuda_repo
  apt-get update -y >/dev/null

  # Only CUDA 12 and 13 repos carry the packages this script expects.
  if [ "$CUDA_MAJOR" != "12" ] && [ "$CUDA_MAJOR" != "13" ]; then
    log "ERROR: Unsupported CUDA major version $CUDA_MAJOR"
    exit 1
  fi

  # Fix 3: the old code computed libcublas package names, logged that it
  # was installing them, then actually installed cuda-toolkit-$CUDA_MAJOR.
  # Keep the toolkit install (the build also needs nvcc, and the toolkit
  # pulls in the cuBLAS runtime + dev libs) and make the log truthful.
  log "Installing cuda-toolkit-$CUDA_MAJOR (provides cuBLAS runtime and dev libraries)"
  apt-get install -y "cuda-toolkit-$CUDA_MAJOR"
  ldconfig
}
| ######################################## | |
| # BUILD LLAMA.CPP | |
| ######################################## | |
build_llamacpp() {
  # Clone llama.cpp (first run only), compile it with CUDA enabled, and
  # install the llama-server binary into BIN_DIR.
  mkdir -p "$SRC_DIR" "$BIN_DIR"
  cd "$SRC_DIR"

  if [ ! -d "llama.cpp" ]; then
    log "Cloning llama.cpp"
    git clone https://github.com/ggml-org/llama.cpp.git
  fi
  cd llama.cpp

  log "Building llama.cpp"
  # Always rebuild from a clean tree so stale cmake caches cannot leak in.
  rm -rf build
  mkdir build
  cd build
  cmake .. -DGGML_CUDA=ON -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release
  cmake --build . -j"$(nproc)"

  cp bin/llama-server "$BIN_DIR/"
  chmod +x "$BIN_DIR/llama-server"
  log "llama-server installed at $BIN_DIR/llama-server"
}
########################################
# MAIN
########################################
log "Installing build tools"
install_build_tools

log "Ensuring cuBLAS libraries"
ensure_cublas

log "Building llama.cpp"
build_llamacpp

# Final banner with the installed binary path and a usage example.
printf '%s\n' \
  "" \
  "----------------------------------------" \
  "llama.cpp server ready" \
  "" \
  "Binary:" \
  "$BIN_DIR/llama-server" \
  "" \
  "Example:" \
  "$BIN_DIR/llama-server -m model.gguf -ngl 999 -c 32768 --port 8080" \
  "----------------------------------------"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Download the coder and reasoning GGUF models into MODEL_DIR using
# aria2c (segmented, resumable downloads).
set -euo pipefail

MODEL_DIR="/opt/llama/models"

echo "Creating model directory"
mkdir -p "$MODEL_DIR"
cd "$MODEL_DIR"

############################
# Ensure aria2 installed
############################
if ! command -v aria2c >/dev/null 2>&1; then
  echo "Installing aria2"
  apt-get update -y
  apt-get install -y aria2
fi

############################
# Download helper
############################
# fetch <output-filename> <url>
# 16 connections/segments, 1 MiB chunks, -c resumes partial downloads.
fetch() {
  aria2c -x 16 -s 16 -k 1M -c -o "$1" -d "$MODEL_DIR" "$2"
}

############################
# Download CODING model
############################
echo "Downloading CODING model (Qwen2.5-Coder-32B-Instruct Q6)"
fetch Qwen2.5-Coder-32B-Instruct-Q6_K.gguf \
  "https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"

############################
# Download REASONING model
############################
echo "Downloading REASONING model (DeepSeek-R1 Distill Q6)"
fetch DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf \
  "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"

############################
# Done
############################
echo ""
echo "Download complete."
echo ""
ls -lh "$MODEL_DIR"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Orchestrates the full local-LLM setup: model downloads run in the
# background while llama.cpp builds in the foreground, then LiteLLM starts.
#
# Fix: the original used plain `set -e`, so a failing build or setup
# script piped through `tee` was silently masked — the pipeline's exit
# status was tee's (0). pipefail propagates the real failure; -u catches
# typoed variables.
set -euo pipefail

echo "===================================="
echo " Starting Local LLM Stack Setup"
echo "===================================="

ROOT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="$ROOT_DIR/logs"
mkdir -p "$LOG_DIR"

########################################
# 1. Start model downloads in background
########################################
echo "Starting model downloads (background)..."
bash "$ROOT_DIR/download-models.sh" \
  > "$LOG_DIR/download.log" 2>&1 &
DOWNLOAD_PID=$!
echo "Download PID: $DOWNLOAD_PID"

########################################
# 2. Build llama.cpp while models download
########################################
echo "Building llama.cpp..."
bash "$ROOT_DIR/build-llm-cpp.sh" \
  | tee "$LOG_DIR/build.log"
echo "llama.cpp build finished."

########################################
# 3. Wait for models if still downloading
########################################
# wait propagates the download script's exit status, so a failed
# download aborts here instead of starting the router on missing models.
echo "Waiting for model downloads to finish..."
wait "$DOWNLOAD_PID"
echo "Model downloads completed."

########################################
# 4. Start LiteLLM
########################################
echo "Starting LiteLLM..."
bash "$ROOT_DIR/setup-lite-llm.sh" \
  | tee "$LOG_DIR/litellm.log"

echo ""
echo "===================================="
echo " LLM Stack Ready"
echo "===================================="
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

############################
# CONFIG
############################
# Paths produced by build-llm-cpp.sh and download-models.sh; readonly
# guards the constants against accidental reassignment below.
readonly MODEL_DIR="/opt/llama/models"
readonly LLAMA_BIN="/opt/llama/bin/llama-server"
readonly PREFIX="/opt/litellm"
readonly CONFIG="$PREFIX/router.yaml"

readonly REASON_MODEL="$MODEL_DIR/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"
readonly CODER_MODEL="$MODEL_DIR/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"

# Backend llama-server ports and the public LiteLLM router port.
readonly REASON_PORT=9000
readonly CODER_PORT=9001
readonly ROUTER_PORT=8000
| ############################ | |
| # LOGGING | |
| ############################ | |
log() {
  # Emit a status message prefixed with a HH:MM:SS timestamp.
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
| ############################ | |
| # UTIL | |
| ############################ | |
port_open() {
  # True when a TCP listener is bound to the given local port.
  # Relies on ss's "sport = :PORT" filter; grep for the LISTEN state so
  # the header line alone never counts as a match.
  local port=$1
  ss -ltn "sport = :${port}" | grep -q LISTEN
}
wait_for_service() {
  # Poll an OpenAI-compatible /v1/models endpoint until it answers.
  #   $1 - port to probe on localhost
  #   $2 - optional number of 1-second attempts (default 60, as before)
  # Returns 0 once the endpoint responds, 1 on timeout.
  local port=$1
  local attempts=${2:-60}
  local i
  for (( i = 0; i < attempts; i++ )); do
    if curl -s "http://localhost:${port}/v1/models" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
############################
# DEPENDENCIES
############################
log "Ensuring LiteLLM installed"
if ! command -v litellm >/dev/null 2>&1; then
  pip install -q --upgrade pip
  pip install -q "litellm[proxy]" websockets uvicorn pyyaml
fi

############################
# VALIDATE MODELS
############################
# Fail fast before launching any server if either GGUF file is absent.
for model in "$REASON_MODEL" "$CODER_MODEL"; do
  if [ ! -f "$model" ]; then
    echo "Missing model: $model"
    exit 1
  fi
done
############################
# CONFIG FILE
############################
log "Preparing LiteLLM config"
mkdir -p "$PREFIX"

# Two OpenAI-compatible llama-server backends behind one router. The
# heredoc delimiter is unquoted, so ${REASON_PORT}/${CODER_PORT} expand.
cat > "$CONFIG" <<EOF
model_list:
  - model_name: reasoning
    litellm_params:
      model: openai/reasoning
      api_base: http://localhost:${REASON_PORT}/v1
      api_key: none
  - model_name: coder
    litellm_params:
      model: openai/coder
      api_base: http://localhost:${CODER_PORT}/v1
      api_key: none
router_settings:
  routing_strategy: simple-shuffle
EOF
############################
# START MODEL BACKENDS
############################
# Fix: the two llama-server launches were 15-line copies differing only
# in model path, port, alias and log file. One helper removes the
# duplication; log messages are unchanged ($1 keeps the original
# capitalized wording, $2 is the lowercase --alias / config model name).
#   $1 - display label for log lines (e.g. "Reasoning")
#   $2 - alias served to LiteLLM (e.g. "reasoning")
#   $3 - GGUF model path
#   $4 - port to listen on
#   $5 - log file for the backgrounded server
start_llama() {
  local label=$1 alias=$2 model=$3 port=$4 logfile=$5
  if port_open "$port"; then
    log "$label model already running on :$port"
    return
  fi
  log "Starting $alias model"
  "$LLAMA_BIN" \
    -m "$model" \
    -ngl 999 \
    --flash-attn on \
    -c 32768 \
    --port "$port" \
    --host 0.0.0.0 \
    --alias "$alias" \
    --parallel 4 \
    --timeout 600 \
    > "$logfile" 2>&1 &
}

start_llama Reasoning reasoning "$REASON_MODEL" "$REASON_PORT" /tmp/reasoning.log
start_llama Coder coder "$CODER_MODEL" "$CODER_PORT" /tmp/coder.log

############################
# WAIT FOR MODELS
############################
log "Waiting for reasoning model"
wait_for_service "$REASON_PORT" || { echo "Reasoning server failed"; exit 1; }
log "Waiting for coder model"
wait_for_service "$CODER_PORT" || { echo "Coder server failed"; exit 1; }
############################
# START ROUTER
############################
if ! port_open "$ROUTER_PORT"; then
  log "Starting LiteLLM router"
  # Runs in the foreground: the script blocks here while the router serves.
  litellm --config "$CONFIG" --port "$ROUTER_PORT"
else
  log "LiteLLM router already running on :$ROUTER_PORT"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment