Skip to content

Instantly share code, notes, and snippets.

@shang-vikas
Last active March 13, 2026 16:24
Show Gist options
  • Select an option

  • Save shang-vikas/1adbd287b46a15fbd0e714609bd979b3 to your computer and use it in GitHub Desktop.

Select an option

Save shang-vikas/1adbd287b46a15fbd0e714609bd979b3 to your computer and use it in GitHub Desktop.
build llama.cpp, download coder & reasoning models, set up LiteLLM
#!/usr/bin/env bash
set -euo pipefail
########################################
# CONFIG
########################################
# Install prefix for everything this script builds.
PREFIX="/opt/llama"
# Where the llama.cpp sources are cloned and compiled.
SRC_DIR="$PREFIX/src"
# Where the final llama-server binary is installed.
BIN_DIR="$PREFIX/bin"
########################################
# LOG
########################################
log() {
  # Print a timestamped status line, e.g. "[12:34:56] Cloning llama.cpp".
  # Uses $(...) command substitution instead of legacy backticks.
  echo "[$(date '+%H:%M:%S')] $1"
}
########################################
# INSTALL PACKAGE IF MISSING
########################################
ensure_pkg() {
  # Install a single apt package if it is not already present.
  # $1 - package name
  # `local` keeps pkg from leaking into the caller's scope (the original
  # assigned a global).
  local pkg="$1"
  if dpkg -s "$pkg" >/dev/null 2>&1; then
    log "$pkg already installed"
  else
    log "Installing $pkg"
    apt-get update
    apt-get install -y "$pkg"
  fi
}
########################################
# INSTALL BUILD DEPENDENCIES
########################################
install_build_tools() {
  # Install the toolchain needed to build llama.cpp from source.
  local tool
  for tool in build-essential cmake git pkg-config curl wget; do
    ensure_pkg "$tool"
  done
}
########################################
# DETECT CUDA VERSION
########################################
detect_cuda_version() {
  # Print the CUDA major version on stdout; return 1 when nvcc is absent.
  # ALL log output must go to stderr — this function's stdout is meant to
  # be captured via $(detect_cuda_version). The original wrote the
  # "not found" message to stdout, polluting that capture.
  if ! command -v nvcc >/dev/null 2>&1; then
    log "CUDA compiler not found" >&2
    return 1
  fi
  # e.g. "release 12.4, V12.4.131" -> "12.4"
  CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d'.' -f1)
  log "Detected CUDA version: $CUDA_VERSION" >&2
  echo "$CUDA_MAJOR"
}
remove_cuda_repo() {
  # Strip every NVIDIA/CUDA apt source plus the cuda-keyring package,
  # then rebuild the apt metadata from the remaining sources.
  log "Removing existing CUDA repositories"

  # Drop the repo definition files.
  rm -f /etc/apt/sources.list.d/cuda*.list
  rm -f /etc/apt/sources.list.d/nvidia*.list

  # Uninstall the keyring package when it is installed.
  if dpkg -l | grep -q cuda-keyring; then
    log "Removing cuda-keyring package"
    apt-get remove -y cuda-keyring
  fi

  # Flush cached package lists and refresh from the remaining sources.
  rm -rf /var/lib/apt/lists/*
  apt-get update -y
  log "CUDA repositories removed"
}
########################################
# ADD NVIDIA REPO IF NEEDED
########################################
ensure_cuda_repo() {
  # Configure NVIDIA's CUDA apt repository via the cuda-keyring package.
  # No-op when a CUDA repo is already present in the apt sources.
  local version_id cuda_repo keyring url

  if grep -R "developer.download.nvidia.com/compute/cuda/repos" \
      /etc/apt/sources.list /etc/apt/sources.list.d 2>/dev/null | grep -q cuda; then
    log "CUDA repository already configured"
    return
  fi

  log "Adding NVIDIA CUDA repository"
  # Read VERSION_ID in a subshell so the os-release variables (NAME, ID,
  # PRETTY_NAME, ...) do not leak into this script's environment — the
  # original `.`-sourced the whole file globally.
  version_id=$(. /etc/os-release && echo "$VERSION_ID")
  case "$version_id" in
    "22.04") cuda_repo="ubuntu2204" ;;
    "20.04") cuda_repo="ubuntu2004" ;;
    "24.04") cuda_repo="ubuntu2404" ;;
    *)
      log "Unsupported distro version: $version_id"
      exit 1
      ;;
  esac

  keyring="cuda-keyring_1.1-1_all.deb"
  url="https://developer.download.nvidia.com/compute/cuda/repos/${cuda_repo}/x86_64/${keyring}"
  log "Downloading CUDA keyring from $url"
  # Give an explicit message on download failure (set -e would otherwise
  # abort silently).
  wget -q "$url" -O "/tmp/$keyring" || { log "Keyring download failed: $url"; exit 1; }
  dpkg -i "/tmp/$keyring"
  rm -f "/tmp/$keyring"
  apt-get update -y
  log "CUDA repository installed successfully"
}
########################################
# ENSURE CUBLAS DEV LIBS
########################################
ensure_cublas() {
  # Make sure the cuBLAS libraries (needed by the GGML_CUDA build) exist.
  # Detection greps the linker cache for the actual library name — the
  # original grepped for "cuda-toolkit", a package name that never appears
  # in ldconfig output, so the install step always re-ran.
  if ldconfig -p | grep -q libcublas; then
    log "cuBLAS already installed"
    return
  fi

  # Detect CUDA version from nvcc, falling back to the driver (nvidia-smi).
  if command -v nvcc >/dev/null 2>&1; then
    CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  elif command -v nvidia-smi >/dev/null 2>&1; then
    CUDA_VERSION=$(nvidia-smi | grep "CUDA Version" | awk '{print $9}')
  else
    log "ERROR: Could not detect CUDA version"
    exit 1
  fi
  CUDA_MAJOR=$(echo "$CUDA_VERSION" | cut -d'.' -f1)
  log "Detected CUDA version: $CUDA_VERSION"

  ensure_cuda_repo
  apt-get update -y >/dev/null

  if [ "$CUDA_MAJOR" = "12" ]; then
    RUNTIME_PKG="libcublas12"
    DEV_PKG="libcublas12-dev-cuda-12"
  elif [ "$CUDA_MAJOR" = "13" ]; then
    RUNTIME_PKG="libcublas13"
    DEV_PKG="libcublas13-dev-cuda-13"
  else
    log "ERROR: Unsupported CUDA major version $CUDA_MAJOR"
    exit 1
  fi

  # The full toolkit meta-package pulls in the cuBLAS runtime + dev libs;
  # the log now states what is actually installed (the original claimed
  # to install $RUNTIME_PKG/$DEV_PKG but ran a different command).
  log "Installing cuda-toolkit-$CUDA_MAJOR (provides $RUNTIME_PKG $DEV_PKG)"
  apt-get install -y "cuda-toolkit-$CUDA_MAJOR"
  ldconfig
}
########################################
# BUILD LLAMA.CPP
########################################
build_llamacpp() {
  # Clone (shallow — history is not needed for a build) and compile
  # llama.cpp with CUDA enabled, then install the llama-server binary.
  mkdir -p "$SRC_DIR" "$BIN_DIR"
  cd "$SRC_DIR"

  if [ ! -d "llama.cpp" ]; then
    log "Cloning llama.cpp"
    git clone --depth 1 https://github.com/ggml-org/llama.cpp.git
  fi
  cd llama.cpp

  log "Building llama.cpp"
  # Always rebuild from a clean tree so stale CMake caches cannot leak in.
  rm -rf build
  cmake -B build -DGGML_CUDA=ON -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release
  cmake --build build -j "$(nproc)"

  # install(1) copies and sets the mode in one atomic step (was cp+chmod).
  install -m 755 build/bin/llama-server "$BIN_DIR/llama-server"
  log "llama-server installed at $BIN_DIR/llama-server"
}
########################################
# MAIN
########################################
# Drive the three phases in order: toolchain, CUDA libs, build.
log "Installing build tools"
install_build_tools
log "Ensuring cuBLAS libraries"
ensure_cublas
log "Building llama.cpp"
build_llamacpp

# Final usage banner (heredoc instead of a run of echo statements).
cat <<BANNER

----------------------------------------
llama.cpp server ready

Binary:
$BIN_DIR/llama-server

Example:
$BIN_DIR/llama-server -m model.gguf -ngl 999 -c 32768 --port 8080
----------------------------------------
BANNER
#!/usr/bin/env bash
set -euo pipefail

# Download the coder and reasoning GGUF models with aria2 (multi-connection,
# resumable). Re-running the script resumes partial downloads (-c).
MODEL_DIR="/opt/llama/models"

echo "Creating model directory"
mkdir -p "$MODEL_DIR"
cd "$MODEL_DIR"

############################
# Ensure aria2 installed
############################
if ! command -v aria2c >/dev/null 2>&1; then
  echo "Installing aria2"
  apt-get update -y
  apt-get install -y aria2
fi

############################
# Shared download helper
############################
# The two aria2c invocations were identical except for filename/URL —
# deduplicated into one helper.
# $1 - output filename, $2 - source URL
download_model() {
  local outfile="$1" url="$2"
  aria2c \
    -x 16 \
    -s 16 \
    -k 1M \
    -c \
    -o "$outfile" \
    -d "$MODEL_DIR" \
    "$url"
}

############################
# Download CODING model
############################
echo "Downloading CODING model (Qwen2.5-Coder-32B-Instruct Q6)"
download_model Qwen2.5-Coder-32B-Instruct-Q6_K.gguf \
  "https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"

############################
# Download REASONING model
############################
echo "Downloading REASONING model (DeepSeek-R1 Distill Q6)"
download_model DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf \
  "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"

############################
# Done
############################
echo ""
echo "Download complete."
echo ""
ls -lh "$MODEL_DIR"
#!/usr/bin/env bash
# Orchestrate the full stack: download models in the background, build
# llama.cpp in parallel, then start LiteLLM once both have finished.
#
# pipefail is essential here: the child scripts are piped through `tee`,
# and without it a failing build/setup exits 0 (tee's status) and the
# script would keep going. -u matches the sibling scripts' strict mode.
set -euo pipefail

echo "===================================="
echo " Starting Local LLM Stack Setup"
echo "===================================="

ROOT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="$ROOT_DIR/logs"
mkdir -p "$LOG_DIR"

########################################
# 1. Start model downloads in background
########################################
echo "Starting model downloads (background)..."
bash "$ROOT_DIR/download-models.sh" \
  > "$LOG_DIR/download.log" 2>&1 &
DOWNLOAD_PID=$!
echo "Download PID: $DOWNLOAD_PID"

########################################
# 2. Build llama.cpp while models download
########################################
echo "Building llama.cpp..."
bash "$ROOT_DIR/build-llm-cpp.sh" \
  | tee "$LOG_DIR/build.log"
echo "llama.cpp build finished."

########################################
# 3. Wait for models if still downloading
########################################
echo "Waiting for model downloads to finish..."
# `wait` propagates the downloader's exit status; under set -e a failed
# download aborts here instead of starting LiteLLM on missing models.
wait "$DOWNLOAD_PID"
echo "Model downloads completed."

########################################
# 4. Start LiteLLM
########################################
echo "Starting LiteLLM..."
bash "$ROOT_DIR/setup-lite-llm.sh" \
  | tee "$LOG_DIR/litellm.log"

echo ""
echo "===================================="
echo " LLM Stack Ready"
echo "===================================="
#!/usr/bin/env bash
set -euo pipefail
############################
# CONFIG
############################
# Artifacts produced by the build/download scripts — NOTE(review): assumes
# those scripts ran with their default paths; confirm if paths change.
MODEL_DIR="/opt/llama/models"
LLAMA_BIN="/opt/llama/bin/llama-server"
# LiteLLM install prefix and the generated router config path.
PREFIX="/opt/litellm"
CONFIG="$PREFIX/router.yaml"
# GGUF model files served by the two llama-server instances below.
REASON_MODEL="$MODEL_DIR/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"
CODER_MODEL="$MODEL_DIR/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"
# One backend port per model; the LiteLLM router fronts both.
REASON_PORT=9000
CODER_PORT=9001
ROUTER_PORT=8000
############################
# LOGGING
############################
log() {
  # Timestamped status line, e.g. "[12:34:56] Starting coder model".
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
############################
# UTIL
############################
port_open() {
  # Succeeds when a TCP listener is bound to local port $1.
  local port="$1"
  ss -ltn "sport = :${port}" | grep -q LISTEN
}
wait_for_service() {
  # Poll the OpenAI-compatible /v1/models endpoint on localhost:$1 until it
  # answers. $2 (optional, default 60) is the number of one-second attempts
  # — a backward-compatible generalization of the hard-coded {1..60}.
  # Returns 0 once reachable, 1 on timeout.
  local port="$1"
  local attempts="${2:-60}"
  local i
  for ((i = 0; i < attempts; i++)); do
    if curl -s "http://localhost:${port}/v1/models" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
############################
# DEPENDENCIES
############################
log "Ensuring LiteLLM installed"
# Install LiteLLM (with proxy extras) only when the CLI is not already on PATH.
if ! command -v litellm >/dev/null 2>&1; then
pip install -q --upgrade pip
pip install -q "litellm[proxy]" websockets uvicorn pyyaml
fi
############################
# VALIDATE MODELS
############################
# Fail fast if either GGUF file is missing (downloads may not have finished).
[ -f "$REASON_MODEL" ] || { echo "Missing model: $REASON_MODEL"; exit 1; }
[ -f "$CODER_MODEL" ] || { echo "Missing model: $CODER_MODEL"; exit 1; }
############################
# CONFIG FILE
############################
log "Preparing LiteLLM config"
mkdir -p "$PREFIX"
# Write the router config. The YAML must be nested: litellm_params belongs
# *under* each model_list entry — the flat, unindented layout previously
# emitted here was invalid YAML and LiteLLM could not parse it.
cat <<EOF > "$CONFIG"
model_list:
  - model_name: reasoning
    litellm_params:
      model: openai/reasoning
      api_base: http://localhost:${REASON_PORT}/v1
      api_key: none
  - model_name: coder
    litellm_params:
      model: openai/coder
      api_base: http://localhost:${CODER_PORT}/v1
      api_key: none
router_settings:
  routing_strategy: simple-shuffle
EOF
############################
# START REASONING MODEL
############################
if port_open "$REASON_PORT"; then
  log "Reasoning model already running on :$REASON_PORT"
else
  log "Starting reasoning model"
  # Collect the server flags in an array so each option stays one word.
  reason_args=(
    -m "$REASON_MODEL"
    -ngl 999
    --flash-attn on
    -c 32768
    --port "$REASON_PORT"
    --host 0.0.0.0
    --alias reasoning
    --parallel 4
    --timeout 600
  )
  "$LLAMA_BIN" "${reason_args[@]}" > /tmp/reasoning.log 2>&1 &
fi
############################
# START CODER MODEL
############################
if port_open "$CODER_PORT"; then
  log "Coder model already running on :$CODER_PORT"
else
  log "Starting coder model"
  # Collect the server flags in an array so each option stays one word.
  coder_args=(
    -m "$CODER_MODEL"
    -ngl 999
    --flash-attn on
    -c 32768
    --port "$CODER_PORT"
    --host 0.0.0.0
    --alias coder
    --parallel 4
    --timeout 600
  )
  "$LLAMA_BIN" "${coder_args[@]}" > /tmp/coder.log 2>&1 &
fi
############################
# WAIT FOR MODELS
############################
log "Waiting for reasoning model"
# Failure diagnostics go to stderr (the original wrote them to stdout).
wait_for_service "$REASON_PORT" || { echo "Reasoning server failed" >&2; exit 1; }
log "Waiting for coder model"
wait_for_service "$CODER_PORT" || { echo "Coder server failed" >&2; exit 1; }
############################
# START ROUTER
############################
if port_open "$ROUTER_PORT"; then
  log "LiteLLM router already running on :$ROUTER_PORT"
else
  log "Starting LiteLLM router"
  # Runs in the foreground: the script stays attached to the router process.
  litellm --config "$CONFIG" --port "$ROUTER_PORT"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment