Last active
March 13, 2026 16:24
-
-
Save shang-vikas/1adbd287b46a15fbd0e714609bd979b3 to your computer and use it in GitHub Desktop.
build llama.cpp, download coder & reasoning models, set up LiteLLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

########################################
# CONFIG
########################################
# Install layout: sources are cloned under SRC_DIR, the built
# llama-server binary is copied into BIN_DIR. readonly guards the
# constants against accidental reassignment later in the script.
readonly PREFIX="/opt/llama"
readonly SRC_DIR="$PREFIX/src"
readonly BIN_DIR="$PREFIX/bin"

########################################
# LOG
########################################
log() {
  # Print a status line prefixed with a HH:MM:SS timestamp.
  # Fixes: backticks replaced with $(...) (nestable, modern form), and
  # "$1" replaced with "$*" so multi-word unquoted messages are not
  # truncated and zero-arg calls do not trip `set -u` — this also matches
  # the log() helper used by the companion setup-lite-llm.sh script.
  echo "[$(date '+%H:%M:%S')] $*"
}
| ######################################## | |
| # INSTALL PACKAGE IF MISSING | |
| ######################################## | |
ensure_pkg() {
  # Install the given apt package unless dpkg already records it.
  local pkg=$1
  if dpkg -s "$pkg" >/dev/null 2>&1; then
    log "$pkg already installed"
    return
  fi
  log "Installing $pkg"
  apt-get update
  apt-get install -y "$pkg"
}
| ######################################## | |
| # INSTALL BUILD DEPENDENCIES | |
| ######################################## | |
install_build_tools() {
  # Core toolchain required to clone and compile llama.cpp.
  local tool
  for tool in build-essential cmake git pkg-config curl wget; do
    ensure_pkg "$tool"
  done
}
| ######################################## | |
| # DETECT CUDA VERSION | |
| ######################################## | |
detect_cuda_version() {
  # Echo the CUDA major version parsed from `nvcc --version`; return 1
  # when nvcc is absent. The human-readable log line goes to stderr so
  # callers can capture just the major number via $(...).
  # Side effects: sets the CUDA_VERSION and CUDA_MAJOR globals.
  # NOTE(review): not invoked by the main flow below — ensure_cublas
  # re-detects the version itself; kept for external callers.
  if ! command -v nvcc >/dev/null 2>&1; then
    log "CUDA compiler not found"
    return 1
  fi
  CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  CUDA_MAJOR=${CUDA_VERSION%%.*}
  log "Detected CUDA version: $CUDA_VERSION" >&2
  echo "$CUDA_MAJOR"
}
remove_cuda_repo() {
  # Strip NVIDIA apt sources and the cuda-keyring package, then rebuild
  # apt metadata from scratch.
  # NOTE(review): destructive (wipes /var/lib/apt/lists) and not called
  # by the main flow — appears to be a manual recovery helper.
  log "Removing existing CUDA repositories"

  # Drop the repo list files.
  rm -f /etc/apt/sources.list.d/cuda*.list /etc/apt/sources.list.d/nvidia*.list

  # Drop the keyring package when present.
  if dpkg -l | grep -q cuda-keyring; then
    log "Removing cuda-keyring package"
    apt-get remove -y cuda-keyring
  fi

  # Clear cached metadata and refresh.
  rm -rf /var/lib/apt/lists/*
  apt-get update -y
  log "CUDA repositories removed"
}
| ######################################## | |
| # ADD NVIDIA REPO IF NEEDED | |
| ######################################## | |
ensure_cuda_repo() {
  # Configure NVIDIA's CUDA apt repository (via the cuda-keyring package)
  # if no CUDA repo is present yet. Exits the whole script on an
  # unsupported Ubuntu release.
  if grep -R "developer.download.nvidia.com/compute/cuda/repos" \
       /etc/apt/sources.list /etc/apt/sources.list.d 2>/dev/null | grep -q cuda; then
    log "CUDA repository already configured"
    return
  fi

  log "Adding NVIDIA CUDA repository"

  # Map the Ubuntu release to NVIDIA's repo path component.
  . /etc/os-release
  local repo
  case "$VERSION_ID" in
    "20.04") repo="ubuntu2004" ;;
    "22.04") repo="ubuntu2204" ;;
    "24.04") repo="ubuntu2404" ;;
    *)
      log "Unsupported distro version: $VERSION_ID"
      exit 1
      ;;
  esac

  local keyring="cuda-keyring_1.1-1_all.deb"
  local url="https://developer.download.nvidia.com/compute/cuda/repos/${repo}/x86_64/${keyring}"

  # Fix: download into an unpredictable mktemp dir instead of a fixed
  # /tmp filename (symlink/clobber-safe), and clean it up on every exit
  # path from this function — the old code leaked the .deb if dpkg failed.
  local tmpdir
  tmpdir=$(mktemp -d)
  trap 'rm -rf -- "$tmpdir"' RETURN

  log "Downloading CUDA keyring from $url"
  wget -q "$url" -O "$tmpdir/$keyring"
  dpkg -i "$tmpdir/$keyring"
  apt-get update -y
  log "CUDA repository installed successfully"
}
| ######################################## | |
| # ENSURE CUBLAS DEV LIBS | |
| ######################################## | |
ensure_cublas() {
  # Ensure cuBLAS (via the CUDA toolkit meta-package) is installed.
  #
  # Fix 1: the old early-return ran `ldconfig -p | grep -q cuda-toolkit`,
  # but ldconfig lists library names (libcublas.so, ...), never the
  # string "cuda-toolkit" — the check could never fire, so the repo and
  # install steps re-ran on every invocation. Grep for libcublas instead.
  if ldconfig -p | grep -q libcublas; then
    log "cuBLAS already installed"
    return
  fi

  # Detect the CUDA version: prefer nvcc, fall back to nvidia-smi.
  if command -v nvcc >/dev/null 2>&1; then
    CUDA_VERSION=$(nvcc --version | grep release | sed 's/.*release //' | cut -d',' -f1)
  elif command -v nvidia-smi >/dev/null 2>&1; then
    # Fix 2: parse by pattern instead of a fixed column ($9), which broke
    # whenever nvidia-smi's banner layout shifted.
    CUDA_VERSION=$(nvidia-smi | sed -n 's/.*CUDA Version: *\([0-9.]*\).*/\1/p' | head -n1)
  else
    log "ERROR: Could not detect CUDA version"
    exit 1
  fi
  CUDA_MAJOR=${CUDA_VERSION%%.*}
  log "Detected CUDA version: $CUDA_VERSION"

  ensure_cuda_repo
  apt-get update -y >/dev/null

  # Only CUDA 12 and 13 repos carry the packages this script expects.
  if [ "$CUDA_MAJOR" != "12" ] && [ "$CUDA_MAJOR" != "13" ]; then
    log "ERROR: Unsupported CUDA major version $CUDA_MAJOR"
    exit 1
  fi

  # Fix 3: the old code computed libcublas package names, logged that it
  # was installing them, then actually installed cuda-toolkit-$CUDA_MAJOR.
  # Keep the toolkit install (the build also needs nvcc, and the toolkit
  # pulls in the cuBLAS runtime + dev libs) and make the log truthful.
  log "Installing cuda-toolkit-$CUDA_MAJOR (provides cuBLAS runtime and dev libraries)"
  apt-get install -y "cuda-toolkit-$CUDA_MAJOR"
  ldconfig
}
| ######################################## | |
| # BUILD LLAMA.CPP | |
| ######################################## | |
build_llamacpp() {
  # Clone llama.cpp (first run only), compile it with CUDA enabled, and
  # install the llama-server binary into BIN_DIR.
  mkdir -p "$SRC_DIR" "$BIN_DIR"
  cd "$SRC_DIR"

  if [ ! -d "llama.cpp" ]; then
    log "Cloning llama.cpp"
    git clone https://github.com/ggml-org/llama.cpp.git
  fi
  cd llama.cpp

  log "Building llama.cpp"
  # Always rebuild from a clean tree so stale cmake caches cannot leak in.
  rm -rf build
  mkdir build
  cd build
  cmake .. -DGGML_CUDA=ON -DGGML_NATIVE=ON -DCMAKE_BUILD_TYPE=Release
  cmake --build . -j"$(nproc)"

  cp bin/llama-server "$BIN_DIR/"
  chmod +x "$BIN_DIR/llama-server"
  log "llama-server installed at $BIN_DIR/llama-server"
}
########################################
# MAIN
########################################
log "Installing build tools"
install_build_tools

log "Ensuring cuBLAS libraries"
ensure_cublas

log "Building llama.cpp"
build_llamacpp

# Final banner with the installed binary path and a usage example.
printf '%s\n' \
  "" \
  "----------------------------------------" \
  "llama.cpp server ready" \
  "" \
  "Binary:" \
  "$BIN_DIR/llama-server" \
  "" \
  "Example:" \
  "$BIN_DIR/llama-server -m model.gguf -ngl 999 -c 32768 --port 8080" \
  "----------------------------------------"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Download the coder and reasoning GGUF models into MODEL_DIR using
# aria2c (segmented, resumable downloads).
set -euo pipefail

MODEL_DIR="/opt/llama/models"

echo "Creating model directory"
mkdir -p "$MODEL_DIR"
cd "$MODEL_DIR"

############################
# Ensure aria2 installed
############################
if ! command -v aria2c >/dev/null 2>&1; then
  echo "Installing aria2"
  apt-get update -y
  apt-get install -y aria2
fi

############################
# Download helper
############################
# fetch <output-filename> <url>
# 16 connections/segments, 1 MiB chunks, -c resumes partial downloads.
fetch() {
  aria2c -x 16 -s 16 -k 1M -c -o "$1" -d "$MODEL_DIR" "$2"
}

############################
# Download CODING model
############################
echo "Downloading CODING model (Qwen2.5-Coder-32B-Instruct Q6)"
fetch Qwen2.5-Coder-32B-Instruct-Q6_K.gguf \
  "https://huggingface.co/unsloth/Qwen2.5-Coder-32B-Instruct-128K-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"

############################
# Download REASONING model
############################
echo "Downloading REASONING model (DeepSeek-R1 Distill Q6)"
fetch DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf \
  "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-32B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"

############################
# Done
############################
echo ""
echo "Download complete."
echo ""
ls -lh "$MODEL_DIR"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Orchestrates the full local-LLM setup: model downloads run in the
# background while llama.cpp builds in the foreground, then LiteLLM starts.
#
# Fix: the original used plain `set -e`, so a failing build or setup
# script piped through `tee` was silently masked — the pipeline's exit
# status was tee's (0). pipefail propagates the real failure; -u catches
# typoed variables.
set -euo pipefail

echo "===================================="
echo " Starting Local LLM Stack Setup"
echo "===================================="

ROOT_DIR="$(cd "$(dirname "$0")" && pwd)"
LOG_DIR="$ROOT_DIR/logs"
mkdir -p "$LOG_DIR"

########################################
# 1. Start model downloads in background
########################################
echo "Starting model downloads (background)..."
bash "$ROOT_DIR/download-models.sh" \
  > "$LOG_DIR/download.log" 2>&1 &
DOWNLOAD_PID=$!
echo "Download PID: $DOWNLOAD_PID"

########################################
# 2. Build llama.cpp while models download
########################################
echo "Building llama.cpp..."
bash "$ROOT_DIR/build-llm-cpp.sh" \
  | tee "$LOG_DIR/build.log"
echo "llama.cpp build finished."

########################################
# 3. Wait for models if still downloading
########################################
# wait propagates the download script's exit status, so a failed
# download aborts here instead of starting the router on missing models.
echo "Waiting for model downloads to finish..."
wait "$DOWNLOAD_PID"
echo "Model downloads completed."

########################################
# 4. Start LiteLLM
########################################
echo "Starting LiteLLM..."
bash "$ROOT_DIR/setup-lite-llm.sh" \
  | tee "$LOG_DIR/litellm.log"

echo ""
echo "===================================="
echo " LLM Stack Ready"
echo "===================================="
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

############################
# CONFIG
############################
# Paths produced by build-llm-cpp.sh and download-models.sh; readonly
# guards the constants against accidental reassignment below.
readonly MODEL_DIR="/opt/llama/models"
readonly LLAMA_BIN="/opt/llama/bin/llama-server"
readonly PREFIX="/opt/litellm"
readonly CONFIG="$PREFIX/router.yaml"

readonly REASON_MODEL="$MODEL_DIR/DeepSeek-R1-Distill-Qwen-32B-Q6_K.gguf"
readonly CODER_MODEL="$MODEL_DIR/Qwen2.5-Coder-32B-Instruct-Q6_K.gguf"

# Backend llama-server ports and the public LiteLLM router port.
readonly REASON_PORT=9000
readonly CODER_PORT=9001
readonly ROUTER_PORT=8000
| ############################ | |
| # LOGGING | |
| ############################ | |
log() {
  # Emit a status message prefixed with a HH:MM:SS timestamp.
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
| ############################ | |
| # UTIL | |
| ############################ | |
port_open() {
  # True when a TCP listener is bound to the given local port.
  # Relies on ss's "sport = :PORT" filter; grep for the LISTEN state so
  # the header line alone never counts as a match.
  local port=$1
  ss -ltn "sport = :${port}" | grep -q LISTEN
}
wait_for_service() {
  # Poll an OpenAI-compatible /v1/models endpoint until it answers.
  #   $1 - port to probe on localhost
  #   $2 - optional number of 1-second attempts (default 60, as before)
  # Returns 0 once the endpoint responds, 1 on timeout.
  local port=$1
  local attempts=${2:-60}
  local i
  for (( i = 0; i < attempts; i++ )); do
    if curl -s "http://localhost:${port}/v1/models" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
############################
# DEPENDENCIES
############################
log "Ensuring LiteLLM installed"
if ! command -v litellm >/dev/null 2>&1; then
  pip install -q --upgrade pip
  pip install -q "litellm[proxy]" websockets uvicorn pyyaml
fi

############################
# VALIDATE MODELS
############################
# Fail fast before launching any server if either GGUF file is absent.
for model in "$REASON_MODEL" "$CODER_MODEL"; do
  if [ ! -f "$model" ]; then
    echo "Missing model: $model"
    exit 1
  fi
done
############################
# CONFIG FILE
############################
log "Preparing LiteLLM config"
mkdir -p "$PREFIX"

# Two OpenAI-compatible llama-server backends behind one router. The
# heredoc delimiter is unquoted, so ${REASON_PORT}/${CODER_PORT} expand.
cat > "$CONFIG" <<EOF
model_list:
  - model_name: reasoning
    litellm_params:
      model: openai/reasoning
      api_base: http://localhost:${REASON_PORT}/v1
      api_key: none
  - model_name: coder
    litellm_params:
      model: openai/coder
      api_base: http://localhost:${CODER_PORT}/v1
      api_key: none
router_settings:
  routing_strategy: simple-shuffle
EOF
############################
# START MODEL BACKENDS
############################
# Fix: the two llama-server launches were 15-line copies differing only
# in model path, port, alias and log file. One helper removes the
# duplication; log messages are unchanged ($1 keeps the original
# capitalized wording, $2 is the lowercase --alias / config model name).
#   $1 - display label for log lines (e.g. "Reasoning")
#   $2 - alias served to LiteLLM (e.g. "reasoning")
#   $3 - GGUF model path
#   $4 - port to listen on
#   $5 - log file for the backgrounded server
start_llama() {
  local label=$1 alias=$2 model=$3 port=$4 logfile=$5
  if port_open "$port"; then
    log "$label model already running on :$port"
    return
  fi
  log "Starting $alias model"
  "$LLAMA_BIN" \
    -m "$model" \
    -ngl 999 \
    --flash-attn on \
    -c 32768 \
    --port "$port" \
    --host 0.0.0.0 \
    --alias "$alias" \
    --parallel 4 \
    --timeout 600 \
    > "$logfile" 2>&1 &
}

start_llama Reasoning reasoning "$REASON_MODEL" "$REASON_PORT" /tmp/reasoning.log
start_llama Coder coder "$CODER_MODEL" "$CODER_PORT" /tmp/coder.log

############################
# WAIT FOR MODELS
############################
log "Waiting for reasoning model"
wait_for_service "$REASON_PORT" || { echo "Reasoning server failed"; exit 1; }
log "Waiting for coder model"
wait_for_service "$CODER_PORT" || { echo "Coder server failed"; exit 1; }
############################
# START ROUTER
############################
if ! port_open "$ROUTER_PORT"; then
  log "Starting LiteLLM router"
  # Runs in the foreground: the script blocks here while the router serves.
  litellm --config "$CONFIG" --port "$ROUTER_PORT"
else
  log "LiteLLM router already running on :$ROUTER_PORT"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment