llama.cpp setup - Ubuntu 24.04
#!/usr/bin/env bash
set -euo pipefail

###############################################################################
# llama.cpp setup under /opt/llama.cpp - Ubuntu 24.04
# - Unified build: CPU + Vulkan + SYCL (dynamic backends)
# - Installs stable commands into /usr/local/bin
# - Installs backend .so into /usr/local/lib/llama and also next to binaries
#
# IMPORTANT NOTE: For Intel Arc / Lunar Lake SYCL GPU support, follow Intel's
# Client GPU guide (Ubuntu Latest) and install required packages/PPAs:
#   https://dgpu-docs.intel.com/driver/client/overview.html#ubuntu-latest
#
# Success criteria:
#   source /opt/intel/oneapi/setvars.sh
#   sycl-ls   -> shows level_zero:gpu
###############################################################################
# ---- Configurable variables ----
INSTALL_ROOT="/opt/llama.cpp"
SRC_DIR="${INSTALL_ROOT}/src"
BUILD_UNIFIED="${INSTALL_ROOT}/build-unified"
BUILD_CPU="${INSTALL_ROOT}/build-cpu"
LIB_DIR="/usr/local/lib/llama"
BIN_DIR="/usr/local/bin"

# Pin to a specific revision for reproducibility:
# Set LLAMA_CPP_REF to a tag (e.g., "bXXXX") or commit hash
LLAMA_CPP_REF="${LLAMA_CPP_REF:-master}"

# Whether to keep existing /opt/llama.cpp/src if present
KEEP_EXISTING_SRC="${KEEP_EXISTING_SRC:-true}"
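# Example invocation (illustrative only: the tag below is hypothetical and the
# script filename is assumed; substitute a real llama.cpp release tag/commit):
#   LLAMA_CPP_REF=b1234 KEEP_EXISTING_SRC=false ./llama-cpp-setup.sh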
# ---- Helpers ----
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "Missing command: $1"; exit 1; }; }
###############################################################################
# 0) Preconditions
###############################################################################
log "Checking prerequisites"
need_cmd sudo
need_cmd git
need_cmd cmake

if [[ ! -f /opt/intel/oneapi/setvars.sh ]]; then
  echo "ERROR: /opt/intel/oneapi/setvars.sh not found. Install Intel oneAPI first."
  exit 1
fi
source /opt/intel/oneapi/setvars.sh || true
| p="$(ls -d /opt/intel/oneapi/compiler/latest/lib 2>/dev/null || ls -d /opt/intel/oneapi/compiler/*/lib 2>/dev/null | sort -V | tail -n1)"; f="/etc/ld.so.conf.d/intel-oneapi.conf"; sudo mkdir -p "$(dirname "$f")" && (sudo test -f "$f" && sudo grep -Fxq "$p" "$f") || echo "$p" | sudo tee -a "$f" >/dev/null; sudo ldconfig | |
###############################################################################
# 1) System deps (CPU BLAS + Vulkan build deps + Intel compute runtime)
###############################################################################
log "Installing build dependencies (OpenBLAS, Vulkan, Intel compute runtime)"
sudo apt update
sudo apt install -y \
  build-essential git cmake software-properties-common

sudo add-apt-repository -y ppa:kobuk-team/intel-graphics
sudo apt update
sudo apt install -y \
  ninja-build pkg-config libopenblas-dev \
  libvulkan-dev vulkan-tools \
  ocl-icd-libopencl1 clinfo \
  libze-intel-gpu1 libze1 \
  intel-opencl-icd intel-gsc \
  intel-ocloc libze-dev
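# Optional sanity check: the Intel compute stack should now expose the GPU.
#   clinfo -l    # lists OpenCL platforms/devices
#   sycl-ls      # lists SYCL backends (requires the oneAPI env sourced above)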
###############################################################################
# 2) Prepare /opt layout
###############################################################################
log "Preparing ${INSTALL_ROOT}"
sudo mkdir -p "${INSTALL_ROOT}"
sudo chown -R "$USER":"$USER" "${INSTALL_ROOT}"

###############################################################################
# 3) Clone or reuse llama.cpp sources
###############################################################################
if [[ -d "${SRC_DIR}/.git" && "${KEEP_EXISTING_SRC}" == "true" ]]; then
  log "Using existing repo at ${SRC_DIR}"
  cd "${SRC_DIR}"
else
  log "Cloning llama.cpp into ${SRC_DIR}"
  rm -rf "${SRC_DIR}"
  git clone https://github.com/ggml-org/llama.cpp "${SRC_DIR}"
  cd "${SRC_DIR}"
fi

# Ensure the 'origin' remote exists (common when the repo was copied via rsync)
if ! git remote get-url origin >/dev/null 2>&1; then
  log "Remote 'origin' missing; adding it"
  git remote add origin https://github.com/ggml-org/llama.cpp
fi

# Fetch latest refs and tags
git fetch --prune origin --tags

# Detect the default branch from the remote (handles main/master changes)
DEFAULT_BRANCH="$(git remote show origin 2>/dev/null | awk '/HEAD branch/ {print $NF}')"
DEFAULT_BRANCH="${DEFAULT_BRANCH:-main}"

# If LLAMA_CPP_REF is empty, use the detected default branch
LLAMA_CPP_REF="${LLAMA_CPP_REF:-$DEFAULT_BRANCH}"
log "Selecting revision: ${LLAMA_CPP_REF}"
# 1) If it's a remote branch: create/update a local branch tracking origin/<branch>
if git ls-remote --exit-code --heads origin "${LLAMA_CPP_REF}" >/dev/null 2>&1; then
  git checkout -B "${LLAMA_CPP_REF}" "origin/${LLAMA_CPP_REF}"
  git pull --ff-only || true
# 2) If it's a tag: checkout the tag
elif git show-ref --tags --quiet "refs/tags/${LLAMA_CPP_REF}"; then
  git checkout "tags/${LLAMA_CPP_REF}"
# 3) If it's a commit hash (or any valid commit-ish): checkout directly
elif git rev-parse --verify "${LLAMA_CPP_REF}^{commit}" >/dev/null 2>&1; then
  git checkout "${LLAMA_CPP_REF}"
else
  echo "ERROR: LLAMA_CPP_REF='${LLAMA_CPP_REF}' not found as a branch/tag/commit in origin."
  echo "Hint: use a valid branch (e.g. ${DEFAULT_BRANCH}), tag, or commit hash."
  exit 1
fi

log "Recording version info"
git rev-parse --short HEAD | tee "${INSTALL_ROOT}/LLAMA_CPP_COMMIT.txt" >/dev/null
git describe --tags --always 2>/dev/null | tee "${INSTALL_ROOT}/LLAMA_CPP_DESCRIBE.txt" >/dev/null || true
###############################################################################
# 4) Build: Unified (CPU variants + Vulkan + SYCL) with dynamic backends
###############################################################################
log "Building UNIFIED (CPU + Vulkan + SYCL, dynamic backends)"
source /opt/intel/oneapi/setvars.sh >/dev/null 2>&1 || true
rm -rf "${BUILD_UNIFIED}"
cmake -S "${SRC_DIR}" -B "${BUILD_UNIFIED}" -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DBUILD_SHARED_LIBS=ON \
  -DGGML_BACKEND_DL=ON \
  -DGGML_CPU_ALL_VARIANTS=ON \
  -DGGML_BLAS=ON \
  -DGGML_BLAS_VENDOR=OpenBLAS \
  -DGGML_VULKAN=ON \
  -DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx
cmake --build "${BUILD_UNIFIED}" -j
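# Optional sanity check: with GGML_BACKEND_DL=ON the backends are built as
# separate libggml-*.so modules (exact file names vary by llama.cpp version).
ls -1 "${BUILD_UNIFIED}/bin"/libggml-*.so* 2>/dev/null || true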
###############################################################################
# 5) Install binaries and libraries (robust loader setup)
###############################################################################
| log "Installing runtime layout (binaries + backend libs)" | |
| sudo mkdir -p "${LIB_DIR}" | |
| # Install unified standard commands | |
| sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-cli" "${BIN_DIR}/llama-cli" | |
| sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-server" "${BIN_DIR}/llama-server" | |
| # Backend libraries: put them in /usr/local/lib/llama | |
| # and also copy next to /usr/local/bin to avoid loader path issues. | |
| sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${LIB_DIR}/" 2>/dev/null || true | |
| sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${BIN_DIR}/" 2>/dev/null || true | |
| # Optional: ensure /usr/local/lib is in loader cache (usually is) | |
| sudo ldconfig | |
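# Note (assumption, not verified for every build): recent ggml/llama.cpp can
# also discover dynamic backends through the GGML_BACKEND_PATH environment
# variable, e.g.:
#   GGML_BACKEND_PATH=/usr/local/lib/llama llama-cli --list-devices
# The copies placed next to the binaries above make this optional.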
###############################################################################
# 6) Verification
###############################################################################
| log "Verifying installation" | |
| echo "== Version (unified) ==" | |
| "${BIN_DIR}/llama-cli" --version || true | |
| echo | |
| echo "== Devices ==" | |
| "${BIN_DIR}/llama-cli" --list-devices || true | |
| log "Done." | |
cat <<'EOF'
NEXT STEPS / NOTES
------------------

1) Download a small GGUF model for a real smoke test (saved under /opt/llama.cpp/models)
   - Create the models directory:
       sudo mkdir -p /opt/llama.cpp/models
       sudo chown -R "$USER":"$USER" /opt/llama.cpp/models
   - Download a small/fast model (Qwen3-0.6B-Q8_0):
       curl -L -o /opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf \
         https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf
   - Define a convenience variable for commands below:
       MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf"

2) Validate devices and backends (this confirms SYCL + Vulkan + CPU are available)
   - List devices/backends:
       llama-cli --list-devices
   - Run a short prompt on each backend (expect a coherent response):
       llama-cli --device SYCL0   -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device Vulkan0 -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device CPU     -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
   - If a backend fails, rerun with verbose logs:
       llama-cli --verbose --device SYCL0 -m "$MODEL" -p "Hello" -n 16

3) Installed commands policy
   - Only the stable entrypoints are installed system-wide:
       /usr/local/bin/llama-cli
       /usr/local/bin/llama-server
   - All additional tools remain available in:
       /opt/llama.cpp/build-unified/bin
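
4) Optional: smoke-test the installed llama-server
   (the endpoints below assume llama.cpp's defaults: 127.0.0.1:8080 with the
   OpenAI-compatible HTTP API; adjust --host/--port if you change them)
     llama-server -m "$MODEL" --device SYCL0 &
     curl -s http://127.0.0.1:8080/health
     curl -s http://127.0.0.1:8080/v1/chat/completions \
       -H "Content-Type: application/json" \
       -d '{"messages":[{"role":"user","content":"Say hi"}],"max_tokens":16}'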
EOF
bench-sycl-llama-bench.sh - SYCL env screening with llama-bench
#!/usr/bin/env bash
set -euo pipefail

###############################################################################
# bench-sycl-llama-bench.sh
#
# Focused SYCL env screening using llama-bench (non-interactive).
# Prints a final report to stdout (no CSV required).
#
# Example:
#   MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf" DEVICE="SYCL0" REPS=5 \
#   N_PROMPT=256 N_GEN=128 BATCH=1024 UBATCH=256 THREADS=8 VARIANT_SET=8 \
#   ./bench-sycl-llama-bench.sh
###############################################################################
# -----------------------------
# Config (override via env vars)
# -----------------------------
BENCH_BIN="${BENCH_BIN:-/opt/llama.cpp/build-unified/bin/llama-bench}"
MODEL="${MODEL:-/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf}"
DEVICE="${DEVICE:-SYCL0}"            # SYCL0 / Vulkan0 / auto
N_GPU_LAYERS="${N_GPU_LAYERS:-99}"   # 0 for CPU-only, 99 for "as much as possible"
REPS="${REPS:-5}"
N_PROMPT="${N_PROMPT:-512}"
N_GEN="${N_GEN:-256}"
BATCH="${BATCH:-2048}"
UBATCH="${UBATCH:-512}"
THREADS="${THREADS:-8}"
NO_WARMUP="${NO_WARMUP:-0}"          # 1 to skip warmup
DELAY="${DELAY:-0}"                  # seconds between tests
VARIANT_SET="${VARIANT_SET:-6}"      # "6" or "8"
OUT_DIR="${OUT_DIR:-./bench_out}"
RUN_TAG="${RUN_TAG:-sycl_env_screen}"

mkdir -p "${OUT_DIR}"
TS="$(date +%F_%H%M%S)"
# -----------------------------
# Helpers
# -----------------------------
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
die() { echo "ERROR: $*" >&2; exit 1; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || die "Missing command: $1"; }

need_cmd awk
need_cmd grep
need_cmd sed
need_cmd sort
need_cmd head
need_cmd tail

[[ -x "${BENCH_BIN}" ]] || die "llama-bench not found/executable: ${BENCH_BIN}"
[[ -f "${MODEL}" ]] || die "Model file not found: ${MODEL}"

strip_ansi() {
  sed -r \
    -e 's/\x1B\[[0-9;?]*[ -/]*[@-~]//g' \
    -e 's/\x1B\][^\x07]*(\x07|\x1B\\)//g'
}
extract_tps_from_md() {
  local md_file="$1"

  # --- Mode A (your current output):
  # Table rows contain test=ppNN and test=tgNN, and throughput in column "t/s".
  # Example rows:
  #   | ... | test  | t/s            |
  #   | ... | pp128 | 1340.14 ± 0.00 |
  #   | ... | tg64  | 37.32 ± 0.00   |
  # One awk program serves both metrics; the row prefix (pp or tg) is passed
  # in via -v want=...
  local awk_mode_a='
    function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
    function first_float(s){
      gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
      if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
      return ""
    }
    /^\|/ {
      # skip separator rows like | --- |
      if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
      # detect the header containing "test" and "t/s"; record column positions
      if (tolower($0) ~ /\|[[:space:]]*test[[:space:]]*\|/ && tolower($0) ~ /\|[[:space:]]*t\/s[[:space:]]*\|/) {
        n=split($0,a,"|")
        test_i=-1; ts_i=-1
        for (i=2;i<=n-1;i++){
          h=tolower(trim(a[i]))
          if (h=="test") test_i=i
          if (h=="t/s") ts_i=i
        }
        next
      }
      # parse data rows once we know the columns
      if (test_i>0 && ts_i>0) {
        n=split($0,a,"|")
        t=trim(a[test_i])
        if (t ~ ("^" want "[0-9]+$")) { v=first_float(a[ts_i]); if (v!="") { print v; exit } }
      }
    }'

  local pp tg
  pp="$(awk -F'|' -v want="pp" "${awk_mode_a}" "$md_file")"
  tg="$(awk -F'|' -v want="tg" "${awk_mode_a}" "$md_file")"
  if [[ -n "${pp}" && -n "${tg}" ]]; then
    echo "${pp},${tg}"
    return
  fi
  # --- Mode B (alternate output):
  # Columns themselves are pp/tg (rare across versions); keep as fallback.
  local out
  out="$(
    awk '
    function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
    function first_float(s){
      gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
      if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
      return ""
    }
    BEGIN{ got_header=0; pp_i=-1; tg_i=-1 }
    /^\|/ {
      if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
      n=split($0,a,"|")
      # header detection (awk has a built-in tolower; no custom helper needed)
      if (got_header==0) {
        for (i=2;i<=n-1;i++){
          h=tolower(trim(a[i]))
          if (h=="pp") pp_i=i
          if (h=="tg") tg_i=i
        }
        if (pp_i>0 && tg_i>0) { got_header=1; next }
      } else {
        pp=first_float(a[pp_i]); tg=first_float(a[tg_i])
        if (pp!="" && tg!="") { print pp "," tg; exit }
      }
    }
    ' "$md_file"
  )"
  if [[ -n "${out}" ]]; then
    echo "${out}"
    return
  fi

  # --- Final fallback: tok/s or t/s anywhere (best-effort)
  local line nums
  line="$(grep -E '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' "$md_file" | tail -n 1 || true)"
  if [[ -z "${line}" ]]; then
    echo "NA,NA"
    return
  fi
  nums="$(echo "$line" | grep -Eo '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' | head -n 2 | awk '{print $1}' | paste -sd, -)"
  [[ -z "${nums}" ]] && { echo "NA,NA"; return; }
  [[ "${nums}" != *,* ]] && nums="${nums},${nums}"
  echo "${nums}"
}
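# Usage sketch: extract_tps_from_md report.md prints "pp,tg" (for the example
# rows above: "1340.14,37.32") or "NA,NA" when nothing parses.
# float_gt A B: exit status 0 when A > B numerically (awk compares floats).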
float_gt() { awk -v a="$1" -v b="$2" 'BEGIN{exit !(a>b)}'; }

# -----------------------------
# Curated variants (6 or 8)
# -----------------------------
# Format: "graph dnn opt dmmv" where each is 0/1
#   graph = GGML_SYCL_DISABLE_GRAPH
#   dnn   = GGML_SYCL_DISABLE_DNN
#   opt   = GGML_SYCL_DISABLE_OPT
#   dmmv  = GGML_SYCL_PRIORITIZE_DMMV
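# Example: "1 1 0 0" runs llama-bench with GGML_SYCL_DISABLE_GRAPH=1,
# GGML_SYCL_DISABLE_DNN=1, GGML_SYCL_DISABLE_OPT=0, GGML_SYCL_PRIORITIZE_DMMV=0.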
| if [[ "${VARIANT_SET}" == "8" ]]; then | |
| CONFIGS=( | |
| "0 0 0 0" # baseline (graph on, dnn on) | |
| "1 0 0 0" # graph off (strong baseline winner family) | |
| "1 1 0 0" # graph off + dnn off (your current winner) | |
| "1 1 0 1" # graph off + dnn off + dmmv | |
| "1 0 0 1" # graph off + dmmv | |
| "1 0 1 0" # graph off + opt off | |
| "1 0 1 1" # graph off + opt off + dmmv (NEW, good to test) | |
| "0 0 0 1" # graph on + dmmv (control, usually safe) | |
| ) | |
| else | |
| CONFIGS=( | |
| "0 0 0 0" | |
| "1 0 0 0" | |
| "1 0 1 0" | |
| "1 1 0 0" | |
| "1 0 0 1" | |
| "1 1 0 1" | |
| ) | |
| fi | |
| # ----------------------------- | |
| # llama-bench args | |
| # ----------------------------- | |
| BASE_ARGS=( | |
| --model "${MODEL}" | |
| --device "${DEVICE}" | |
| --repetitions "${REPS}" | |
| --n-prompt "${N_PROMPT}" | |
| --n-gen "${N_GEN}" | |
| --batch-size "${BATCH}" | |
| --ubatch-size "${UBATCH}" | |
| --threads "${THREADS}" | |
| --n-gpu-layers "${N_GPU_LAYERS}" | |
| --output md | |
| --progress | |
| ) | |
| [[ "${NO_WARMUP}" == "1" ]] && BASE_ARGS+=( --no-warmup ) | |
| [[ "${DELAY}" != "0" ]] && BASE_ARGS+=( --delay "${DELAY}" ) | |
# -----------------------------
# Run
# -----------------------------
log "Starting llama-bench env screening"
echo "Bench:     ${BENCH_BIN}"
echo "Model:     ${MODEL}"
echo "Device:    ${DEVICE} (N_GPU_LAYERS=${N_GPU_LAYERS})"
echo "Workload:  REPS=${REPS} N_PROMPT=${N_PROMPT} N_GEN=${N_GEN} BATCH=${BATCH} UBATCH=${UBATCH} THREADS=${THREADS}"
echo "Variants:  ${#CONFIGS[@]} (VARIANT_SET=${VARIANT_SET})"
echo "Artifacts: ${OUT_DIR}/"
echo

RESULTS_TSV="${OUT_DIR}/${RUN_TAG}_${TS}_results.tsv"
: > "${RESULTS_TSV}"
printf "cfg_id\ttag\tgraph\tdnn\topt\tdmmv\tpp_tps\ttg_tps\tmd_path\tlog_path\n" >> "${RESULTS_TSV}"

best_gen="0"
best_desc="(none)"
cfg_id=0
for cfg in "${CONFIGS[@]}"; do
  cfg_id=$((cfg_id+1))
  read -r g d o m <<< "${cfg}"
  tag="g${g}_d${d}_o${o}_m${m}"
  md_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.md"
  log_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.log"

  log "Config ${cfg_id}/${#CONFIGS[@]}: ${tag}"
  log "  GGML_SYCL_DISABLE_GRAPH=${g} GGML_SYCL_DISABLE_DNN=${d} GGML_SYCL_DISABLE_OPT=${o} GGML_SYCL_PRIORITIZE_DMMV=${m}"

  # Run bench; never let a crash stop the matrix.
  # - stdout -> md_out
  # - stderr -> log_out
  set +e
  NO_COLOR=1 \
  LLAMA_LOG_COLORS=0 \
  GGML_LOG_COLORS=0 \
  GGML_SYCL_DISABLE_GRAPH="${g}" \
  GGML_SYCL_DISABLE_DNN="${d}" \
  GGML_SYCL_DISABLE_OPT="${o}" \
  GGML_SYCL_PRIORITIZE_DMMV="${m}" \
  "${BENCH_BIN}" "${BASE_ARGS[@]}" >"${md_out}.tmp" 2>"${log_out}.tmp"
  rc=$?
  set -e

  # Strip ANSI after the fact (safe even if the program crashed)
  strip_ansi <"${md_out}.tmp" >"${md_out}" || true
  strip_ansi <"${log_out}.tmp" >"${log_out}" || true
  rm -f "${md_out}.tmp" "${log_out}.tmp"
  if [[ $rc -ne 0 ]]; then
    log "WARNING: Config ${tag} exited non-zero (rc=${rc}). Marking throughput as NA."
    pp_tps="NA"
    tg_tps="NA"
  else
    tps="$(extract_tps_from_md "${md_out}")"
    pp_tps="${tps%,*}"
    tg_tps="${tps#*,}"
  fi

  echo "Result ${cfg_id}: ${tag}"
  echo "  pp (prompt/prefill) = ${pp_tps} tok/s"
  echo "  tg (generation)     = ${tg_tps} tok/s"
  echo "  report = ${md_out}"
  echo "  log    = ${log_out}"
  echo

  printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
    "${cfg_id}" "${tag}" "${g}" "${d}" "${o}" "${m}" "${pp_tps}" "${tg_tps}" "${md_out}" "${log_out}" \
    >> "${RESULTS_TSV}"

  if [[ "${tg_tps}" != "NA" ]]; then
    if [[ "${best_gen}" == "0" ]] || float_gt "${tg_tps}" "${best_gen}"; then
      best_gen="${tg_tps}"
      best_desc="cfg=${cfg_id} ${tag} (tg=${tg_tps} tok/s, pp=${pp_tps} tok/s)"
    fi
  else
    log "WARNING: Could not extract throughput from markdown table."
    log "  Inspect: ${md_out}"
    log "  Inspect: ${log_out}"
  fi
done
| echo "=============================================================================" | |
| echo "FINAL REPORT" | |
| echo "=============================================================================" | |
| echo "Best by generation throughput: ${best_desc}" | |
| echo | |
| echo "Ranking by tg (generation tok/s):" | |
| awk -F'\t' 'NR>1 && $8 != "NA" {print}' "${RESULTS_TSV}" \ | |
| | sort -t$'\t' -k8,8gr \ | |
| | awk -F'\t' '{printf " cfg=%s tg=%s tok/s pp=%s tok/s env=[graph=%s dnn=%s opt=%s dmmv=%s] (%s)\n", $1,$8,$7,$3,$4,$5,$6,$2}' | |
| echo | |
| echo "Artifacts:" | |
| echo " TSV summary: ${RESULTS_TSV}" | |
| echo " Reports: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.md" | |
| echo " Logs: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.log" | |
| echo | |
cat <<'EOF'
NOTES
-----
- 'pp' is prompt/prefill throughput; 'tg' is generation throughput.
- For faster iteration:
    REPS=1 N_PROMPT=128 N_GEN=64 BATCH=512 UBATCH=128 ./bench-sycl-llama-bench.sh
  Then scale up REPS/tokens once you choose the best config.
- CPU-only baseline:
    DEVICE=auto N_GPU_LAYERS=0 ./bench-sycl-llama-bench.sh
- Vulkan comparison:
    DEVICE=Vulkan0 ./bench-sycl-llama-bench.sh
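- Pretty-print the TSV summary (assumes the 'column' tool is installed; on
  Ubuntu it ships in bsdextrautils):
    column -t -s$'\t' ./bench_out/<RUN_TAG>_<timestamp>_results.tsv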
EOF