llama.cpp setup - Ubuntu 24.04
#!/usr/bin/env bash
set -euo pipefail
###############################################################################
# llama.cpp setup under /opt/llama.cpp - Ubuntu 24.04
# - Unified build: CPU + Vulkan + SYCL (dynamic backends)
# - Installs stable commands into /usr/local/bin
# - Installs backend .so into /usr/local/lib/llama and also next to binaries
#
# IMPORTANT NOTE: For Intel Arc / Lunar Lake SYCL GPU support, follow Intel's
# Client GPU guide (Ubuntu Latest) and install required packages/PPAs:
# https://dgpu-docs.intel.com/driver/client/overview.html#ubuntu-latest
#
# Success criteria:
# source /opt/intel/oneapi/setvars.sh
# sycl-ls -> shows level_zero:gpu
###############################################################################
# ---- Configurable variables ----
INSTALL_ROOT="/opt/llama.cpp"
SRC_DIR="${INSTALL_ROOT}/src"
BUILD_UNIFIED="${INSTALL_ROOT}/build-unified"
BUILD_CPU="${INSTALL_ROOT}/build-cpu"
LIB_DIR="/usr/local/lib/llama"
BIN_DIR="/usr/local/bin"
# Pin to a specific revision for reproducibility:
# Set LLAMA_CPP_REF to a tag (e.g., "bXXXX") or commit hash
LLAMA_CPP_REF="${LLAMA_CPP_REF:-master}"
# Whether to keep existing /opt/llama.cpp/src if present
KEEP_EXISTING_SRC="${KEEP_EXISTING_SRC:-true}"
# ---- Helpers ----
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "Missing command: $1"; exit 1; }; }
###############################################################################
# 0) Preconditions
###############################################################################
log "Checking prerequisites"
need_cmd sudo
need_cmd git
need_cmd cmake
if [[ ! -f /opt/intel/oneapi/setvars.sh ]]; then
  echo "ERROR: /opt/intel/oneapi/setvars.sh not found. Install Intel oneAPI first."
  exit 1
fi
source /opt/intel/oneapi/setvars.sh || true
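# Optional sanity check (assumption: sycl-ls ships with oneAPI). It should list
# a level_zero:gpu entry once the Intel GPU driver stack from the Client GPU
# guide above is installed. Non-fatal; printed for inspection only.
if command -v sycl-ls >/dev/null 2>&1; then
  sycl-ls | grep -i "level_zero" || log "WARNING: sycl-ls reports no Level Zero device yet"
fi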
p="$(ls -d /opt/intel/oneapi/compiler/latest/lib 2>/dev/null || ls -d /opt/intel/oneapi/compiler/*/lib 2>/dev/null | sort -V | tail -n1)"; f="/etc/ld.so.conf.d/intel-oneapi.conf"; sudo mkdir -p "$(dirname "$f")" && (sudo test -f "$f" && sudo grep -Fxq "$p" "$f") || echo "$p" | sudo tee -a "$f" >/dev/null; sudo ldconfig
###############################################################################
# 1) System deps (CPU BLAS + Vulkan build deps)
###############################################################################
log "Installing build dependencies (OpenBLAS + Vulkan tools/dev headers)"
sudo apt update
sudo apt install -y \
  build-essential git cmake software-properties-common
sudo add-apt-repository -y ppa:kobuk-team/intel-graphics
sudo apt update
sudo apt install -y \
  ninja-build pkg-config libopenblas-dev \
  libvulkan-dev vulkan-tools \
  ocl-icd-libopencl1 clinfo \
  libze-intel-gpu1 libze1 \
  intel-opencl-icd intel-gsc \
  intel-ocloc libze-dev
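# Optional: confirm the user-space GPU stack is visible (both tools were just
# installed above). Failures are printed but do not abort the script.
vulkaninfo --summary >/dev/null 2>&1 && log "Vulkan ICD OK" || log "WARNING: vulkaninfo --summary failed"
clinfo -l || true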
###############################################################################
# 2) Prepare /opt layout
###############################################################################
log "Preparing ${INSTALL_ROOT}"
sudo mkdir -p "${INSTALL_ROOT}"
sudo chown -R "$USER":"$USER" "${INSTALL_ROOT}"
###############################################################################
# 3) Clone or reuse llama.cpp sources
###############################################################################
if [[ -d "${SRC_DIR}/.git" && "${KEEP_EXISTING_SRC}" == "true" ]]; then
log "Using existing repo at ${SRC_DIR}"
cd "${SRC_DIR}"
else
log "Cloning llama.cpp into ${SRC_DIR}"
rm -rf "${SRC_DIR}"
git clone https://github.com/ggml-org/llama.cpp "${SRC_DIR}"
cd "${SRC_DIR}"
fi
# Ensure the 'origin' remote exists (common when the repo was copied via rsync)
if ! git remote get-url origin >/dev/null 2>&1; then
  log "Remote 'origin' missing; adding it"
  git remote add origin https://github.com/ggml-org/llama.cpp
fi
# Fetch latest refs and tags
git fetch --prune origin --tags
# Detect the default branch from the remote (handles main/master changes)
DEFAULT_BRANCH="$(git remote show origin 2>/dev/null | awk '/HEAD branch/ {print $NF}')"
DEFAULT_BRANCH="${DEFAULT_BRANCH:-main}"
# Fallback: use the detected default branch if LLAMA_CPP_REF is empty
# (normally unreachable, since the default above already pins "master")
LLAMA_CPP_REF="${LLAMA_CPP_REF:-$DEFAULT_BRANCH}"
log "Selecting revision: ${LLAMA_CPP_REF}"
# 1) If it's a remote branch: create/update a local branch tracking origin/<branch>
if git ls-remote --exit-code --heads origin "${LLAMA_CPP_REF}" >/dev/null 2>&1; then
  git checkout -B "${LLAMA_CPP_REF}" "origin/${LLAMA_CPP_REF}"
  git pull --ff-only || true
# 2) If it's a tag: checkout the tag
elif git show-ref --tags --quiet "refs/tags/${LLAMA_CPP_REF}"; then
  git checkout "tags/${LLAMA_CPP_REF}"
# 3) If it's a commit hash (or any valid commit-ish): checkout directly
elif git rev-parse --verify "${LLAMA_CPP_REF}^{commit}" >/dev/null 2>&1; then
  git checkout "${LLAMA_CPP_REF}"
else
  echo "ERROR: LLAMA_CPP_REF='${LLAMA_CPP_REF}' not found as a branch/tag/commit in origin."
  echo "Hint: use a valid branch (e.g. ${DEFAULT_BRANCH}), tag, or commit hash."
  exit 1
fi
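# Example invocations (script name, tag, and hash below are illustrative):
#   LLAMA_CPP_REF=master ./llama-cpp-setup.sh
#   LLAMA_CPP_REF=b4500 ./llama-cpp-setup.sh    # pin to a bXXXX release tag
#   LLAMA_CPP_REF=3f1ae2e ./llama-cpp-setup.sh  # pin to a commit hash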
log "Recording version info"
git rev-parse --short HEAD | tee "${INSTALL_ROOT}/LLAMA_CPP_COMMIT.txt" >/dev/null
git describe --tags --always 2>/dev/null | tee "${INSTALL_ROOT}/LLAMA_CPP_DESCRIBE.txt" >/dev/null || true
###############################################################################
# 4) Build: Unified (CPU variants + Vulkan + SYCL) with dynamic backends
###############################################################################
log "Building UNIFIED (CPU + Vulkan + SYCL, dynamic backends)"
source /opt/intel/oneapi/setvars.sh >/dev/null 2>&1 || true
rm -rf "${BUILD_UNIFIED}"
cmake -S "${SRC_DIR}" -B "${BUILD_UNIFIED}" -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DBUILD_SHARED_LIBS=ON \
  -DGGML_BACKEND_DL=ON \
  -DGGML_CPU_ALL_VARIANTS=ON \
  -DGGML_BLAS=ON \
  -DGGML_BLAS_VENDOR=OpenBLAS \
  -DGGML_VULKAN=ON \
  -DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx
cmake --build "${BUILD_UNIFIED}" -j
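# Sanity check (assumption: with GGML_BACKEND_DL=ON each enabled backend is
# emitted as its own libggml-<backend>.so next to the binaries).
ls -1 "${BUILD_UNIFIED}/bin/"libggml-*.so* >/dev/null 2>&1 \
  || log "WARNING: no libggml-*.so backends found in ${BUILD_UNIFIED}/bin"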
###############################################################################
# 5) Install binaries and libraries (robust loader setup)
###############################################################################
log "Installing runtime layout (binaries + backend libs)"
sudo mkdir -p "${LIB_DIR}"
# Install unified standard commands
sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-cli" "${BIN_DIR}/llama-cli"
sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-server" "${BIN_DIR}/llama-server"
# Backend libraries: put them in /usr/local/lib/llama
# and also copy next to /usr/local/bin to avoid loader path issues.
sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${LIB_DIR}/" 2>/dev/null || true
sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${BIN_DIR}/" 2>/dev/null || true
# Optional: ensure /usr/local/lib is in loader cache (usually is)
sudo ldconfig
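# Note: binaries built with GGML_BACKEND_DL scan the executable's own directory
# for backend .so files; recent ggml builds also honor the GGML_BACKEND_PATH
# environment variable, so if the copies next to the binaries are ever removed
# you can point the loader at ${LIB_DIR} explicitly, e.g.:
#   GGML_BACKEND_PATH=/usr/local/lib/llama llama-cli --list-devices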
###############################################################################
# 6) Verification
###############################################################################
log "Verifying installation"
echo "== Version (unified) =="
"${BIN_DIR}/llama-cli" --version || true
echo
echo "== Devices =="
"${BIN_DIR}/llama-cli" --list-devices || true
log "Done."
cat <<'EOF'
NEXT STEPS / NOTES
------------------
1) Download a small GGUF model for a real smoke test (saved under /opt/llama.cpp/models)
   - Create the models directory:
       sudo mkdir -p /opt/llama.cpp/models
       sudo chown -R "$USER":"$USER" /opt/llama.cpp/models
   - Download a small/fast model (Qwen3-0.6B-Q8_0):
       curl -L -o /opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf \
         https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf
   - Define a convenience variable for the commands below:
       MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf"

2) Validate devices and backends (this confirms SYCL + Vulkan + CPU are available)
   - List devices/backends:
       llama-cli --list-devices
   - Run a short prompt on each backend (expect a coherent response):
       llama-cli --device SYCL0   -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device Vulkan0 -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device CPU     -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
   - If a backend fails, rerun with verbose logs:
       llama-cli --verbose --device SYCL0 -m "$MODEL" -p "Hello" -n 16

3) Installed commands policy
   - Only the stable entrypoints are installed system-wide:
       /usr/local/bin/llama-cli
       /usr/local/bin/llama-server
   - All additional tools remain available in:
       /opt/llama.cpp/build-unified/bin
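
4) Optional: serve the model over HTTP (llama-server exposes an OpenAI-compatible
   API; the host/port below are illustrative, not required values)
   - Start the server:
       llama-server -m "$MODEL" --host 127.0.0.1 --port 8080
   - Smoke-test from another shell:
       curl -s http://127.0.0.1:8080/v1/chat/completions \
         -H "Content-Type: application/json" \
         -d '{"messages":[{"role":"user","content":"Say hi"}],"max_tokens":32}'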
EOF
#!/usr/bin/env bash
set -euo pipefail
###############################################################################
# bench-sycl-llama-bench.sh
#
# Focused SYCL env screening using llama-bench (non-interactive).
# Prints a final report to stdout (no CSV required).
#
# Example:
# MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf" DEVICE="SYCL0" REPS=5 N_PROMPT=256 N_GEN=128 BATCH=1024 UBATCH=256 THREADS=8 VARIANT_SET=8 ./bench-sycl-llama-bench.sh
###############################################################################
# -----------------------------
# Config (override via env vars)
# -----------------------------
BENCH_BIN="${BENCH_BIN:-/opt/llama.cpp/build-unified/bin/llama-bench}"
MODEL="${MODEL:-/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf}"
DEVICE="${DEVICE:-SYCL0}" # SYCL0 / Vulkan0 / auto
N_GPU_LAYERS="${N_GPU_LAYERS:-99}" # 0 for CPU-only, 99 for "as much as possible"
REPS="${REPS:-5}"
N_PROMPT="${N_PROMPT:-512}"
N_GEN="${N_GEN:-256}"
BATCH="${BATCH:-2048}"
UBATCH="${UBATCH:-512}"
THREADS="${THREADS:-8}"
NO_WARMUP="${NO_WARMUP:-0}" # 1 to skip warmup
DELAY="${DELAY:-0}" # seconds between tests
VARIANT_SET="${VARIANT_SET:-6}" # "6" or "8"
OUT_DIR="${OUT_DIR:-./bench_out}"
RUN_TAG="${RUN_TAG:-sycl_env_screen}"
mkdir -p "${OUT_DIR}"
TS="$(date +%F_%H%M%S)"
# -----------------------------
# Helpers
# -----------------------------
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
die() { echo "ERROR: $*" >&2; exit 1; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || die "Missing command: $1"; }
need_cmd awk
need_cmd grep
need_cmd sed
need_cmd sort
need_cmd head
need_cmd tail
[[ -x "${BENCH_BIN}" ]] || die "llama-bench not found/executable: ${BENCH_BIN}"
[[ -f "${MODEL}" ]] || die "Model file not found: ${MODEL}"
strip_ansi() {
  sed -r \
    -e 's/\x1B\[[0-9;?]*[ -/]*[@-~]//g' \
    -e 's/\x1B\][^\x07]*(\x07|\x1B\\)//g'
}
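# Usage: strip_ansi <colored.txt >clean.txt (applied below to llama-bench output)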
extract_tps_from_md() {
  local md_file="$1"
  # --- Mode A (current llama-bench output):
  # Table rows contain test=ppNN and test=tgNN, and throughput in column "t/s".
  # Example rows:
  # | ... | test  | t/s             |
  # | ... | pp128 | 1340.14 ± 0.00  |
  # | ... | tg64  | 37.32 ± 0.00    |
  # A single awk pass locates the "test"/"t/s" columns from the header row,
  # then takes the first ppNN row and the first tgNN row.
  local out
  out="$(
    awk -F'|' '
      function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
      function first_float(s){
        gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
        if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
        return ""
      }
      /^\|/ {
        # skip separator rows like | --- |
        if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
        # detect header with "test" and "t/s"; record column positions
        if (tolower($0) ~ /\|[[:space:]]*test[[:space:]]*\|/ && tolower($0) ~ /\|[[:space:]]*t\/s[[:space:]]*\|/) {
          n=split($0,a,"|")
          test_i=-1; ts_i=-1
          for (i=2;i<=n-1;i++){
            h=tolower(trim(a[i]))
            if (h=="test") test_i=i
            if (h=="t/s") ts_i=i
          }
          next
        }
        # parse data rows once we know the columns
        if (test_i>0 && ts_i>0) {
          n=split($0,a,"|")
          t=trim(a[test_i])
          if (pp=="" && t ~ /^pp[0-9]+$/) pp=first_float(a[ts_i])
          if (tg=="" && t ~ /^tg[0-9]+$/) tg=first_float(a[ts_i])
          if (pp!="" && tg!="") { print pp "," tg; exit }
        }
      }
    ' "$md_file"
  )"
  if [[ -n "${out}" ]]; then
    echo "${out}"
    return
  fi
  # --- Mode B (alternate output):
  # Columns themselves are pp/tg (rare across versions); keep as fallback.
  out="$(
    awk '
      function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
      function first_float(s){
        gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
        if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
        return ""
      }
      BEGIN{ got_header=0; pp_i=-1; tg_i=-1 }
      /^\|/ {
        if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
        n=split($0,a,"|")
        # header detection
        if (got_header==0) {
          for (i=2;i<=n-1;i++){
            h=tolower(trim(a[i]))
            if (h=="pp") pp_i=i
            if (h=="tg") tg_i=i
          }
          if (pp_i>0 && tg_i>0) { got_header=1; next }
        } else {
          pp=first_float(a[pp_i]); tg=first_float(a[tg_i])
          if (pp!="" && tg!="") { print pp "," tg; exit }
        }
      }
    ' "$md_file"
  )"
  if [[ -n "${out}" ]]; then
    echo "${out}"
    return
  fi
  # --- Final fallback: tok/s or t/s anywhere (best-effort)
  local line nums
  line="$(grep -E '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' "$md_file" | tail -n 1 || true)"
  if [[ -z "${line}" ]]; then
    echo "NA,NA"
    return
  fi
  nums="$(echo "$line" | grep -Eo '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' | head -n 2 | awk '{print $1}' | paste -sd, -)"
  [[ -z "${nums}" ]] && { echo "NA,NA"; return; }
  [[ "${nums}" != *,* ]] && nums="${nums},${nums}"
  echo "${nums}"
}
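# Example (hypothetical numbers): a report whose table contains
#   | ... | pp512 | 1340.14 ± 2.10 |
#   | ... | tg256 | 37.32 ± 0.15   |
# yields: extract_tps_from_md report.md -> "1340.14,37.32"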
float_gt() { awk -v a="$1" -v b="$2" 'BEGIN{exit !(a>b)}'; }
# -----------------------------
# Curated variants (6 or 8)
# -----------------------------
# Format: "graph dnn opt dmmv" where each is 0/1
# graph = GGML_SYCL_DISABLE_GRAPH
# dnn = GGML_SYCL_DISABLE_DNN
# opt = GGML_SYCL_DISABLE_OPT
# dmmv = GGML_SYCL_PRIORITIZE_DMMV
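# Example: "1 1 0 0" runs llama-bench with GGML_SYCL_DISABLE_GRAPH=1,
# GGML_SYCL_DISABLE_DNN=1, GGML_SYCL_DISABLE_OPT=0, GGML_SYCL_PRIORITIZE_DMMV=0.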
if [[ "${VARIANT_SET}" == "8" ]]; then
CONFIGS=(
"0 0 0 0" # baseline (graph on, dnn on)
"1 0 0 0" # graph off (strong baseline winner family)
"1 1 0 0" # graph off + dnn off (your current winner)
"1 1 0 1" # graph off + dnn off + dmmv
"1 0 0 1" # graph off + dmmv
"1 0 1 0" # graph off + opt off
"1 0 1 1" # graph off + opt off + dmmv (NEW, good to test)
"0 0 0 1" # graph on + dmmv (control, usually safe)
)
else
CONFIGS=(
"0 0 0 0"
"1 0 0 0"
"1 0 1 0"
"1 1 0 0"
"1 0 0 1"
"1 1 0 1"
)
fi
# -----------------------------
# llama-bench args
# -----------------------------
BASE_ARGS=(
  --model "${MODEL}"
  --device "${DEVICE}"
  --repetitions "${REPS}"
  --n-prompt "${N_PROMPT}"
  --n-gen "${N_GEN}"
  --batch-size "${BATCH}"
  --ubatch-size "${UBATCH}"
  --threads "${THREADS}"
  --n-gpu-layers "${N_GPU_LAYERS}"
  --output md
  --progress
)
[[ "${NO_WARMUP}" == "1" ]] && BASE_ARGS+=( --no-warmup )
[[ "${DELAY}" != "0" ]] && BASE_ARGS+=( --delay "${DELAY}" )
# -----------------------------
# Run
# -----------------------------
log "Starting llama-bench env screening"
echo "Bench: ${BENCH_BIN}"
echo "Model: ${MODEL}"
echo "Device: ${DEVICE} (N_GPU_LAYERS=${N_GPU_LAYERS})"
echo "Workload: REPS=${REPS} N_PROMPT=${N_PROMPT} N_GEN=${N_GEN} BATCH=${BATCH} UBATCH=${UBATCH} THREADS=${THREADS}"
echo "Variants: ${#CONFIGS[@]} (VARIANT_SET=${VARIANT_SET})"
echo "Artifacts: ${OUT_DIR}/"
echo
RESULTS_TSV="${OUT_DIR}/${RUN_TAG}_${TS}_results.tsv"
: > "${RESULTS_TSV}"
printf "cfg_id\ttag\tgraph\tdnn\topt\tdmmv\tpp_tps\ttg_tps\tmd_path\tlog_path\n" >> "${RESULTS_TSV}"
best_gen="0"
best_desc="(none)"
cfg_id=0
for cfg in "${CONFIGS[@]}"; do
cfg_id=$((cfg_id+1))
read -r g d o m <<< "${cfg}"
tag="g${g}_d${d}_o${o}_m${m}"
md_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.md"
log_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.log"
log "Config ${cfg_id}/${#CONFIGS[@]}: ${tag}"
log " GGML_SYCL_DISABLE_GRAPH=${g} GGML_SYCL_DISABLE_DNN=${d} GGML_SYCL_DISABLE_OPT=${o} GGML_SYCL_PRIORITIZE_DMMV=${m}"
# Run bench; never let a crash stop the matrix.
# - stdout -> md_out
# - stderr -> log_out
set +e
NO_COLOR=1 \
LLAMA_LOG_COLORS=0 \
GGML_LOG_COLORS=0 \
GGML_SYCL_DISABLE_GRAPH="${g}" \
GGML_SYCL_DISABLE_DNN="${d}" \
GGML_SYCL_DISABLE_OPT="${o}" \
GGML_SYCL_PRIORITIZE_DMMV="${m}" \
"${BENCH_BIN}" "${BASE_ARGS[@]}" >"${md_out}.tmp" 2>"${log_out}.tmp"
rc=$?
set -e
# Strip ANSI after the fact (safe even if program crashed)
strip_ansi <"${md_out}.tmp" >"${md_out}" || true
strip_ansi <"${log_out}.tmp" >"${log_out}" || true
rm -f "${md_out}.tmp" "${log_out}.tmp"
if [[ $rc -ne 0 ]]; then
log "WARNING: Config ${tag} exited non-zero (rc=${rc}). Marking throughput as NA."
pp_tps="NA"
tg_tps="NA"
else
tps="$(extract_tps_from_md "${md_out}")"
pp_tps="${tps%,*}"
tg_tps="${tps#*,}"
fi
echo "Result ${cfg_id}: ${tag}"
echo " pp (prompt/prefill) = ${pp_tps} tok/s"
echo " tg (generation) = ${tg_tps} tok/s"
echo " report = ${md_out}"
echo " log = ${log_out}"
echo
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
"${cfg_id}" "${tag}" "${g}" "${d}" "${o}" "${m}" "${pp_tps}" "${tg_tps}" "${md_out}" "${log_out}" \
>> "${RESULTS_TSV}"
if [[ "${tg_tps}" != "NA" ]]; then
if [[ "${best_gen}" == "0" ]] || float_gt "${tg_tps}" "${best_gen}"; then
best_gen="${tg_tps}"
best_desc="cfg=${cfg_id} ${tag} (tg=${tg_tps} tok/s, pp=${pp_tps} tok/s)"
fi
else
log "WARNING: Could not extract throughput from markdown table."
log " Inspect: ${md_out}"
log " Inspect: ${log_out}"
fi
done
echo "============================================================================="
echo "FINAL REPORT"
echo "============================================================================="
echo "Best by generation throughput: ${best_desc}"
echo
echo "Ranking by tg (generation tok/s):"
awk -F'\t' 'NR>1 && $8 != "NA" {print}' "${RESULTS_TSV}" \
  | sort -t$'\t' -k8,8gr \
  | awk -F'\t' '{printf "  cfg=%s  tg=%s tok/s  pp=%s tok/s  env=[graph=%s dnn=%s opt=%s dmmv=%s]  (%s)\n", $1,$8,$7,$3,$4,$5,$6,$2}'
echo
echo "Artifacts:"
echo " TSV summary: ${RESULTS_TSV}"
echo " Reports: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.md"
echo " Logs: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.log"
echo
cat <<'EOF'
NOTES
-----
- 'pp' is prompt/prefill throughput; 'tg' is generation throughput.
- For faster iteration:
    REPS=1 N_PROMPT=128 N_GEN=64 BATCH=512 UBATCH=128 ./bench-sycl-llama-bench.sh
  Then scale up REPS/tokens once you choose the best config.
- CPU-only baseline:
    DEVICE=auto N_GPU_LAYERS=0 ./bench-sycl-llama-bench.sh
- Vulkan comparison:
    DEVICE=Vulkan0 ./bench-sycl-llama-bench.sh
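- The TSV summary has one row per config (columns as in the header line);
  a row might look like this (hypothetical numbers, placeholder paths):
    3  g1_d1_o0_m0  1  1  0  0  1340.14  37.32  <report.md>  <bench.log>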
EOF