llama.cpp setup - Ubuntu 24.04
#!/usr/bin/env bash
set -euo pipefail
###############################################################################
# llama.cpp setup under /opt/llama.cpp - Ubuntu 24.04
# - Unified build: CPU + Vulkan + SYCL (dynamic backends)
# - Installs stable commands into /usr/local/bin
# - Installs backend .so into /usr/local/lib/llama and also next to binaries
#
# IMPORTANT NOTE: For Intel Arc / Lunar Lake SYCL GPU support, follow Intel's
# Client GPU guide (Ubuntu Latest) and install required packages/PPAs:
# https://dgpu-docs.intel.com/driver/client/overview.html#ubuntu-latest
#
# Success criteria:
# source /opt/intel/oneapi/setvars.sh
# sycl-ls -> shows level_zero:gpu
###############################################################################
# ---- Configurable variables ----
INSTALL_ROOT="/opt/llama.cpp"
SRC_DIR="${INSTALL_ROOT}/src"
BUILD_UNIFIED="${INSTALL_ROOT}/build-unified"
BUILD_CPU="${INSTALL_ROOT}/build-cpu"
LIB_DIR="/usr/local/lib/llama"
BIN_DIR="/usr/local/bin"
# Pin to a specific revision for reproducibility:
# Set LLAMA_CPP_REF to a tag (e.g., "bXXXX") or commit hash
LLAMA_CPP_REF="${LLAMA_CPP_REF:-master}"
# Whether to keep existing /opt/llama.cpp/src if present
KEEP_EXISTING_SRC="${KEEP_EXISTING_SRC:-true}"
# ---- Helpers ----
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || { echo "Missing command: $1"; exit 1; }; }
###############################################################################
# 0) Preconditions
###############################################################################
log "Checking prerequisites"
need_cmd sudo
need_cmd git
need_cmd cmake
if [[ ! -f /opt/intel/oneapi/setvars.sh ]]; then
  echo "ERROR: /opt/intel/oneapi/setvars.sh not found. Install Intel oneAPI first."
  exit 1
fi
source /opt/intel/oneapi/setvars.sh || true
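# Optional sanity check (assumption: sycl-ls ships with oneAPI). It should list
# a level_zero:gpu entry once the Intel GPU driver stack from the Client GPU
# guide above is installed. Non-fatal; printed for inspection only.
if command -v sycl-ls >/dev/null 2>&1; then
  sycl-ls | grep -i "level_zero" || log "WARNING: sycl-ls reports no Level Zero device yet"
fi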
p="$(ls -d /opt/intel/oneapi/compiler/latest/lib 2>/dev/null || ls -d /opt/intel/oneapi/compiler/*/lib 2>/dev/null | sort -V | tail -n1)"; f="/etc/ld.so.conf.d/intel-oneapi.conf"; sudo mkdir -p "$(dirname "$f")" && (sudo test -f "$f" && sudo grep -Fxq "$p" "$f") || echo "$p" | sudo tee -a "$f" >/dev/null; sudo ldconfig
###############################################################################
# 1) System deps (CPU BLAS + Vulkan build deps)
###############################################################################
log "Installing build dependencies (OpenBLAS + Vulkan tools/dev headers)"
sudo apt update
sudo apt install -y \
  build-essential git cmake software-properties-common
sudo add-apt-repository -y ppa:kobuk-team/intel-graphics
sudo apt update
sudo apt install -y \
  ninja-build pkg-config libopenblas-dev \
  libvulkan-dev vulkan-tools \
  ocl-icd-libopencl1 clinfo \
  libze-intel-gpu1 libze1 \
  intel-opencl-icd intel-gsc \
  intel-ocloc libze-dev
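# Optional: confirm the user-space GPU stack is visible (both tools were just
# installed above). Failures are printed but do not abort the script.
vulkaninfo --summary >/dev/null 2>&1 && log "Vulkan ICD OK" || log "WARNING: vulkaninfo --summary failed"
clinfo -l || true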
###############################################################################
# 2) Prepare /opt layout
###############################################################################
log "Preparing ${INSTALL_ROOT}"
sudo mkdir -p "${INSTALL_ROOT}"
sudo chown -R "$USER":"$USER" "${INSTALL_ROOT}"
###############################################################################
# 3) Clone or reuse llama.cpp sources
###############################################################################
if [[ -d "${SRC_DIR}/.git" && "${KEEP_EXISTING_SRC}" == "true" ]]; then
log "Using existing repo at ${SRC_DIR}"
cd "${SRC_DIR}"
else
log "Cloning llama.cpp into ${SRC_DIR}"
rm -rf "${SRC_DIR}"
git clone https://github.com/ggml-org/llama.cpp "${SRC_DIR}"
cd "${SRC_DIR}"
fi
# Ensure the 'origin' remote exists (common when the repo was copied via rsync)
if ! git remote get-url origin >/dev/null 2>&1; then
  log "Remote 'origin' missing; adding it"
  git remote add origin https://github.com/ggml-org/llama.cpp
fi
# Fetch latest refs and tags
git fetch --prune origin --tags
# Detect the default branch from the remote (handles main/master changes)
DEFAULT_BRANCH="$(git remote show origin 2>/dev/null | awk '/HEAD branch/ {print $NF}')"
DEFAULT_BRANCH="${DEFAULT_BRANCH:-main}"
# Fallback: use the detected default branch if LLAMA_CPP_REF is empty
# (normally unreachable, since the default above already pins "master")
LLAMA_CPP_REF="${LLAMA_CPP_REF:-$DEFAULT_BRANCH}"
log "Selecting revision: ${LLAMA_CPP_REF}"
# 1) If it's a remote branch: create/update a local branch tracking origin/<branch>
if git ls-remote --exit-code --heads origin "${LLAMA_CPP_REF}" >/dev/null 2>&1; then
  git checkout -B "${LLAMA_CPP_REF}" "origin/${LLAMA_CPP_REF}"
  git pull --ff-only || true
# 2) If it's a tag: checkout the tag
elif git show-ref --tags --quiet "refs/tags/${LLAMA_CPP_REF}"; then
  git checkout "tags/${LLAMA_CPP_REF}"
# 3) If it's a commit hash (or any valid commit-ish): checkout directly
elif git rev-parse --verify "${LLAMA_CPP_REF}^{commit}" >/dev/null 2>&1; then
  git checkout "${LLAMA_CPP_REF}"
else
  echo "ERROR: LLAMA_CPP_REF='${LLAMA_CPP_REF}' not found as a branch/tag/commit in origin."
  echo "Hint: use a valid branch (e.g. ${DEFAULT_BRANCH}), tag, or commit hash."
  exit 1
fi
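# Example invocations (script name, tag, and hash below are illustrative):
#   LLAMA_CPP_REF=master ./llama-cpp-setup.sh
#   LLAMA_CPP_REF=b4500 ./llama-cpp-setup.sh    # pin to a bXXXX release tag
#   LLAMA_CPP_REF=3f1ae2e ./llama-cpp-setup.sh  # pin to a commit hash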
log "Recording version info"
git rev-parse --short HEAD | tee "${INSTALL_ROOT}/LLAMA_CPP_COMMIT.txt" >/dev/null
git describe --tags --always 2>/dev/null | tee "${INSTALL_ROOT}/LLAMA_CPP_DESCRIBE.txt" >/dev/null || true
###############################################################################
# 4) Build: Unified (CPU variants + Vulkan + SYCL) with dynamic backends
###############################################################################
log "Building UNIFIED (CPU + Vulkan + SYCL, dynamic backends)"
source /opt/intel/oneapi/setvars.sh >/dev/null 2>&1 || true
rm -rf "${BUILD_UNIFIED}"
cmake -S "${SRC_DIR}" -B "${BUILD_UNIFIED}" -G Ninja \
  -DCMAKE_BUILD_TYPE=Release \
  -DBUILD_SHARED_LIBS=ON \
  -DGGML_BACKEND_DL=ON \
  -DGGML_CPU_ALL_VARIANTS=ON \
  -DGGML_BLAS=ON \
  -DGGML_BLAS_VENDOR=OpenBLAS \
  -DGGML_VULKAN=ON \
  -DGGML_SYCL=ON \
  -DCMAKE_C_COMPILER=icx \
  -DCMAKE_CXX_COMPILER=icpx
cmake --build "${BUILD_UNIFIED}" -j
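# Sanity check (assumption: with GGML_BACKEND_DL=ON each enabled backend is
# emitted as its own libggml-<backend>.so next to the binaries).
ls -1 "${BUILD_UNIFIED}/bin/"libggml-*.so* >/dev/null 2>&1 \
  || log "WARNING: no libggml-*.so backends found in ${BUILD_UNIFIED}/bin"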
###############################################################################
# 5) Install binaries and libraries (robust loader setup)
###############################################################################
log "Installing runtime layout (binaries + backend libs)"
sudo mkdir -p "${LIB_DIR}"
# Install unified standard commands
sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-cli" "${BIN_DIR}/llama-cli"
sudo install -m 0755 "${BUILD_UNIFIED}/bin/llama-server" "${BIN_DIR}/llama-server"
# Backend libraries: put them in /usr/local/lib/llama
# and also copy next to /usr/local/bin to avoid loader path issues.
sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${LIB_DIR}/" 2>/dev/null || true
sudo cp -a "${BUILD_UNIFIED}/bin/libggml-"*.so* "${BIN_DIR}/" 2>/dev/null || true
# Optional: ensure /usr/local/lib is in loader cache (usually is)
sudo ldconfig
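# Note: binaries built with GGML_BACKEND_DL scan the executable's own directory
# for backend .so files; recent ggml builds also honor the GGML_BACKEND_PATH
# environment variable, so if the copies next to the binaries are ever removed
# you can point the loader at ${LIB_DIR} explicitly, e.g.:
#   GGML_BACKEND_PATH=/usr/local/lib/llama llama-cli --list-devices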
###############################################################################
# 6) Verification
###############################################################################
log "Verifying installation"
echo "== Version (unified) =="
"${BIN_DIR}/llama-cli" --version || true
echo
echo "== Devices =="
"${BIN_DIR}/llama-cli" --list-devices || true
log "Done."
cat <<'EOF'
NEXT STEPS / NOTES
------------------
1) Download a small GGUF model for a real smoke test (saved under /opt/llama.cpp/models)
   - Create the models directory:
       sudo mkdir -p /opt/llama.cpp/models
       sudo chown -R "$USER":"$USER" /opt/llama.cpp/models
   - Download a small/fast model (Qwen3-0.6B-Q8_0):
       curl -L -o /opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf \
         https://huggingface.co/Qwen/Qwen3-0.6B-GGUF/resolve/main/Qwen3-0.6B-Q8_0.gguf
   - Define a convenience variable for the commands below:
       MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf"

2) Validate devices and backends (this confirms SYCL + Vulkan + CPU are available)
   - List devices/backends:
       llama-cli --list-devices
   - Run a short prompt on each backend (expect a coherent response):
       llama-cli --device SYCL0   -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device Vulkan0 -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
       llama-cli --device CPU     -m "$MODEL" -p "Write one sentence about Lunar Lake." -n 64
   - If a backend fails, rerun with verbose logs:
       llama-cli --verbose --device SYCL0 -m "$MODEL" -p "Hello" -n 16

3) Installed commands policy
   - Only the stable entrypoints are installed system-wide:
       /usr/local/bin/llama-cli
       /usr/local/bin/llama-server
   - All additional tools remain available in:
       /opt/llama.cpp/build-unified/bin
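
4) Optional: serve the model over HTTP (llama-server exposes an OpenAI-compatible
   API; the host/port below are illustrative, not required values)
   - Start the server:
       llama-server -m "$MODEL" --host 127.0.0.1 --port 8080
   - Smoke-test from another shell:
       curl -s http://127.0.0.1:8080/v1/chat/completions \
         -H "Content-Type: application/json" \
         -d '{"messages":[{"role":"user","content":"Say hi"}],"max_tokens":32}'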
EOF
#!/usr/bin/env bash
set -euo pipefail
###############################################################################
# bench-sycl-llama-bench.sh
#
# Focused SYCL env screening using llama-bench (non-interactive).
# Prints a final report to stdout (no CSV required).
#
# Example:
# MODEL="/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf" DEVICE="SYCL0" REPS=5 N_PROMPT=256 N_GEN=128 BATCH=1024 UBATCH=256 THREADS=8 VARIANT_SET=8 ./bench-sycl-llama-bench.sh
###############################################################################
# -----------------------------
# Config (override via env vars)
# -----------------------------
BENCH_BIN="${BENCH_BIN:-/opt/llama.cpp/build-unified/bin/llama-bench}"
MODEL="${MODEL:-/opt/llama.cpp/models/Qwen3-0.6B-Q8_0.gguf}"
DEVICE="${DEVICE:-SYCL0}" # SYCL0 / Vulkan0 / auto
N_GPU_LAYERS="${N_GPU_LAYERS:-99}" # 0 for CPU-only, 99 for "as much as possible"
REPS="${REPS:-5}"
N_PROMPT="${N_PROMPT:-512}"
N_GEN="${N_GEN:-256}"
BATCH="${BATCH:-2048}"
UBATCH="${UBATCH:-512}"
THREADS="${THREADS:-8}"
NO_WARMUP="${NO_WARMUP:-0}" # 1 to skip warmup
DELAY="${DELAY:-0}" # seconds between tests
VARIANT_SET="${VARIANT_SET:-6}" # "6" or "8"
OUT_DIR="${OUT_DIR:-./bench_out}"
RUN_TAG="${RUN_TAG:-sycl_env_screen}"
mkdir -p "${OUT_DIR}"
TS="$(date +%F_%H%M%S)"
# -----------------------------
# Helpers
# -----------------------------
log() { printf "\n[%s] %s\n" "$(date '+%F %T')" "$*"; }
die() { echo "ERROR: $*" >&2; exit 1; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || die "Missing command: $1"; }
need_cmd awk
need_cmd grep
need_cmd sed
need_cmd sort
need_cmd head
need_cmd tail
[[ -x "${BENCH_BIN}" ]] || die "llama-bench not found/executable: ${BENCH_BIN}"
[[ -f "${MODEL}" ]] || die "Model file not found: ${MODEL}"
strip_ansi() {
  sed -r \
    -e 's/\x1B\[[0-9;?]*[ -/]*[@-~]//g' \
    -e 's/\x1B\][^\x07]*(\x07|\x1B\\)//g'
}
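# Usage: strip_ansi <colored.txt >clean.txt (applied below to llama-bench output)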
extract_tps_from_md() {
  local md_file="$1"
  # --- Mode A (current llama-bench output):
  # Table rows contain test=ppNN and test=tgNN, and throughput in column "t/s".
  # Example rows:
  # | ... | test  | t/s             |
  # | ... | pp128 | 1340.14 ± 0.00  |
  # | ... | tg64  | 37.32 ± 0.00    |
  # A single awk pass locates the "test"/"t/s" columns from the header row,
  # then takes the first ppNN row and the first tgNN row.
  local out
  out="$(
    awk -F'|' '
      function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
      function first_float(s){
        gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
        if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
        return ""
      }
      /^\|/ {
        # skip separator rows like | --- |
        if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
        # detect header with "test" and "t/s"; record column positions
        if (tolower($0) ~ /\|[[:space:]]*test[[:space:]]*\|/ && tolower($0) ~ /\|[[:space:]]*t\/s[[:space:]]*\|/) {
          n=split($0,a,"|")
          test_i=-1; ts_i=-1
          for (i=2;i<=n-1;i++){
            h=tolower(trim(a[i]))
            if (h=="test") test_i=i
            if (h=="t/s") ts_i=i
          }
          next
        }
        # parse data rows once we know the columns
        if (test_i>0 && ts_i>0) {
          n=split($0,a,"|")
          t=trim(a[test_i])
          if (pp=="" && t ~ /^pp[0-9]+$/) pp=first_float(a[ts_i])
          if (tg=="" && t ~ /^tg[0-9]+$/) tg=first_float(a[ts_i])
          if (pp!="" && tg!="") { print pp "," tg; exit }
        }
      }
    ' "$md_file"
  )"
  if [[ -n "${out}" ]]; then
    echo "${out}"
    return
  fi
  # --- Mode B (alternate output):
  # Columns themselves are pp/tg (rare across versions); keep as fallback.
  out="$(
    awk '
      function trim(s){ sub(/^[ \t]+/,"",s); sub(/[ \t]+$/,"",s); return s }
      function first_float(s){
        gsub(/±/," ",s); gsub(/\([^)]*\)/," ",s); gsub(/[^0-9.\-eE ]/," ",s)
        if (match(s, /-?[0-9]+(\.[0-9]+)?([eE]-?[0-9]+)?/)) return substr(s, RSTART, RLENGTH)
        return ""
      }
      BEGIN{ got_header=0; pp_i=-1; tg_i=-1 }
      /^\|/ {
        if ($0 ~ /^\|[ \t-:|]+\|[ \t]*$/) next
        n=split($0,a,"|")
        # header detection
        if (got_header==0) {
          for (i=2;i<=n-1;i++){
            h=tolower(trim(a[i]))
            if (h=="pp") pp_i=i
            if (h=="tg") tg_i=i
          }
          if (pp_i>0 && tg_i>0) { got_header=1; next }
        } else {
          pp=first_float(a[pp_i]); tg=first_float(a[tg_i])
          if (pp!="" && tg!="") { print pp "," tg; exit }
        }
      }
    ' "$md_file"
  )"
  if [[ -n "${out}" ]]; then
    echo "${out}"
    return
  fi
  # --- Final fallback: tok/s or t/s anywhere (best-effort)
  local line nums
  line="$(grep -E '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' "$md_file" | tail -n 1 || true)"
  if [[ -z "${line}" ]]; then
    echo "NA,NA"
    return
  fi
  nums="$(echo "$line" | grep -Eo '([0-9]+\.[0-9]+|[0-9]+)[[:space:]]*(tok/s|t/s)' | head -n 2 | awk '{print $1}' | paste -sd, -)"
  [[ -z "${nums}" ]] && { echo "NA,NA"; return; }
  [[ "${nums}" != *,* ]] && nums="${nums},${nums}"
  echo "${nums}"
}
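# Example (hypothetical numbers): a report whose table contains
#   | ... | pp512 | 1340.14 ± 2.10 |
#   | ... | tg256 | 37.32 ± 0.15   |
# yields: extract_tps_from_md report.md -> "1340.14,37.32"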
float_gt() { awk -v a="$1" -v b="$2" 'BEGIN{exit !(a>b)}'; }
# -----------------------------
# Curated variants (6 or 8)
# -----------------------------
# Format: "graph dnn opt dmmv" where each is 0/1
# graph = GGML_SYCL_DISABLE_GRAPH
# dnn = GGML_SYCL_DISABLE_DNN
# opt = GGML_SYCL_DISABLE_OPT
# dmmv = GGML_SYCL_PRIORITIZE_DMMV
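# Example: "1 1 0 0" runs llama-bench with GGML_SYCL_DISABLE_GRAPH=1,
# GGML_SYCL_DISABLE_DNN=1, GGML_SYCL_DISABLE_OPT=0, GGML_SYCL_PRIORITIZE_DMMV=0.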
if [[ "${VARIANT_SET}" == "8" ]]; then
CONFIGS=(
"0 0 0 0" # baseline (graph on, dnn on)
"1 0 0 0" # graph off (strong baseline winner family)
"1 1 0 0" # graph off + dnn off (your current winner)
"1 1 0 1" # graph off + dnn off + dmmv
"1 0 0 1" # graph off + dmmv
"1 0 1 0" # graph off + opt off
"1 0 1 1" # graph off + opt off + dmmv (NEW, good to test)
"0 0 0 1" # graph on + dmmv (control, usually safe)
)
else
CONFIGS=(
"0 0 0 0"
"1 0 0 0"
"1 0 1 0"
"1 1 0 0"
"1 0 0 1"
"1 1 0 1"
)
fi
# -----------------------------
# llama-bench args
# -----------------------------
BASE_ARGS=(
  --model "${MODEL}"
  --device "${DEVICE}"
  --repetitions "${REPS}"
  --n-prompt "${N_PROMPT}"
  --n-gen "${N_GEN}"
  --batch-size "${BATCH}"
  --ubatch-size "${UBATCH}"
  --threads "${THREADS}"
  --n-gpu-layers "${N_GPU_LAYERS}"
  --output md
  --progress
)
[[ "${NO_WARMUP}" == "1" ]] && BASE_ARGS+=( --no-warmup )
[[ "${DELAY}" != "0" ]] && BASE_ARGS+=( --delay "${DELAY}" )
# -----------------------------
# Run
# -----------------------------
log "Starting llama-bench env screening"
echo "Bench: ${BENCH_BIN}"
echo "Model: ${MODEL}"
echo "Device: ${DEVICE} (N_GPU_LAYERS=${N_GPU_LAYERS})"
echo "Workload: REPS=${REPS} N_PROMPT=${N_PROMPT} N_GEN=${N_GEN} BATCH=${BATCH} UBATCH=${UBATCH} THREADS=${THREADS}"
echo "Variants: ${#CONFIGS[@]} (VARIANT_SET=${VARIANT_SET})"
echo "Artifacts: ${OUT_DIR}/"
echo
RESULTS_TSV="${OUT_DIR}/${RUN_TAG}_${TS}_results.tsv"
: > "${RESULTS_TSV}"
printf "cfg_id\ttag\tgraph\tdnn\topt\tdmmv\tpp_tps\ttg_tps\tmd_path\tlog_path\n" >> "${RESULTS_TSV}"
best_gen="0"
best_desc="(none)"
cfg_id=0
for cfg in "${CONFIGS[@]}"; do
cfg_id=$((cfg_id+1))
read -r g d o m <<< "${cfg}"
tag="g${g}_d${d}_o${o}_m${m}"
md_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.md"
log_out="${OUT_DIR}/${RUN_TAG}_${TS}_${tag}.log"
log "Config ${cfg_id}/${#CONFIGS[@]}: ${tag}"
log " GGML_SYCL_DISABLE_GRAPH=${g} GGML_SYCL_DISABLE_DNN=${d} GGML_SYCL_DISABLE_OPT=${o} GGML_SYCL_PRIORITIZE_DMMV=${m}"
# Run bench; never let a crash stop the matrix.
# - stdout -> md_out
# - stderr -> log_out
set +e
NO_COLOR=1 \
LLAMA_LOG_COLORS=0 \
GGML_LOG_COLORS=0 \
GGML_SYCL_DISABLE_GRAPH="${g}" \
GGML_SYCL_DISABLE_DNN="${d}" \
GGML_SYCL_DISABLE_OPT="${o}" \
GGML_SYCL_PRIORITIZE_DMMV="${m}" \
"${BENCH_BIN}" "${BASE_ARGS[@]}" >"${md_out}.tmp" 2>"${log_out}.tmp"
rc=$?
set -e
# Strip ANSI after the fact (safe even if program crashed)
strip_ansi <"${md_out}.tmp" >"${md_out}" || true
strip_ansi <"${log_out}.tmp" >"${log_out}" || true
rm -f "${md_out}.tmp" "${log_out}.tmp"
if [[ $rc -ne 0 ]]; then
log "WARNING: Config ${tag} exited non-zero (rc=${rc}). Marking throughput as NA."
pp_tps="NA"
tg_tps="NA"
else
tps="$(extract_tps_from_md "${md_out}")"
pp_tps="${tps%,*}"
tg_tps="${tps#*,}"
fi
echo "Result ${cfg_id}: ${tag}"
echo " pp (prompt/prefill) = ${pp_tps} tok/s"
echo " tg (generation) = ${tg_tps} tok/s"
echo " report = ${md_out}"
echo " log = ${log_out}"
echo
printf "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" \
"${cfg_id}" "${tag}" "${g}" "${d}" "${o}" "${m}" "${pp_tps}" "${tg_tps}" "${md_out}" "${log_out}" \
>> "${RESULTS_TSV}"
if [[ "${tg_tps}" != "NA" ]]; then
if [[ "${best_gen}" == "0" ]] || float_gt "${tg_tps}" "${best_gen}"; then
best_gen="${tg_tps}"
best_desc="cfg=${cfg_id} ${tag} (tg=${tg_tps} tok/s, pp=${pp_tps} tok/s)"
fi
else
log "WARNING: Could not extract throughput from markdown table."
log " Inspect: ${md_out}"
log " Inspect: ${log_out}"
fi
done
echo "============================================================================="
echo "FINAL REPORT"
echo "============================================================================="
echo "Best by generation throughput: ${best_desc}"
echo
echo "Ranking by tg (generation tok/s):"
awk -F'\t' 'NR>1 && $8 != "NA" {print}' "${RESULTS_TSV}" \
  | sort -t$'\t' -k8,8gr \
  | awk -F'\t' '{printf "  cfg=%s  tg=%s tok/s  pp=%s tok/s  env=[graph=%s dnn=%s opt=%s dmmv=%s]  (%s)\n", $1,$8,$7,$3,$4,$5,$6,$2}'
echo
echo "Artifacts:"
echo " TSV summary: ${RESULTS_TSV}"
echo " Reports: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.md"
echo " Logs: ${OUT_DIR}/${RUN_TAG}_${TS}_g*_d*_o*_m*.log"
echo
cat <<'EOF'
NOTES
-----
- 'pp' is prompt/prefill throughput; 'tg' is generation throughput.
- For faster iteration:
    REPS=1 N_PROMPT=128 N_GEN=64 BATCH=512 UBATCH=128 ./bench-sycl-llama-bench.sh
  Then scale up REPS/tokens once you choose the best config.
- CPU-only baseline:
    DEVICE=auto N_GPU_LAYERS=0 ./bench-sycl-llama-bench.sh
- Vulkan comparison:
    DEVICE=Vulkan0 ./bench-sycl-llama-bench.sh
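- The TSV summary has one row per config (columns as in the header line);
  a row might look like this (hypothetical numbers, placeholder paths):
    3  g1_d1_o0_m0  1  1  0  0  1340.14  37.32  <report.md>  <bench.log>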
EOF