Skip to content

Instantly share code, notes, and snippets.

@xxrjun
Created October 15, 2025 03:22
Show Gist options
  • Select an option

  • Save xxrjun/a0eecb033a529522b9cec80c1145359a to your computer and use it in GitHub Desktop.

Select an option

Save xxrjun/a0eecb033a529522b9cec80c1145359a to your computer and use it in GitHub Desktop.
#!/bin/bash
# verify_mnnvl_health.sh
# Local verification for MNNVL compute tray & NVLink switch health.
# - Runs all checks to completion; no early exit.
# - Safe output mode ON by default (redacts IP/MAC/long HEX/IDs).
# - Prints a final Summary and returns nonzero if any check failed.
#
# Reference: https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html
# ==============================================================================
# UI UTILITIES
# ==============================================================================
# Status tags prefixed to messages by ok/bad/warn and shown in the Summary.
PASS='OK'
FAIL='FAIL'
WARN='WARN'
# Print one timestamped log line to stdout.
# Arguments: $1 - level tag (left-aligned, padded to 5 columns)
#            $2.. - message words, joined with single spaces
log() {
  local severity="$1"
  shift
  local stamp
  stamp="$(date '+%F %T')"
  printf "[%s] %-5s %s\n" "$stamp" "$severity" "$*"
}
# Emit an INFO-level log line made of all arguments joined by spaces.
say() {
  local message="$*"
  log INFO "$message"
}
# Print a blank line followed by a timestamped INFO banner:
#   [<date time>] INFO  ======== <title> ========
section() {
  local now title="$*"
  now="$(date '+%F %T')"
  printf "\n[%s] %-5s %s\n" "$now" INFO "======== ${title} ========"
}
# Result-line helpers: each prefixes the message with its status tag and
# forwards it to log at the matching level.
ok() {
  log INFO "${PASS} $*"
}

bad() {
  log ERROR "${FAIL} $*"
}

warn() {
  log WARN "${WARN} $*"
}
# ==============================================================================
# DEFAULTS
# ==============================================================================
# Tray type to verify: compute | switch (overridden by --role).
ROLE='compute'
# Host used to build the NMX-T curl URLs (overridden by --health-host).
SWITCH_HEALTHCHECK_HOST='0.0.0.0'
# Minimum number of GPUs nvidia-smi must list (overridden by --require-gpus).
REQUIRE_N_GPUS=1
# Kernel requirements checked against `uname -r`: version, build, flavor.
KERNEL_MIN_VERSION='6.5.0'
KERNEL_MIN_BUILD=1024
REQUIRE_KERNEL_FLAVOR='nvidia-64k'
# Seconds allowed per command executed through timeout_run.
TIMEOUT=30
# 1 = redact command output through mask_safe; --unsafe sets it to 0.
SAFE_MODE=1
# ==============================================================================
# LOGGING (default ON; writes ../log/verify_mnnvl_<timestamp>.log, relative to the current directory)
# ==============================================================================
# 1 = tee all output into LOG_FILE; --no-log sets it to 0.
ENABLE_LOG=1
# Relative path: resolved against the directory the script is run from.
LOG_DIR='../log'
# Launch time, used to give every run its own log file name.
TIME_STAMP="$(date '+%Y%m%d_%H%M%S')"
LOG_FILE="${LOG_DIR}/verify_mnnvl_${TIME_STAMP}.log"
# ==============================================================================
# HELPERS
# ==============================================================================
# Print the help text to stdout. The defaults shown are the current values of
# REQUIRE_N_GPUS, SWITCH_HEALTHCHECK_HOST and LOG_FILE at call time.
usage() {
  local self="${0##*/}"
  printf '%s\n' \
    "verify_mnnvl_health.sh" \
    "Local verification for MNNVL compute trays or NVLink switch trays." \
    "Usage:" \
    "${self} [--role compute|switch] [--require-gpus N] [--health-host HOST]" \
    "[--unsafe] [--log FILE] [--no-log]" \
    "Options:" \
    "--role ROLE 'compute' (default) or 'switch'" \
    "--require-gpus N Minimum GPUs expected (default ${REQUIRE_N_GPUS})" \
    "--health-host HOST Host/IP for NMX-T curl endpoints (default ${SWITCH_HEALTHCHECK_HOST})" \
    "--unsafe Print raw command outputs (disable redaction)" \
    "--log FILE Override log path (default: ${LOG_FILE})" \
    "--no-log Do not write log file" \
    "-h|--help Show help"
}
# Succeed when $1 resolves as a command (per `command -v`); prints nothing.
have() {
  command -v "$1" &>/dev/null
}
# Run "$@" with a time limit of $TIMEOUT seconds.
# Uses timeout(1) when available; otherwise falls back to a perl one-liner
# that arms alarm() and then exec's the command.
# Returns: the exit status of whichever wrapper ran the command.
timeout_run() {
  if ! have timeout; then
    perl -e 'alarm shift @ARGV; exec @ARGV' "$TIMEOUT" "$@"
    return
  fi
  timeout "$TIMEOUT" "$@"
}
# Redact sensitive strings while keeping outputs readable.
# Filter: reads stdin, writes stdout. Rules are applied in this order:
#   dotted quads -> <IP>, colon-separated 6-byte MACs -> <MAC>,
#   runs of 8+ hex digits -> <HEX>, dash-joined 4-char groups -> <ID>.
mask_safe() {
  local -a rules=(
    -e 's/([0-9]{1,3}\.){3}[0-9]{1,3}/<IP>/g'
    -e 's/([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}/<MAC>/g'
    -e 's/[0-9A-Fa-f]{8,}/<HEX>/g'
    -e 's/([A-Za-z0-9]{4}-){2,}[A-Za-z0-9]+/<ID>/g'
  )
  sed -E "${rules[@]}"
}
# Run a command, show its (possibly redacted) output, return rc but DO NOT exit.
# Arguments: $1 - label printed as a section banner
#            $2.. - command and arguments, executed through timeout_run
# Globals:   SAFE_MODE (read) - nonzero routes everything printed here,
#            including the echoed command line, through mask_safe
# Outputs:   banner, DEBUG line with the command, at most the first 200 lines
#            of combined stdout/stderr, then a blank line
# Returns:   the command's exit status (1 if the temp file cannot be created)
run_cmd() {
  local label="$1"; shift
  section "$label"

  # The command line can carry addresses (e.g. the --health-host value inside
  # a curl URL), so it must be redacted in safe mode just like the output.
  local shown="$*"
  if (( SAFE_MODE )); then
    shown="$(mask_safe <<<"$shown")"
  fi
  log DEBUG "\$ $shown"

  local tmp rc
  if ! tmp="$(mktemp)"; then
    log ERROR "mktemp failed; cannot capture output for: $label"
    return 1
  fi

  timeout_run "$@" >"$tmp" 2>&1
  rc=$?

  if (( SAFE_MODE )); then
    mask_safe <"$tmp" | sed -n '1,200p'
  else
    sed -n '1,200p' "$tmp"
  fi
  echo
  rm -f -- "$tmp"
  return "$rc"
}
# Semantic version comparison X.Y.Z (ignore distro suffix).
# Arguments: $1, $2 - version strings; everything from the first '-' on is
#            dropped, missing components count as 0, and only the leading
#            digits of each component are compared (so "0+" is treated as 0)
# Returns:   0 when $1 >= $2 over the first three components, 1 otherwise
semver_ge() {
  local -a lhs rhs
  local i a b
  # read -a splits on '.' without subjecting the fields to glob expansion.
  IFS=. read -r -a lhs <<<"${1%%-*}"
  IFS=. read -r -a rhs <<<"${2%%-*}"
  for i in 0 1 2; do
    a="${lhs[i]:-0}"
    b="${rhs[i]:-0}"
    # Keep only the leading digits so the arithmetic below cannot hit a
    # syntax error on components such as "0+" or "0rc1".
    a="${a%%[!0-9]*}"
    b="${b%%[!0-9]*}"
    a="${a:-0}"
    b="${b:-0}"
    # 10# forces base 10 so components like "08" are not parsed as octal.
    (( 10#$a > 10#$b )) && return 0
    (( 10#$a < 10#$b )) && return 1
  done
  return 0
}
# Print the numeric build field of a kernel release string: the digits that
# immediately follow the first '-' (e.g. "6.8.0-1024-nvidia-64k" -> 1024).
# Prints 0 when the string has no '-' or the text after it does not start
# with a digit. The number is printed in base 10 without leading zeros so
# callers can use it in arithmetic without it being read as octal.
extract_build() {
  local release="$1" rest
  # No '-' means there is no build field; without this guard the prefix strip
  # below is a no-op and the major version would be reported as the build.
  if [[ "$release" != *-* ]]; then
    echo 0
    return
  fi
  rest="${release#*-}"
  if [[ "$rest" =~ ^([0-9]+) ]]; then
    echo "$(( 10#${BASH_REMATCH[1]} ))"
  else
    echo 0
  fi
}
# ==============================================================================
# SUMMARY COUNTERS
# ==============================================================================
# Tallies reported in the final Summary; FAILED_ITEMS keeps each failure text.
PASS_CNT=0
FAIL_CNT=0
WARN_CNT=0
FAILED_ITEMS=()

# Each mark_* helper logs the message ($1) and bumps its counter.
# The counters use plain assignment instead of ((CNT++)): the arithmetic
# command returns status 1 when the pre-increment value is 0, which made the
# first call of each helper "fail" and triggered the || branch at call sites
# written as `cmd && mark_pass ... || mark_fail ...`.
mark_pass() {
  ok "$1"
  PASS_CNT=$(( PASS_CNT + 1 ))
}

mark_fail() {
  bad "$1"
  FAILED_ITEMS+=("$1")
  FAIL_CNT=$(( FAIL_CNT + 1 ))
}

mark_warn() {
  warn "$1"
  WARN_CNT=$(( WARN_CNT + 1 ))
}
# ==============================================================================
# COMPUTE CHECKS
# ==============================================================================
# Pass when /dev/nvidia-caps-imex-channels/channel0 exists as a character
# device; fail otherwise.
chk_imex_channel() {
  local channel_dev="/dev/nvidia-caps-imex-channels/channel0"
  if [[ ! -c "$channel_dev" ]]; then
    mark_fail "IMEX channel0 missing ($channel_dev)"
    return
  fi
  mark_pass "IMEX channel0 present ($channel_dev)"
}
# Check the running kernel (`uname -r`) against three independent
# requirements and report each miss separately:
#   version >= KERNEL_MIN_VERSION, build >= KERNEL_MIN_BUILD, and the release
#   string containing REQUIRE_KERNEL_FLAVOR. A single pass is recorded only
#   when all three hold.
chk_kernel() {
  local release build_no
  release="$(uname -r)"
  build_no="$(extract_build "$release")"
  say "Kernel: $release (build $build_no)"

  local version_ok=0 build_ok=0 flavor_ok=0
  if semver_ge "$release" "$KERNEL_MIN_VERSION"; then version_ok=1; fi
  if (( build_no >= KERNEL_MIN_BUILD )); then build_ok=1; fi
  if [[ "$release" == *"$REQUIRE_KERNEL_FLAVOR"* ]]; then flavor_ok=1; fi

  if (( ! version_ok )); then mark_fail "Kernel version < $KERNEL_MIN_VERSION"; fi
  if (( ! build_ok )); then mark_fail "Kernel build < $KERNEL_MIN_BUILD"; fi
  if (( ! flavor_ok )); then mark_fail "Kernel flavor missing '$REQUIRE_KERNEL_FLAVOR'"; fi

  (( version_ok && build_ok && flavor_ok )) && mark_pass "Kernel OK"
}
# Verify nvidia-smi runs and lists at least REQUIRE_N_GPUS GPUs.
# Records exactly one result: a failure if the tool is missing or errors out,
# otherwise a pass/fail on the GPU count (one line per GPU from
# `--query-gpu=name --format=csv,noheader`).
chk_nvidia_smi() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found"
    return
  fi
  if ! run_cmd "[GPU] nvidia-smi" nvidia-smi; then
    mark_fail "nvidia-smi execution failed"
    return
  fi

  local cnt
  cnt="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l | tr -d ' ')"
  # Anything that is not a plain number is treated as "no GPUs".
  [[ "$cnt" =~ ^[0-9]+$ ]] || cnt=0

  # Explicit if/else: with `cond && mark_pass || mark_fail` a nonzero status
  # from mark_pass would also record a failure.
  if (( cnt >= REQUIRE_N_GPUS )); then
    mark_pass "Detected $cnt GPU(s) (>= $REQUIRE_N_GPUS)"
  else
    mark_fail "Detected $cnt GPU(s) (< $REQUIRE_N_GPUS)"
  fi
}
# Verify the GPU fabric registration reported by `nvidia-smi -q`.
# Every "Fabric" block must show State "Completed" and Status "Success" in the
# four lines that follow its header (the same window the displayed
# `grep -A4 Fabric` uses). Checking per block matters on multi-GPU trays:
# searching the whole output for the two words lets one healthy GPU mask
# another that is still in progress or failed.
# NOTE(review): the block layout (a line containing only "Fabric", followed by
# "State : ..." and "Status : ...") is taken from NVIDIA's documented sample
# output; if no such header is found, the original whole-output keyword check
# is used instead so unfamiliar formats are not judged more strictly.
chk_fabric_status() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (fabric)"
    return
  fi
  # Plain `bash -c` (not a login shell): profile scripts are not needed for a
  # pipeline and could alter PATH or add unrelated output.
  if ! run_cmd "[GPU] Fabric block (nvidia-smi -q | grep -A4 Fabric)" \
      bash -c "nvidia-smi -q | grep -n 'Fabric' -A 4"; then
    mark_fail "Failed to read Fabric block"
    return
  fi

  local q tally blocks state_ok status_ok
  q="$(nvidia-smi -q 2>/dev/null || true)"
  tally="$(awk '
    { line = tolower($0) }
    line ~ /^[[:space:]]*fabric[[:space:]]*$/ { blocks++; left = 4; next }
    left > 0 {
      left--
      if (line ~ /^[[:space:]]*state[[:space:]]*:/ && line ~ /completed/) state_ok++
      if (line ~ /^[[:space:]]*status[[:space:]]*:/ && line ~ /success/) status_ok++
    }
    END { printf "%d %d %d\n", blocks, state_ok, status_ok }
  ' <<<"$q")"
  read -r blocks state_ok status_ok <<<"$tally"
  blocks="${blocks:-0}"
  state_ok="${state_ok:-0}"
  status_ok="${status_ok:-0}"

  local healthy=0
  if (( blocks > 0 )); then
    if (( state_ok == blocks && status_ok == blocks )); then
      healthy=1
    fi
  elif grep -qi 'Fabric' <<<"$q" && grep -qi 'Completed' <<<"$q" && grep -qi 'Success' <<<"$q"; then
    # No recognizable block header: fall back to the loose keyword check.
    healthy=1
  fi

  if (( healthy )); then
    mark_pass "Fabric state Completed/Success"
  else
    mark_fail "Fabric state not Completed/Success"
  fi
}
# Verify NVLink state from `nvidia-smi nvlink --status`.
# Records: a failure if the tool is missing, the command fails, or the output
# contains no "Link <n>:" lines at all; otherwise two results - one for the
# presence of <inactive> links and one for whether every active link line
# contains "50 GB/s".
chk_nvlink_status() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (nvlink)"
    return
  fi
  if ! run_cmd "[NVLink] nvidia-smi nvlink --status" nvidia-smi nvlink --status; then
    mark_fail "nvidia-smi nvlink --status failed"
    return
  fi

  local s links
  s="$(nvidia-smi nvlink --status 2>/dev/null || true)"
  links="$(grep -E 'Link [0-9]+:' <<<"$s" || true)"

  # With no link lines both checks below would pass vacuously, reporting
  # healthy NVLink on a system that listed no links.
  if [[ -z "$links" ]]; then
    mark_fail "NVLink: no link status lines reported"
    return
  fi

  if grep -q "<inactive>" <<<"$s"; then
    mark_fail "NVLink: one or more links <inactive>"
  else
    mark_pass "NVLink: no <inactive> links"
  fi

  # Among the active (not <inactive>) link lines, look for any that does not
  # mention 50 GB/s.
  if grep -Ev '<inactive>' <<<"$links" | grep -vq '50 GB/s'; then
    mark_fail "NVLink: some active links not 50 GB/s"
  else
    mark_pass "NVLink: all active links report 50 GB/s"
  fi
}
# ==============================================================================
# TOPOLOGY PARSER
# ==============================================================================
# Verify the GPU peer-to-peer matrix printed by `nvidia-smi topo -p2p n`.
# The matrix is the run of lines from the first line starting with "GPU<n>"
# (after optional whitespace) up to the next blank line. Its first line is
# treated as the column header; in the remaining rows every cell from the
# second column on is checked against a fixed set of "not OK" tokens.
chk_topology() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (topology)"
    return
  fi
  if ! run_cmd "[Topology] nvidia-smi topo -p2p n" nvidia-smi topo -p2p n; then
    mark_fail "nvidia-smi topo failed"
    return
  fi

  local raw grid offenders
  raw="$(nvidia-smi topo -p2p n 2>/dev/null || true)"

  # Cut out the matrix block; anything after the blank line is ignored.
  grid="$(awk '
    !started && /^[[:space:]]*GPU[0-9]+/ { started = 1 }
    !started { next }
    NF == 0 { exit }
    { print }
  ' <<< "$raw")"
  if [[ -z "$grid" ]]; then
    mark_fail "Topology: unable to parse matrix block"
    return
  fi

  # Collect each distinct offending token once, space-terminated.
  offenders="$(awk '
    NR > 1 {
      for (col = 2; col <= NF; col++) {
        if ($col ~ /^(N\/A|WARN|FAIL|TNS|CNS|GNS|NS|U)$/) {
          seen[$col] = 1
        }
      }
    }
    END {
      for (tok in seen) printf "%s ", tok
    }
  ' <<< "$grid")"
  if [[ -n "$offenders" ]]; then
    mark_fail "Topology matrix has non-OK entries: ${offenders%% }"
    return
  fi

  mark_pass "Topology: GPU↔GPU links OK (diagonal X ignored)"
}
# ==============================================================================
# SERVICE & IMEX CHECKS
# ==============================================================================
# Verify that the nvidia-persistenced and nvidia-imex systemd units are active.
# When systemctl is not installed the check is skipped with a single warning
# instead of reporting both services as failed. For each active service the
# `systemctl status` output is displayed; its exit status is ignored.
chk_services() {
  if ! have systemctl; then
    mark_warn "systemctl not available; skipped service checks"
    return
  fi

  local svc
  for svc in nvidia-persistenced nvidia-imex; do
    if systemctl is-active --quiet "$svc"; then
      mark_pass "Service active: $svc"
      # Informational only: do not let a nonzero status affect the result.
      run_cmd "[Service] systemctl status $svc" systemctl status "$svc" || true
    else
      mark_fail "Service NOT active: $svc"
    fi
  done
}
# Verify /usr/bin/nvidia-imex exists, is executable, and that
# `nvidia-imex --version` exits successfully. Records exactly one result.
chk_imex_version() {
  if [[ ! -x /usr/bin/nvidia-imex ]]; then
    mark_fail "IMEX binary not found at /usr/bin/nvidia-imex"
    return
  fi
  # Explicit if/else so a nonzero status from mark_pass can never fall
  # through to mark_fail (as it could with `cmd && mark_pass || mark_fail`).
  if run_cmd "[IMEX] /usr/bin/nvidia-imex --version" /usr/bin/nvidia-imex --version; then
    mark_pass "IMEX version retrieved"
  else
    mark_fail "IMEX version failed"
  fi
}
# Verify that `nvidia-imex-ctl -N` exits successfully; its output is displayed
# through run_cmd. Records exactly one result.
chk_imex_domain() {
  if ! have nvidia-imex-ctl; then
    mark_fail "nvidia-imex-ctl not found"
    return
  fi
  # Explicit if/else so a nonzero status from mark_pass can never fall
  # through to mark_fail (as it could with `cmd && mark_pass || mark_fail`).
  if run_cmd "[IMEX] nvidia-imex-ctl -N" nvidia-imex-ctl -N; then
    mark_pass "IMEX domain query succeeded"
  else
    mark_fail "IMEX domain query failed"
  fi
}
# Run every compute-tray check in a fixed order. Each check records its own
# result, so a failing check never prevents the later ones from running.
run_compute() {
  section "MNNVL Verification (compute, local)"
  local check
  for check in \
    chk_imex_channel \
    chk_kernel \
    chk_nvidia_smi \
    chk_fabric_status \
    chk_nvlink_status \
    chk_topology \
    chk_services \
    chk_imex_version \
    chk_imex_domain
  do
    "$check"
  done
}
# ==============================================================================
# SWITCH CHECKS
# ==============================================================================
# Verify that `nv show system health` exits successfully on the switch; its
# output is displayed through run_cmd. Records exactly one result.
chk_switch_health() {
  if ! have nv; then
    mark_fail "'nv' CLI not found on switch"
    return
  fi
  # Explicit if/else: this is the first check in switch mode, so with
  # `cmd && mark_pass || mark_fail` a nonzero status from mark_pass would
  # record a failure on a healthy switch as well.
  if run_cmd "[Switch] nv show system health" nv show system health; then
    mark_pass "Switch system health command OK"
  else
    mark_fail "Switch system health failed"
  fi
}
# Verify the cluster apps reported by the switch `nv` CLI.
# Displays `nv show cluster apps` and `nv show cluster apps running`, then
# requires a line starting with "<app><whitespace>ok" in the running list for
# both nmx-controller and nmx-telemetry.
# A missing `nv` CLI is recorded as a failure rather than silently skipping
# the whole check.
chk_switch_apps() {
  if ! have nv; then
    mark_fail "'nv' CLI not found on switch (cluster apps)"
    return
  fi

  run_cmd "[Switch] nv show cluster apps" nv show cluster apps \
    || mark_fail "nv show cluster apps failed"
  run_cmd "[Switch] nv show cluster apps running" nv show cluster apps running \
    || mark_fail "nv show cluster apps running failed"

  # Capture the running list once, under the same time limit as run_cmd, and
  # evaluate both apps against that single snapshot.
  local running app
  running="$(timeout_run nv show cluster apps running 2>/dev/null || true)"
  for app in nmx-controller nmx-telemetry; do
    if grep -qE "^${app}[[:space:]]+ok" <<<"$running"; then
      mark_pass "${app} status ok"
    else
      mark_fail "${app} not ok/missing"
    fi
  done
}
# Probe the NMX-T HTTP endpoints on SWITCH_HEALTHCHECK_HOST with curl.
# Only the healthcheck on port 9350 is scored: its body must contain a
# "status" field equal to 0 and a "message" field equal to "OK" (whitespace
# around the colon is tolerated). The two port-9352 endpoints are displayed
# for information only and never affect the result.
chk_nmx_endpoints() {
  if ! have curl; then
    mark_fail "curl not found for NMX-T endpoints"
    return
  fi

  local base="http://${SWITCH_HEALTHCHECK_HOST}"
  run_cmd "[Switch] NMX-T healthcheck (9350)" curl -fsS "${base}:9350/healthcheck" || true

  # This second request is not wrapped by run_cmd, so give curl its own time
  # limit; otherwise an unresponsive endpoint could stall the whole run.
  local hc
  hc="$(curl -fsS --max-time "${TIMEOUT:-30}" "${base}:9350/healthcheck" 2>/dev/null || true)"

  # Regexes kept in variables so the embedded quotes are matched literally.
  local re_status='"status"[[:space:]]*:[[:space:]]*0([^0-9.]|$)'
  local re_message='"message"[[:space:]]*:[[:space:]]*"OK"'
  if [[ "$hc" =~ $re_status && "$hc" =~ $re_message ]]; then
    mark_pass "NMX-T healthcheck OK"
  else
    mark_fail "NMX-T healthcheck unexpected"
  fi

  run_cmd "[Switch] NMX-T statistics (9352)" curl -fsS "${base}:9352/management/statistics" || true
  run_cmd "[Switch] NMX-T mgmt status (9352)" curl -fsS "${base}:9352/management/check_status" || true
}
# Run every switch-tray check in a fixed order. Each check records its own
# result, so a failing check never prevents the later ones from running.
run_switch() {
  section "MNNVL Verification (switch, local)"
  local check
  for check in chk_switch_health chk_switch_apps chk_nmx_endpoints; do
    "$check"
  done
}
# ==============================================================================
# CLI ARGUMENT PARSING
# ==============================================================================
# Parse command-line options into the global settings.
# Exits 2 (after printing usage) on an unknown option, on a value-taking
# option that has no non-empty value, or on a non-numeric --require-gpus;
# exits 0 after printing usage for -h/--help.
# The explicit value check is required: `shift 2` with a single remaining
# argument fails without shifting, which would make the loop spin forever on
# e.g. a trailing `--role`.
parse_args() {
  local opt
  while (( $# > 0 )); do
    opt="$1"
    case "$opt" in
      --role|--require-gpus|--health-host|--log)
        if (( $# < 2 )) || [[ -z "$2" ]]; then
          warn "Option $opt requires a value"
          usage
          exit 2
        fi
        case "$opt" in
          --role)
            ROLE="$2"
            ;;
          --require-gpus)
            # The value is later used in arithmetic, so insist on digits.
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              warn "Option $opt expects a non-negative integer, got: $2"
              usage
              exit 2
            fi
            REQUIRE_N_GPUS="$(( 10#$2 ))"
            ;;
          --health-host)
            SWITCH_HEALTHCHECK_HOST="$2"
            ;;
          --log)
            ENABLE_LOG=1
            LOG_FILE="$2"
            ;;
        esac
        shift 2
        ;;
      --unsafe)
        SAFE_MODE=0
        shift
        ;;
      --no-log)
        ENABLE_LOG=0
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        warn "Unknown arg: $opt"
        usage
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# ==============================================================================
# ENABLE LOGGING
# ==============================================================================
# When logging is enabled, duplicate everything the script prints from here on
# into LOG_FILE (appending) while still showing it on the terminal.
if (( ENABLE_LOG )); then
# Best effort: a failure to create the directory is ignored here.
mkdir -p "$(dirname -- "$LOG_FILE")" 2>/dev/null || true
# Order matters: stdout is first pointed at tee, then stderr is pointed at
# the same place, so both streams end up in the log.
exec > >(tee -a "$LOG_FILE") 2>&1
say "Logging to: $LOG_FILE"
fi
# ==============================================================================
# DISPATCH
# ==============================================================================
# Run the check suite for the selected role; any other role value is a usage
# error (exit status 2).
if [[ "$ROLE" == "compute" ]]; then
  run_compute
elif [[ "$ROLE" == "switch" ]]; then
  run_switch
else
  bad "Invalid --role '$ROLE'"
  usage
  exit 2
fi
# ==============================================================================
# SUMMARY
# ==============================================================================
# Print the run configuration and the pass/warn/fail tallies, list every
# failed item, and exit 1 if anything failed (0 otherwise).
section "Summary"
say "Role: $ROLE Safe Output: $(if [[ $SAFE_MODE -eq 1 ]]; then echo ON; else echo OFF; fi) Timeout: ${TIMEOUT}s"
say "Kernel min: ${KERNEL_MIN_VERSION}-${KERNEL_MIN_BUILD}-${REQUIRE_KERNEL_FLAVOR} GPUs required: ${REQUIRE_N_GPUS}"
if (( ENABLE_LOG )); then
  say "Log file: $LOG_FILE"
fi
say "Results: ${PASS}: ${PASS_CNT} ${WARN}: ${WARN_CNT} ${FAIL}: ${FAIL_CNT}"
if (( FAIL_CNT <= 0 )); then
  ok "All checks passed"
  exit 0
fi
say "Failed items:"
for it in "${FAILED_ITEMS[@]}"; do
  say " - $it"
done
exit 1
@xxrjun
Copy link
Author

xxrjun commented Oct 15, 2025

Expected Output

[2025-10-15 11:09:43] INFO  ======== Summary ========
[2025-10-15 11:09:43] INFO  Role: compute   Safe Output: ON   Timeout: 30s
[2025-10-15 11:09:43] INFO  Kernel min: 6.5.0-1024-nvidia-64k   GPUs required: 1
[2025-10-15 11:09:43] INFO  Log file: ../log/verify_mnnvl_20251015_110941.log
[2025-10-15 11:09:43] INFO  Results: OK: 11   WARN: 0   FAIL: 0
[2025-10-15 11:09:43] INFO  OK  All checks passed

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment