Created
October 15, 2025 03:22
-
-
Save xxrjun/a0eecb033a529522b9cec80c1145359a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# verify_mnnvl_health.sh
# Local verification for MNNVL compute tray & NVLink switch health.
# - Runs all checks to completion; no early exit.
# - Safe output mode ON by default (redacts IP/MAC/long HEX/IDs).
# - Prints a final Summary and returns nonzero if any check failed.
#
# Reference: https://docs.nvidia.com/multi-node-nvlink-systems/mnnvl-user-guide/verifying.html
# ==============================================================================
# UI UTILITIES
# ==============================================================================
# Status tags embedded in log messages and in the final summary line.
PASS="OK"
FAIL="FAIL"
WARN="WARN"

# log LEVEL MESSAGE...
# Print one line to stdout: "[YYYY-MM-DD HH:MM:SS] LEVEL MESSAGE", with LEVEL
# left-aligned and padded to five columns.
log() {
  local level="$1"
  shift
  printf '[%s] %-5s %s\n' "$(date '+%F %T')" "$level" "$*"
}

# say MESSAGE... - log at INFO level.
say() { log INFO "$*"; }

# section TITLE... - emit a blank line followed by an INFO banner line.
section() {
  printf '\n'
  log INFO "======== $* ========"
}

# ok / bad / warn MESSAGE... - log MESSAGE prefixed with the matching status
# tag, at INFO / ERROR / WARN level respectively.
ok() { log INFO "${PASS} $*"; }
bad() { log ERROR "${FAIL} $*"; }
warn() { log WARN "${WARN} $*"; }
# ==============================================================================
# DEFAULTS
# ==============================================================================
# Some of these are overridden by command-line options (see usage).
ROLE="compute" # compute | switch
SWITCH_HEALTHCHECK_HOST="0.0.0.0" # host part of the NMX-T curl URLs
REQUIRE_N_GPUS=1 # minimum GPU count for the GPU check to pass
KERNEL_MIN_VERSION="6.5.0" # compared against the X.Y.Z part of `uname -r`
KERNEL_MIN_BUILD=1024 # compared against the number following the first '-'
REQUIRE_KERNEL_FLAVOR="nvidia-64k" # substring that `uname -r` must contain
TIMEOUT=30 # seconds allowed per wrapped command
SAFE_MODE=1 # 1 = redact command output, 0 = print it raw
# ==============================================================================
# LOGGING (default ON to ../log/verify_mnnvl_<timestamp>.log)
# ==============================================================================
# LOG_DIR is a relative path, so it resolves against the directory the script
# is run from.
ENABLE_LOG=1
LOG_DIR="../log"
TIME_STAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="${LOG_DIR}/verify_mnnvl_${TIME_STAMP}.log"
# ==============================================================================
# HELPERS
# ==============================================================================
# usage - print the help text to stdout. The defaults shown are the current
# values of REQUIRE_N_GPUS, SWITCH_HEALTHCHECK_HOST and LOG_FILE.
usage() {
  cat <<EOF
verify_mnnvl_health.sh
Local verification for MNNVL compute trays or NVLink switch trays.
Usage:
${0##*/} [--role compute|switch] [--require-gpus N] [--health-host HOST]
[--unsafe] [--log FILE] [--no-log]
Options:
--role ROLE 'compute' (default) or 'switch'
--require-gpus N Minimum GPUs expected (default ${REQUIRE_N_GPUS})
--health-host HOST Host/IP for NMX-T curl endpoints (default ${SWITCH_HEALTHCHECK_HOST})
--unsafe Print raw command outputs (disable redaction)
--log FILE Override log path (default: ${LOG_FILE})
--no-log Do not write log file
-h|--help Show help
EOF
}
# have NAME - succeed, printing nothing, when `command -v` can resolve NAME.
have() {
  command -v "$1" >/dev/null 2>&1
}
# timeout_run CMD [ARG...]
# Run CMD under a limit of $TIMEOUT seconds: through timeout(1) when that is
# available, otherwise through a perl one-liner that arms alarm() and then
# exec()s the command. Returns the status of whichever wrapper ran.
timeout_run() {
  if have timeout; then
    timeout "$TIMEOUT" "$@"
  else
    perl -e 'alarm shift @ARGV; exec @ARGV' "$TIMEOUT" "$@"
  fi
}
# Redact sensitive strings while keeping outputs readable.
# Filter: reads stdin, writes stdout. Replacements are applied in this order:
#   dotted quads of 1-3 digits                      -> <IP>
#   six colon-separated hex byte pairs              -> <MAC>
#   any run of 8 or more hex digits                 -> <HEX>
#   two or more "xxxx-" groups followed by alnums   -> <ID>
mask_safe() {
  sed -E \
    -e 's/([0-9]{1,3}\.){3}[0-9]{1,3}/<IP>/g' \
    -e 's/([0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}/<MAC>/g' \
    -e 's/[0-9A-Fa-f]{8,}/<HEX>/g' \
    -e 's/([A-Za-z0-9]{4}-){2,}[A-Za-z0-9]+/<ID>/g'
}
# Run a command, show its (possibly redacted) output, return rc but DO NOT exit.
#
# run_cmd LABEL CMD [ARG...]
#   - prints a section banner for LABEL and a DEBUG line with the command line
#   - runs CMD through timeout_run, capturing stdout+stderr in a temp file
#   - prints at most the first 200 captured lines, followed by a blank line
#   - with SAFE_MODE on, both the command line and the output go through
#     mask_safe
# Returns CMD's exit status as reported by timeout_run, or 1 when the temp
# file cannot be created.
run_cmd() {
  local label="$1"
  shift
  section "$label"

  # The command line can itself carry sensitive values (e.g. the health-check
  # host inside a curl URL), so it is redacted as well in safe mode.
  local cmdline="\$ $*"
  if (( SAFE_MODE )); then
    cmdline="$(mask_safe <<<"$cmdline")"
  fi
  log DEBUG "$cmdline"

  local tmp
  if ! tmp="$(mktemp)"; then
    log ERROR "mktemp failed; cannot capture output of: $label"
    return 1
  fi

  local rc=0
  timeout_run "$@" >"$tmp" 2>&1 || rc=$?
  if (( SAFE_MODE )); then
    mask_safe <"$tmp" | sed -n '1,200p'
  else
    sed -n '1,200p' "$tmp"
  fi
  echo
  rm -f -- "$tmp"
  return "$rc"
}
# Semantic version comparison X.Y.Z (ignore distro suffix).
#
# semver_ge A B - succeed when A >= B.
# Everything from the first '-' onward is dropped from both arguments, the
# rest is split on '.', and the first three components are compared as base-10
# integers. Missing components count as 0. Any trailing non-digit text inside
# a component (e.g. "0+" or "0rc1") is ignored rather than being fed to the
# arithmetic evaluator.
semver_ge() {
  local -a lhs rhs
  IFS=. read -r -a lhs <<<"${1%%-*}"
  IFS=. read -r -a rhs <<<"${2%%-*}"

  local i a b
  for i in 0 1 2; do
    a="${lhs[i]:-0}"
    a="${a%%[!0-9]*}"
    a="${a:-0}"
    b="${rhs[i]:-0}"
    b="${b%%[!0-9]*}"
    b="${b:-0}"
    # 10# forces decimal so that "08" is not parsed as (invalid) octal.
    (( 10#$a > 10#$b )) && return 0
    (( 10#$a < 10#$b )) && return 1
  done
  return 0
}
# extract_build RELEASE
# Print the integer that immediately follows the first '-' in RELEASE
# (e.g. 1024 for "6.8.0-1024-nvidia-64k"), normalised to base 10.
# Prints 0 when RELEASE has no '-' or when the text after it does not start
# with a digit.
extract_build() {
  local release="$1"
  # Without a '-' there is no build field; do not mistake "6" of "6.5.0" for one.
  if [[ "$release" != *-* ]]; then
    echo 0
    return 0
  fi
  local rest="${release#*-}"
  if [[ "$rest" =~ ^([0-9]+) ]]; then
    # 10# keeps a leading zero from being read as octal by later arithmetic.
    echo "$((10#${BASH_REMATCH[1]}))"
  else
    echo 0
  fi
}
# ==============================================================================
# SUMMARY COUNTERS
# ==============================================================================
PASS_CNT=0
FAIL_CNT=0
WARN_CNT=0
FAILED_ITEMS=() # messages of every failed check, in the order they occurred

# mark_pass / mark_fail / mark_warn MESSAGE
# Log MESSAGE with the matching status tag and bump the matching counter;
# mark_fail also remembers MESSAGE for the final summary.
# The counters are bumped with an assignment rather than ((X++)): the latter
# returns status 1 when the counter was 0, which made the first pass of a run
# look like a failure to callers that chain `... && mark_pass || mark_fail`.
mark_pass() {
  ok "$1"
  PASS_CNT=$((PASS_CNT + 1))
}

mark_fail() {
  bad "$1"
  FAILED_ITEMS+=("$1")
  FAIL_CNT=$((FAIL_CNT + 1))
}

mark_warn() {
  warn "$1"
  WARN_CNT=$((WARN_CNT + 1))
}
# ==============================================================================
# COMPUTE CHECKS
# ==============================================================================
# chk_imex_channel - pass when /dev/nvidia-caps-imex-channels/channel0 exists
# and is a character device, fail otherwise.
chk_imex_channel() {
  local dev="/dev/nvidia-caps-imex-channels/channel0"
  if [[ -c "$dev" ]]; then
    mark_pass "IMEX channel0 present ($dev)"
  else
    mark_fail "IMEX channel0 missing ($dev)"
  fi
}
# chk_kernel - validate the running kernel (`uname -r`) against three
# independent requirements, recording one failure per unmet requirement:
#   version >= KERNEL_MIN_VERSION         (via semver_ge)
#   build   >= KERNEL_MIN_BUILD           (build number via extract_build)
#   release contains REQUIRE_KERNEL_FLAVOR
# A single "Kernel OK" pass is recorded only when all three hold.
chk_kernel() {
  local release build
  release="$(uname -r)"
  build="$(extract_build "$release")"
  say "Kernel: $release (build $build)"

  local all_ok=1
  if ! semver_ge "$release" "$KERNEL_MIN_VERSION"; then
    mark_fail "Kernel version < $KERNEL_MIN_VERSION"
    all_ok=0
  fi
  # Negated ">=" so that an unparsable build value also counts as a failure.
  if ! (( build >= KERNEL_MIN_BUILD )); then
    mark_fail "Kernel build < $KERNEL_MIN_BUILD"
    all_ok=0
  fi
  if [[ "$release" != *"$REQUIRE_KERNEL_FLAVOR"* ]]; then
    mark_fail "Kernel flavor missing '$REQUIRE_KERNEL_FLAVOR'"
    all_ok=0
  fi

  if (( all_ok )); then
    mark_pass "Kernel OK"
  fi
  return 0
}
# chk_nvidia_smi - show `nvidia-smi` output, then count GPUs (one non-empty
# line per GPU from `--query-gpu=name`) and compare against REQUIRE_N_GPUS.
# Records exactly one pass or one fail.
chk_nvidia_smi() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found"
    return
  fi
  if ! run_cmd "[GPU] nvidia-smi" nvidia-smi; then
    mark_fail "nvidia-smi execution failed"
    return
  fi

  local cnt
  # grep -c prints the count even when it is 0 (and then exits 1, hence || true).
  cnt="$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | grep -c . || true)"
  [[ -z "$cnt" ]] && cnt=0

  # Plain if/else: with `A && mark_pass || mark_fail`, a non-zero status from
  # mark_pass would record a failure on top of the pass.
  if (( cnt >= REQUIRE_N_GPUS )); then
    mark_pass "Detected $cnt GPU(s) (>= $REQUIRE_N_GPUS)"
  else
    mark_fail "Detected $cnt GPU(s) (< $REQUIRE_N_GPUS)"
  fi
}
# chk_fabric_status - verify the Fabric section(s) of `nvidia-smi -q`.
#
# A Fabric section is taken to be a line containing "Fabric" and no ':'
# followed, within the next four lines (the same window the displayed
# `grep -A 4` uses), by "State : Completed" and "Status : Success"
# (matched case-insensitively).
# NOTE(review): layout assumed from the `nvidia-smi -q | grep Fabric -A 4`
# example in the MNNVL guide - confirm against the deployed driver.
#
# Passes only when at least one section exists and every section is healthy,
# so one good GPU can no longer hide a bad one and an unrelated "Success"
# elsewhere in the report no longer satisfies the check.
chk_fabric_status() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (fabric)"
    return
  fi
  if ! run_cmd "[GPU] Fabric block (nvidia-smi -q | grep -A4 Fabric)" \
    bash -lc "nvidia-smi -q | grep -n 'Fabric' -A 4"; then
    mark_fail "Failed to read Fabric block"
    return
  fi

  local report tally blocks unhealthy
  report="$(nvidia-smi -q 2>/dev/null || true)"
  # Prints "<sections seen> <sections not Completed/Success>".
  tally="$(awk '
    function finish() {
      if (inblk) { blocks++; if (!(st_ok && ss_ok)) unhealthy++ }
      inblk = 0
    }
    /Fabric/ && !/:/ { finish(); inblk = 1; left = 4; st_ok = 0; ss_ok = 0; next }
    inblk && left > 0 {
      left--
      line = tolower($0)
      if (line ~ /state[ \t]*:/)  st_ok = (line ~ /:[ \t]*completed/)
      if (line ~ /status[ \t]*:/) ss_ok = (line ~ /:[ \t]*success/)
    }
    END { finish(); printf "%d %d", blocks, unhealthy }
  ' <<<"$report")"
  read -r blocks unhealthy <<<"$tally"

  if (( ${blocks:-0} > 0 && ${unhealthy:-0} == 0 )); then
    mark_pass "Fabric state Completed/Success"
  else
    mark_fail "Fabric state not Completed/Success"
  fi
}
# chk_nvlink_status - inspect `nvidia-smi nvlink --status`.
# Records:
#   - a failure when the command fails or reports no "Link N:" lines at all
#     (previously an empty report passed both checks vacuously);
#   - pass/fail on whether any "<inactive>" marker appears in the report;
#   - pass/fail on whether every active link line reports exactly "50 GB/s"
#     (a value such as "150 GB/s" or "26.50 GB/s" no longer satisfies it).
chk_nvlink_status() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (nvlink)"
    return
  fi
  if ! run_cmd "[NVLink] nvidia-smi nvlink --status" nvidia-smi nvlink --status; then
    mark_fail "nvidia-smi nvlink --status failed"
    return
  fi

  local report links active
  report="$(nvidia-smi nvlink --status 2>/dev/null || true)"
  links="$(grep -E 'Link [0-9]+:' <<<"$report" || true)"
  if [[ -z "$links" ]]; then
    mark_fail "NVLink: no links reported"
    return
  fi

  if grep -q '<inactive>' <<<"$report"; then
    mark_fail "NVLink: one or more links <inactive>"
  else
    mark_pass "NVLink: no <inactive> links"
  fi

  active="$(grep -v '<inactive>' <<<"$links" || true)"
  # Guard on non-empty: a here-string of "" still feeds grep one empty line.
  if [[ -n "$active" ]] && grep -Evq '(^|[^0-9.])50 GB/s' <<<"$active"; then
    mark_fail "NVLink: some active links not 50 GB/s"
  else
    mark_pass "NVLink: all active links report 50 GB/s"
  fi
}
# ==============================================================================
# TOPOLOGY PARSER
# ==============================================================================
# chk_topology - parse the matrix printed by `nvidia-smi topo -p2p n`.
#
# The matrix is taken to start at the first line beginning with "GPU<n>"
# (after optional whitespace) and to end at the next blank line; its first
# line is treated as the column header and skipped. In the remaining rows,
# every field after the row label is checked against the list of known
# non-OK tokens (N/A WARN FAIL TNS CNS GNS NS U); any other token, including
# "OK" and the diagonal "X", is accepted.
# Offending tokens are reported once each, in the fixed order of that list,
# so the failure message is deterministic.
chk_topology() {
  if ! have nvidia-smi; then
    mark_fail "nvidia-smi not found (topology)"
    return
  fi
  if ! run_cmd "[Topology] nvidia-smi topo -p2p n" nvidia-smi topo -p2p n; then
    mark_fail "nvidia-smi topo failed"
    return
  fi

  local report matrix bad_tokens
  report="$(nvidia-smi topo -p2p n 2>/dev/null || true)"

  matrix="$(awk '
    /^[[:space:]]*GPU[0-9]+/ { in_matrix = 1 }
    in_matrix && NF == 0     { exit }
    in_matrix                { print }
  ' <<<"$report")"
  if [[ -z "$matrix" ]]; then
    mark_fail "Topology: unable to parse matrix block"
    return
  fi

  bad_tokens="$(awk '
    BEGIN {
      n = split("N/A WARN FAIL TNS CNS GNS NS U", known, " ")
      for (j = 1; j <= n; j++) is_bad[known[j]] = 1
    }
    NR == 1 { next }
    {
      for (i = 2; i <= NF; i++) if ($i in is_bad) seen[$i] = 1
    }
    END {
      sep = ""
      for (j = 1; j <= n; j++) {
        if (known[j] in seen) { printf "%s%s", sep, known[j]; sep = " " }
      }
    }
  ' <<<"$matrix")"
  if [[ -n "$bad_tokens" ]]; then
    mark_fail "Topology matrix has non-OK entries: ${bad_tokens}"
    return
  fi
  mark_pass "Topology: GPU↔GPU links OK (diagonal X ignored)"
}
# ==============================================================================
# SERVICE & IMEX CHECKS
# ==============================================================================
# chk_services - check that nvidia-persistenced and nvidia-imex are active
# according to `systemctl is-active`, showing `systemctl status` for each
# active one.
# When systemctl is not available the check is skipped with a single warning
# (the warning existed before but could never be reached, so a missing
# systemctl was reported as two service failures).
chk_services() {
  if ! have systemctl; then
    mark_warn "systemctl not available; skipped service checks"
    return
  fi

  local svc
  for svc in nvidia-persistenced nvidia-imex; do
    if systemctl is-active --quiet "$svc"; then
      mark_pass "Service active: $svc"
      # Status output is informational only; its exit code is ignored.
      run_cmd "[Service] systemctl status $svc" systemctl status "$svc" || true
    else
      mark_fail "Service NOT active: $svc"
    fi
  done
}
# chk_imex_version - run `/usr/bin/nvidia-imex --version` and record a pass
# when it exits 0, a fail when it does not or when the binary is missing or
# not executable. Exactly one result is recorded per call.
chk_imex_version() {
  local imex_bin="/usr/bin/nvidia-imex"
  if [[ ! -x "$imex_bin" ]]; then
    mark_fail "IMEX binary not found at /usr/bin/nvidia-imex"
    return
  fi
  # if/else instead of `A && mark_pass || mark_fail`, which also recorded a
  # failure whenever mark_pass itself returned non-zero.
  if run_cmd "[IMEX] /usr/bin/nvidia-imex --version" "$imex_bin" --version; then
    mark_pass "IMEX version retrieved"
  else
    mark_fail "IMEX version failed"
  fi
}
# chk_imex_domain - run `nvidia-imex-ctl -N` and record a pass when it exits
# 0, a fail when it does not or when the tool is not installed. Exactly one
# result is recorded per call.
chk_imex_domain() {
  if ! have nvidia-imex-ctl; then
    mark_fail "nvidia-imex-ctl not found"
    return
  fi
  # if/else instead of `A && mark_pass || mark_fail`, which also recorded a
  # failure whenever mark_pass itself returned non-zero.
  if run_cmd "[IMEX] nvidia-imex-ctl -N" nvidia-imex-ctl -N; then
    mark_pass "IMEX domain query succeeded"
  else
    mark_fail "IMEX domain query failed"
  fi
}
# run_compute - run every compute-tray check in a fixed order. Each check
# records its own result; nothing here stops on failure.
run_compute() {
  section "MNNVL Verification (compute, local)"
  chk_imex_channel
  chk_kernel
  chk_nvidia_smi
  chk_fabric_status
  chk_nvlink_status
  chk_topology
  chk_services
  chk_imex_version
  chk_imex_domain
}
# ==============================================================================
# SWITCH CHECKS
# ==============================================================================
# chk_switch_health - run `nv show system health` and record a pass when the
# command exits 0, a fail when it does not or when the `nv` CLI is missing.
# Only the exit status is judged; the output is displayed, not parsed.
chk_switch_health() {
  if ! have nv; then
    mark_fail "'nv' CLI not found on switch"
    return
  fi
  # if/else instead of `A && mark_pass || mark_fail`: this is the first switch
  # check, and a non-zero status from mark_pass would add a spurious failure.
  if run_cmd "[Switch] nv show system health" nv show system health; then
    mark_pass "Switch system health command OK"
  else
    mark_fail "Switch system health failed"
  fi
}
# chk_switch_apps - show `nv show cluster apps` and `nv show cluster apps
# running` (a failure is recorded for either command that exits non-zero),
# then require that the "running" listing has a line starting with
# "nmx-controller" and one starting with "nmx-telemetry", each followed by
# whitespace and "ok".
# A missing `nv` CLI is now reported as a failure instead of silently
# skipping the whole check, matching how the other checks treat a missing tool.
chk_switch_apps() {
  if ! have nv; then
    mark_fail "'nv' CLI not found on switch (cluster apps)"
    return
  fi

  run_cmd "[Switch] nv show cluster apps" nv show cluster apps \
    || mark_fail "nv show cluster apps failed"
  run_cmd "[Switch] nv show cluster apps running" nv show cluster apps running \
    || mark_fail "nv show cluster apps running failed"

  # Query once and test both applications against the same snapshot.
  local running app
  running="$(nv show cluster apps running 2>/dev/null || true)"
  for app in nmx-controller nmx-telemetry; do
    if grep -qE "^${app}[[:space:]]+ok" <<<"$running"; then
      mark_pass "$app status ok"
    else
      mark_fail "$app not ok/missing"
    fi
  done
}
# chk_nmx_endpoints - probe the NMX-T HTTP endpoints on
# SWITCH_HEALTHCHECK_HOST with curl.
#   :9350/healthcheck               displayed, then judged: the body must
#                                   contain "status":0 and "message":"OK"
#   :9352/management/statistics     displayed only (no pass/fail recorded)
#   :9352/management/check_status   displayed only (no pass/fail recorded)
# The judged request is bounded by --max-time $TIMEOUT so an unresponsive
# host cannot stall the run, and whitespace is stripped from the body before
# matching so pretty-printed JSON is accepted as well.
chk_nmx_endpoints() {
  if ! have curl; then
    mark_fail "curl not found for NMX-T endpoints"
    return
  fi

  local base="http://${SWITCH_HEALTHCHECK_HOST}"
  run_cmd "[Switch] NMX-T healthcheck (9350)" curl -fsS "${base}:9350/healthcheck" || true

  local body
  body="$(curl -fsS --max-time "$TIMEOUT" "${base}:9350/healthcheck" 2>/dev/null || true)"
  body="${body//[[:space:]]/}"
  if [[ "$body" == *'"status":0'* && "$body" == *'"message":"OK"'* ]]; then
    mark_pass "NMX-T healthcheck OK"
  else
    mark_fail "NMX-T healthcheck unexpected"
  fi

  run_cmd "[Switch] NMX-T statistics (9352)" curl -fsS "${base}:9352/management/statistics" || true
  run_cmd "[Switch] NMX-T mgmt status (9352)" curl -fsS "${base}:9352/management/check_status" || true
}
# run_switch - run every switch-tray check in a fixed order. Each check
# records its own result; nothing here stops on failure.
run_switch() {
  section "MNNVL Verification (switch, local)"
  chk_switch_health
  chk_switch_apps
  chk_nmx_endpoints
}
# ==============================================================================
# CLI ARGUMENT PARSING
# ==============================================================================
# parse_args ARG...
# Apply command-line options to the global settings (ROLE, REQUIRE_N_GPUS,
# SWITCH_HEALTHCHECK_HOST, SAFE_MODE, ENABLE_LOG, LOG_FILE).
# Exits 0 after printing help for -h/--help. Exits 2, after printing usage,
# for an unknown argument, for an option whose value is missing or empty
# (previously `shift 2` failed in that case and the loop never terminated),
# or for a --require-gpus value that is not a non-negative integer.
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      --role | --require-gpus | --health-host | --log)
        if (( $# < 2 )) || [[ -z "$2" ]]; then
          bad "Option $1 requires a value"
          usage
          exit 2
        fi
        case "$1" in
          --role) ROLE="$2" ;;
          --require-gpus)
            if [[ ! "$2" =~ ^[0-9]+$ ]]; then
              bad "--require-gpus expects a non-negative integer, got '$2'"
              usage
              exit 2
            fi
            # Normalise to base 10 so "08" is safe in later arithmetic.
            REQUIRE_N_GPUS=$((10#$2))
            ;;
          --health-host) SWITCH_HEALTHCHECK_HOST="$2" ;;
          --log)
            ENABLE_LOG=1
            LOG_FILE="$2"
            ;;
        esac
        shift 2
        ;;
      --unsafe)
        SAFE_MODE=0
        shift
        ;;
      --no-log)
        ENABLE_LOG=0
        shift
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        warn "Unknown arg: $1"
        usage
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# ==============================================================================
# ENABLE LOGGING
# ==============================================================================
# setup_logging - when ENABLE_LOG is non-zero, create the directory of
# LOG_FILE and, from here on, duplicate stdout and stderr of the whole script
# into LOG_FILE (appending) through tee.
# If the directory or the file cannot be created, a warning is logged and
# ENABLE_LOG is reset to 0, so the run continues on the terminal only and the
# summary does not advertise a log file that was never written.
setup_logging() {
  (( ENABLE_LOG )) || return 0

  local log_dir
  log_dir="$(dirname -- "$LOG_FILE")"
  if ! mkdir -p -- "$log_dir" 2>/dev/null \
    || ! { true >>"$LOG_FILE"; } 2>/dev/null; then
    warn "Cannot write log file '$LOG_FILE'; continuing without a log"
    ENABLE_LOG=0
    return 0
  fi

  exec > >(tee -a "$LOG_FILE") 2>&1
  say "Logging to: $LOG_FILE"
}
setup_logging
# ==============================================================================
# DISPATCH
# ==============================================================================
# Run the check suite selected by ROLE; any other value is a usage error
# (exit status 2).
case "$ROLE" in
  compute) run_compute ;;
  switch) run_switch ;;
  *)
    bad "Invalid --role '$ROLE'"
    usage
    exit 2
    ;;
esac
# ==============================================================================
# SUMMARY
# ==============================================================================
# Print the effective settings and the pass/warn/fail counters, list every
# failed item, and exit 1 if anything failed, 0 otherwise.
section "Summary"
say "Role: $ROLE Safe Output: $([[ $SAFE_MODE -eq 1 ]] && echo ON || echo OFF) Timeout: ${TIMEOUT}s"
say "Kernel min: ${KERNEL_MIN_VERSION}-${KERNEL_MIN_BUILD}-${REQUIRE_KERNEL_FLAVOR} GPUs required: ${REQUIRE_N_GPUS}"
if (( ENABLE_LOG )); then
  say "Log file: $LOG_FILE"
fi
say "Results: ${PASS}: ${PASS_CNT} ${WARN}: ${WARN_CNT} ${FAIL}: ${FAIL_CNT}"
if (( FAIL_CNT > 0 )); then
  say "Failed items:"
  for it in "${FAILED_ITEMS[@]}"; do
    say " - $it"
  done
  exit 1
else
  ok "All checks passed"
  exit 0
fi
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Expected Output