Created
September 12, 2025 08:17
-
-
Save ddk50/8a1420f0ef7c309c47b7450219291756 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# smart_audit.sh - check SMART data on every disk and give an overall verdict (OK/WARN/ALERT)
# deps: smartmontools, (optional) mailutils
export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
set -euo pipefail
# Exported (not just assigned) so that child processes such as sort and awk
# also run in the C locale, regardless of what the caller's environment had.
export LC_ALL=C

# =========================
# Config
# =========================
MAIL_TO="${MAIL_TO:-}"  # recipient taken from the environment; empty -> stdout only
AGE_HOURS=26280         # 3 years = 26280 h; at or above this -> WARN
TEMP_WARN=45            # degrees C, warning
TEMP_CRIT=50            # degrees C, critical
CRC_WARN=1              # UDMA_CRC_Error_Count >= 1 -> WARN

# A failing `hostname -s` must not abort the audit under `set -e`; fall back
# to the node name, then to a fixed placeholder.
HOST="$(hostname -s 2>/dev/null || uname -n 2>/dev/null || printf 'unknown')"
NOW="$(date '+%F %T %Z')"

ALERT=0
WARN=0

OUT="$(mktemp)"
# Remove the report file on every exit path, including early `set -e` exits.
trap 'rm -f -- "$OUT"' EXIT
| # ========================= | |
| # Helpers | |
| # ========================= | |
#######################################
# Convert a power-on-hours count to the compact form "XyYdZh".
# A year is taken as 8760 h (365 days).
# Arguments:
#   $1 - hours; optional, defaults to 0. Thousands separators (",") are
#        dropped and anything after the leading digits is ignored.
# Outputs:
#   The formatted string on stdout (no trailing newline).
#######################################
ydh() {
  local raw="${1:-0}" total years days hours
  raw="${raw//,/}"          # "1,234" -> "1234"
  raw="${raw%%[!0-9]*}"     # keep the leading digits only
  # 10# forces base 10 so zero-padded input is not read as octal.
  total=$(( 10#${raw:-0} ))
  years=$(( total / 8760 ))
  days=$(( (total % 8760) / 24 ))
  hours=$(( total % 24 ))
  printf '%dy%dd%dh' "$years" "$days" "$hours"
}
#######################################
# Find the first by-id symlink that points at the given block device.
# The match is on the final path component of the link target, so a link
# to "../../sda" matches "/dev/sda".
# Arguments:
#   $1 - device path or name (e.g. /dev/sda)
#   $2 - directory holding the symlinks; optional, default /dev/disk/by-id
# Outputs:
#   The full path of the first matching symlink (glob order), or nothing.
# Returns:
#   Always 0, so callers running under `set -e` are not aborted when no
#   link (or no directory) exists.
#######################################
byid_for() {
  local want="${1##*/}" dir="${2:-/dev/disk/by-id}" link target
  for link in "$dir"/*; do
    # Also skips the unexpanded pattern when the directory is empty or missing.
    [[ -L "$link" ]] || continue
    target="$(readlink -- "$link")" || continue
    if [[ "${target##*/}" == "$want" ]]; then
      printf '%s\n' "$link"
      return 0
    fi
  done
  return 0
}
#######################################
# Print the report title, a blank line, the column headings and a rule.
# Globals:
#   HOST, NOW (read)
# Outputs:
#   Four lines on stdout.
#######################################
print_header() {
  local -a headings=(
    "Stat" "Type" "MODEL" "SERIAL" "DEVICE" "TBW" "HOURS" "(~y d h)"
    "TEMP" "ReAl" "Pend" "Unco" "CRC" "Wear%" "Notes"
  )
  printf '%s\n\n' "SMART Audit Report - ${HOST} (${NOW})"
  # Same column widths as the data rows printed by the assessors.
  printf "%-5s %-6s %-24s %-18s %-22s %8s %8s %-11s %6s %6s %6s %6s %6s %6s %s\n" \
    "${headings[@]}"
  printf '%s\n' "---------------------------------------------------------------------------------------------------------------------------------------------------------------"
}
#######################################
# Print the overall verdict followed by the policy notes.
# Globals:
#   ALERT, WARN (read) - non-zero when any device raised that level
#   TEMP_WARN, TEMP_CRIT (read) - echoed in the policy text
# Outputs:
#   Summary and policy sections on stdout.
#######################################
print_summary() {
  printf '\n%s\n' "Summary:"
  if (( ALERT )); then
    echo "- ALERT: あり(至急対応推奨)"
  fi
  if (( WARN )); then
    echo "- WARN : あり(計画的に対処)"
  fi
  if (( ALERT == 0 && WARN == 0 )); then
    echo "- すべてOK"
  fi
  printf '\n%s\n' "Policy:"
  echo "- 3年(26280h)超えたHDDは予防交換候補。"
  echo "- Reallocated/Pending/Uncorrectable > 0 は即バックアップ&交換検討。"
  echo "- 温度Warn≥${TEMP_WARN}℃ / Crit≥${TEMP_CRIT}℃。CRC>0はケーブル/ポート疑い。"
}
| # ========================= | |
| # Assessors | |
| # ========================= | |
# Print the leading digits of the RAW_VALUE column (field 10) of the first
# attribute row whose name matches the regex $1 in the `smartctl -A` text $2.
# Prints 0 when the attribute is absent or its raw value has no leading digits.
_ata_raw() {
  awk -v pat="$1" '
    $1 ~ /^[0-9]+$/ && $2 ~ pat {
      v = $10
      sub(/[^0-9].*$/, "", v)   # "12345h+30m+01.000s" -> "12345"
      sub(/^0+/, "", v)         # no leading zeros (octal in shell arithmetic)
      if (v == "") v = "0"
      print v
      found = 1
      exit
    }
    END { if (!found) print 0 }
  ' <<<"$2"
}

#######################################
# Assess one ATA/SATA device via smartctl and print a single report row.
# Globals:
#   TEMP_WARN, TEMP_CRIT, CRC_WARN, AGE_HOURS (read) - thresholds
#   ALERT, WARN (written) - set to 1 when this device raises that level
# Arguments:
#   $1 - device path (a /dev/disk/by-id/ path is preferred)
# Outputs:
#   One formatted table row on stdout.
#######################################
assess_ata() {
  local dev="$1" info data model serial poh temp realloc pend unco crc notes status

  # smartctl failures are tolerated on purpose: whatever was printed is parsed
  # and missing values fall back to defaults below.
  info="$(smartctl -i "$dev" 2>/dev/null || true)"
  data="$(smartctl -A "$dev" 2>/dev/null || true)"

  # Prefer the explicit model line; otherwise fall back to the last line that
  # mentions "Model" (e.g. "Model Family").
  model="$(awk -F': *' '
    /^(Device Model|Model Number):/ { best = $2 }
    /Model/                         { any = $2 }
    END { print (best != "" ? best : any) }
  ' <<<"$info")"
  serial="$(awk -F': *' '/Serial Number/ { print $2; exit }' <<<"$info")"
  # Placeholders keep the whitespace-delimited columns aligned for sorting.
  : "${model:=-}" "${serial:=-}"

  poh="$(_ata_raw 'Power_On_Hours' "$data")"
  realloc="$(_ata_raw 'Reallocated_Sector_Ct' "$data")"
  pend="$(_ata_raw 'Current_Pending_Sector' "$data")"
  unco="$(_ata_raw 'Offline_Uncorrectable' "$data")"
  crc="$(_ata_raw 'UDMA_CRC_Error_Count' "$data")"

  # Temperature: attribute 194 wins; any other *Temperature* attribute is
  # only used when 194 is absent. "-" means no numeric reading was found.
  temp="$(awk '
    $1 ~ /^[0-9]+$/ && $2 ~ /Temperature/ {
      v = $10
      sub(/[^0-9].*$/, "", v)
      if (v == "") next
      if ($1 == 194) { t = v; primary = 1 }
      else if (!primary && t == "") t = v
    }
    END { if (t == "") print "-"; else print t + 0 }
  ' <<<"$data")"

  notes=""
  status="OK"

  # Temperature
  if [[ "$temp" != "-" ]]; then
    if (( temp >= TEMP_CRIT )); then
      notes+="[TEMP-CRIT]"; status="ALERT"; ALERT=1
    elif (( temp >= TEMP_WARN )); then
      notes+="[TEMP-WARN]"
      [[ "$status" == "OK" ]] && status="WARN"
      WARN=1
    fi
  fi

  # Sector defects
  if (( pend > 0 || unco > 0 )); then
    (( pend > 0 )) && notes+="[PENDING]"
    (( unco > 0 )) && notes+="[UNCO]"
    status="ALERT"; ALERT=1
  fi
  # The checks below only raise WARN when the device is still OK so far.
  if (( realloc > 0 )); then
    notes+="[REALLOC]"
    if [[ "$status" == "OK" ]]; then status="WARN"; WARN=1; fi
  fi
  if (( crc >= CRC_WARN )); then
    notes+="[CRC]"
    if [[ "$status" == "OK" ]]; then status="WARN"; WARN=1; fi
  fi
  if (( poh >= AGE_HOURS )); then
    notes+="[AGE≥3y]"
    if [[ "$status" == "OK" ]]; then status="WARN"; WARN=1; fi
  fi
  [[ -n "$notes" ]] || notes="-"

  printf "%-5s %-6s %-24s %-18s %-22s %8s %8d %-11s %6s %6d %6d %6d %6d %6s %s\n" \
    "$status" "HDD" "${model:0:24}" "${serial:0:18}" "$(basename "$dev")" \
    "-" "$poh" "$(ydh "$poh")" "$temp" "$realloc" "$pend" "$unco" "$crc" "-" "$notes"
}
# Print the digits of the value after the first ":" on the first line of the
# text $2 that matches the regex $1. A trailing "[...]" annotation and all
# separators/units are dropped. Prints 0 when nothing usable is found.
_nvme_num() {
  awk -F: -v pat="$1" '
    $0 ~ pat {
      v = $2
      sub(/\[.*/, "", v)        # "12,345 [6.32 TB]" -> "12,345 "
      gsub(/[^0-9]/, "", v)
      sub(/^0+/, "", v)         # no leading zeros (octal in shell arithmetic)
      if (v == "") v = "0"
      print v
      found = 1
      exit
    }
    END { if (!found) print 0 }
  ' <<<"$2"
}

#######################################
# Assess one NVMe device via smartctl and print a single report row.
# Globals:
#   NVME_TEMP_WARN, NVME_TEMP_CRIT (read, optional) - degrees C thresholds,
#     defaulting to 70 and 80
#   ALERT, WARN (written) - set to 1 when this device raises that level
# Arguments:
#   $1 - device path, either a controller (/dev/nvme0) or a namespace
#        (/dev/nvme0n1)
# Outputs:
#   One formatted table row on stdout.
#######################################
assess_nvme() {
  local dev="$1" info data model serial temp_c poh pct_used media_err
  local duw_units tbw notes status wear_str name
  local temp_warn="${NVME_TEMP_WARN:-70}" temp_crit="${NVME_TEMP_CRIT:-80}"

  # smartctl failures are tolerated on purpose: whatever was printed is parsed
  # and missing values fall back to defaults.
  info="$(smartctl -i -H "$dev" 2>/dev/null || true)"
  data="$(smartctl -A "$dev" 2>/dev/null || true)"

  model="$(awk -F': *' '/^Model Number:/ { print $2; exit }' <<<"$info")"
  serial="$(awk -F': *' '/^Serial Number:/ { print $2; exit }' <<<"$info")"
  # Placeholders keep the whitespace-delimited columns aligned for sorting.
  : "${model:=-}" "${serial:=-}"

  temp_c="$(_nvme_num '^[[:space:]]*Temperature:' "$data")"
  poh="$(_nvme_num 'Power On Hours' "$data")"
  pct_used="$(_nvme_num 'Percentage Used' "$data")"
  media_err="$(_nvme_num 'Media and Data Integrity Errors' "$data")"
  duw_units="$(_nvme_num 'Data Units Written' "$data")"

  # Data Units Written -> decimal TB (1 unit = 512,000 bytes = 0.000000512 TB)
  tbw="$(awk -v n="${duw_units}" 'BEGIN { printf("%.1fTB", n * 0.000000512) }')"

  notes=""
  status="OK"
  # Overall health as requested with -H; only an explicit FAILED is flagged.
  if [[ "$info" == *"self-assessment test result: FAILED"* ]]; then
    status="ALERT"; ALERT=1; notes+="[HEALTH-FAIL]"
  fi
  if (( media_err > 0 )); then
    status="ALERT"; ALERT=1; notes+="[MEDIA-ERR]"
  fi
  if (( temp_c >= temp_crit )); then
    status="ALERT"; ALERT=1; notes+="[TEMP-CRIT]"
  elif (( temp_c >= temp_warn )); then
    [[ "$status" == "OK" ]] && status="WARN"
    WARN=1; notes+="[TEMP-WARN]"
  fi
  if (( pct_used >= 90 )); then
    [[ "$status" == "OK" ]] && status="WARN"
    WARN=1; notes+="[WEAR≥90%]"
  fi
  [[ -n "$notes" ]] || notes="-"
  wear_str="${pct_used}%"

  # Show a namespace name; only append "n1" when given a bare controller.
  name="${dev##*/}"
  [[ "$name" =~ ^nvme[0-9]+n[0-9]+$ ]] || name+="n1"

  printf "%-5s %-6s %-24s %-18s %-22s %8s %8d %-11s %6s %6s %6s %6s %6s %6s %s\n" \
    "$status" "NVMe" "${model:0:24}" "${serial:0:18}" "$name" \
    "${tbw}" "$poh" "$(ydh "$poh")" "$temp_c" "-" "-" "-" "-" "$wear_str" "$notes"
}
| # ========================= | |
| # Discovery | |
| # ========================= | |
#######################################
# Collect the whole-disk block devices reported by lsblk.
# Rows whose TYPE is not "disk", and names starting with one of the
# excluded prefixes (zd, loop, ram, sr, dm-, md, nbd, zram), are dropped.
# Globals:
#   DISKS (written) - array of /dev/<name> paths
#######################################
discover_disks() {
  mapfile -t DISKS < <(
    lsblk -ndo NAME,TYPE \
      | awk '$2 == "disk" && $1 !~ /^(zd|loop|ram|sr|dm-|md|nbd|zram)/ { print "/dev/" $1 }'
  )
}
| # ========================= | |
| # Main | |
| # ========================= | |
#######################################
# Build the report (header, one sorted row per disk, summary), then mail
# it or print it.
# Globals:
#   OUT (read) - report file; appended to, then removed
#   HOST, MAIL_TO (read)
#   DISKS (read) - filled by discover_disks
#   ALERT, WARN (read) - raised by the assessors
# Outputs:
#   The report on stdout, unless MAIL_TO is set and `mail` is available.
# Returns:
#   2 if any ALERT was raised, 1 if any WARN was raised, 0 otherwise.
#######################################
main() {
  local rows dev disp subj

  rows="$(mktemp)"
  print_header >> "$OUT"
  discover_disks

  # The assessors must run in the current shell: as a pipeline stage they
  # would run in a subshell and their ALERT/WARN updates would be lost.
  # Rows therefore go to a scratch file first and are sorted afterwards.
  # The ${DISKS[@]+...} form keeps an empty list safe under `set -u`.
  for dev in ${DISKS[@]+"${DISKS[@]}"}; do
    if [[ "$dev" =~ ^/dev/nvme ]]; then
      assess_nvme "$dev"
    else
      # A lookup failure must not abort the audit; fall back to the raw path.
      disp="$(byid_for "$dev" || true)"
      [[ -n "$disp" ]] || disp="$dev"
      assess_ata "$disp"
    fi
  done > "$rows"
  sort -k1,1 -k3,3 "$rows" >> "$OUT"
  rm -f -- "$rows"

  print_summary >> "$OUT"

  subj="${HOST} SMART audit"
  if (( ALERT )); then
    subj="[ALERT] ${subj}"
  elif (( WARN )); then
    subj="[WARN] ${subj}"
  fi

  if [[ -n "${MAIL_TO}" ]] && command -v mail >/dev/null 2>&1; then
    mail -s "$subj" "$MAIL_TO" < "$OUT"
  else
    cat "$OUT"
  fi
  rm -f -- "$OUT"

  (( ALERT )) && return 2
  (( WARN )) && return 1
  return 0
}
# Entry point: run main, forwarding the script's command-line arguments unchanged.
| main "$@" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment