Skip to content

Instantly share code, notes, and snippets.

@zahin-mohammad
Last active September 8, 2025 22:04
Show Gist options
  • Select an option

  • Save zahin-mohammad/750bd6ba83aee7dc49c56171535d5e70 to your computer and use it in GitHub Desktop.

Select an option

Save zahin-mohammad/750bd6ba83aee7dc49c56171535d5e70 to your computer and use it in GitHub Desktop.
Scan your file system to see if you have installed malware for the npm ecosystem. Run from the root of your file system. Requires `jq` to be installed.
#!/usr/bin/env bash
###############################################################################
# Malware npm dependency scanner using GitHub Advisory API
#
# REQUIREMENTS (runtime tools):
# - bash 3.2+
# - curl : HTTP requests to GitHub
# - jq : JSON parsing
# - npm : Build dependency tree (npm ls --json --all)
# - Standard POSIX utilities: find, xargs, awk, sort, paste, stat (or perl fallback)
#
# OPTIONAL ENV VARS:
# ROOT_DIR Directory to scan (also arg1) (default: .)
# JOBS Parallel workers (default: 100)
# CACHE_FILE Advisory cache file (default: /tmp/github_advisories_malware.json)
# GHA_CACHE_TTL Cache TTL seconds (default: 3600)
# OUTPUT_JSON If set, path to write a JSON findings array to (default: unset)
# EXCLUDE_PATHS Comma-separated substrings to skip paths (default: '/.config/yarn/global,/Library')
# FIND_SUPPRESS_ERRORS Suppress find permission errors (default: 1)
# GITHUB_TOKEN / GH_TOKEN Auth token to raise GitHub API rate limits
#
# EXIT CODES:
# 0 Success (even if no findings)
# 1 Failed to fetch advisories
# 2 Missing required tooling
#
# FEATURES:
# - Caching + token auth to minimize rate limits
# - Parallel scanning of package.json files (node_modules excluded)
# - Approximate version range detection (presence-based)
# - Portable mtime detection across Linux/macOS
# - Optional machine-readable JSON output
#
# FUTURE ENHANCEMENTS (not implemented yet):
# - Pagination if >100 malware advisories
# - Config file for advanced allow/deny filtering
# - Direct parsing of yarn/pnpm lockfiles to avoid npm ls cost
###############################################################################
# Strict mode: abort on command failure, unset variables and failed pipeline stages.
set -euo pipefail
# Split unquoted expansions on newline/tab only, so paths with spaces stay intact.
IFS=$'\n\t'
# Scan root: the first positional argument wins, then the ROOT_DIR environment
# variable (documented in the header, previously ignored), then the current dir.
ROOT_DIR=${1:-${ROOT_DIR:-.}}
# Number of package.json files scanned concurrently (xargs -P).
JOBS=${JOBS:-100}
# Where the downloaded advisory list is cached, and for how many seconds it stays fresh.
CACHE_FILE=${CACHE_FILE:-/tmp/github_advisories_malware.json}
GHA_CACHE_TTL=${GHA_CACHE_TTL:-3600}
OUTPUT_JSON=${OUTPUT_JSON:-} # set to path to also emit JSON summary
EXCLUDE_PATHS=${EXCLUDE_PATHS:-"/.config/yarn/global,/Library"} # comma-separated substrings to skip
# 1 = discard find's stderr (permission errors etc.); anything else shows it.
FIND_SUPPRESS_ERRORS=${FIND_SUPPRESS_ERRORS:-1}
# Refuse to start (exit code 2) as soon as one of the required external tools is missing.
for required_tool in curl jq npm; do
  if ! command -v "$required_tool" >/dev/null 2>&1; then
    echo "$required_tool required" >&2
    exit 2
  fi
done
unset required_tool
# Print the HTTP Authorization header for the GitHub API (no trailing newline).
# GITHUB_TOKEN takes precedence over GH_TOKEN; prints nothing when neither is
# set to a non-empty value. Always returns 0.
auth_header() {
  local token=${GITHUB_TOKEN:-${GH_TOKEN:-}}
  if [[ -n "$token" ]]; then
    printf 'Authorization: Bearer %s' "$token"
  fi
}
# Download malware advisories from the GitHub global advisory API and print the
# raw JSON response on stdout.
# Only a single page of up to 100 advisories is requested (no pagination yet),
# so the query is restricted to the npm ecosystem server-side: otherwise
# advisories for other ecosystems would use up part of that page and be thrown
# away by the npm filter applied later.
# Returns curl's exit status (non-zero on network errors or HTTP >= 400 due to -f).
fetch_advisories() {
  local url="https://api.github.com/advisories?type=malware&ecosystem=npm&per_page=100"
  local headers=(-H 'Accept: application/vnd.github+json')
  local ah
  # auth_header prints nothing when no token is configured; never fail on it.
  ah=$(auth_header || true)
  if [ -n "$ah" ]; then headers+=(-H "$ah"); fi
  curl -sSf "${headers[@]}" "$url"
}
# -----------------------------
# Minimal SemVer helpers (no Node dependency)
# Supports operators: =, <, <=, >, >= and logical AND (space or comma separated)
# and OR groups separated by '||'. Pre-release/build metadata ignored.
# Example ranges handled:
# "= 4.4.2"
# ">=1.0.0 <2.0.0"
# "< 1.2.3, >=1.0.0"
# ">=1.0.0 || =0.9.5"
# Any unrecognized token makes its whole group fail (best-effort, safe default = NOT match).
# -----------------------------
# Compare the numeric major.minor.patch parts of two versions.
# Arguments: $1 - first version, $2 - second version
# Outputs:   -1, 0 or 1 on stdout ($1 lower than, equal to, greater than $2)
# A leading "v" and any pre-release/build suffix ("-beta.1", "+build.7") are
# dropped before comparing; previously the suffix reached the integer test,
# which errored out and made the affected component compare as equal.
# Missing components count as 0; a component that is still not a plain number
# is skipped (treated as equal).
semver_cmp() { # v1 v2 -> echo -1 / 0 / 1
  local a=${1#v} b=${2#v}
  a=${a%%[-+]*}
  b=${b%%[-+]*}
  local IFS=.
  local -a A B
  read -r -a A <<<"$a"
  read -r -a B <<<"$b"
  local i ai bi
  for i in 0 1 2; do
    ai=${A[$i]:-0}
    bi=${B[$i]:-0}
    [[ $ai =~ ^[0-9]+$ && $bi =~ ^[0-9]+$ ]] || continue
    # Force base 10 so leading zeros are not read as octal.
    ai=$((10#$ai))
    bi=$((10#$bi))
    if [ "$ai" -lt "$bi" ]; then echo -1; return 0; fi
    if [ "$ai" -gt "$bi" ]; then echo 1; return 0; fi
  done
  echo 0
}
# Test a single comparator: does version $1 satisfy "<op> <target>"?
# Arguments: $1 - version, $2 - operator (=, ==, >=, >, <=, <), $3 - target version
# Returns:   0 when the relation holds, non-zero otherwise (1 for an unknown operator)
semver_check() { # version op target -> returns 0 if satisfies
  local version=$1 operator=$2 target=$3
  local ordering
  ordering=$(semver_cmp "$version" "$target")
  # Translate the range operator into the matching integer test against 0.
  local test_flag
  case "$operator" in
    '=' | '==') test_flag=-eq ;;
    '>=') test_flag=-ge ;;
    '>') test_flag=-gt ;;
    '<=') test_flag=-le ;;
    '<') test_flag=-lt ;;
    *) return 1 ;;
  esac
  [ "$ordering" "$test_flag" 0 ]
}
# Decide whether a version satisfies a range expression.
# Arguments: $1 - installed version, $2 - range string
# Returns:   0 when the version matches, 1 otherwise
# Range grammar: OR groups separated by '||'; inside a group, comparators are
# separated by spaces and/or commas and must ALL hold. A comparator is
# "<op><version>", "<op> <version>" or a bare version (meaning '=').
# An empty range and '*' match everything. Any token that cannot be parsed
# makes its group fail. Pre-release/build suffixes are dropped from both the
# installed version and the comparator versions before comparing.
#
# Tokens are split on whitespace explicitly. The previous implementation kept
# IFS='|' active while tokenising, so every range containing a space
# (e.g. ">= 0" or ">= 1.0.0, < 2.0.0") collapsed into one unparsable token and
# never matched.
semver_satisfies() { # version range_string
  local ver=$1 range=$2
  ver=${ver#v}
  ver=${ver%%[-+]*}
  # Trim surrounding whitespace without spawning sed.
  range="${range#"${range%%[![:space:]]*}"}"
  range="${range%"${range##*[![:space:]]}"}"
  [ -z "$range" ] && return 0
  # Fast path: wildcard '*'
  [ "$range" = "*" ] && return 0

  # Patterns live in variables so the regexes behave the same on every bash.
  local version_re='([0-9]+(\.[0-9]+){0,2})([-+][0-9A-Za-z.+-]*)?'
  local op_re='([<>]=?|==?)'
  local re_op_ver="^${op_re}${version_re}\$"
  local re_op="^${op_re}\$"
  local re_ver="^${version_re}\$"

  local -a groups=() tokens=()
  local group tok op tgt i ok
  # Collapse '||' to '|' and split into OR groups (IFS applies to this read only).
  IFS='|' read -r -a groups <<<"${range//||/|}"
  for group in ${groups[@]+"${groups[@]}"}; do
    # Commas act like spaces; read trims and collapses the whitespace.
    tokens=()
    IFS=$' \t' read -r -a tokens <<<"${group//,/ }"
    [ "${#tokens[@]}" -gt 0 ] || continue
    ok=1
    i=0
    while [ "$i" -lt "${#tokens[@]}" ]; do
      tok=${tokens[$i]}
      if [[ $tok =~ $re_op_ver ]]; then
        op=${BASH_REMATCH[1]}
        tgt=${BASH_REMATCH[2]}
      elif [[ $tok =~ $re_op ]]; then
        # Bare operator: the version must be the next token.
        i=$((i + 1))
        if [ "$i" -ge "${#tokens[@]}" ] || ! [[ ${tokens[$i]} =~ $re_ver ]]; then
          ok=0
          break
        fi
        op=$tok
        tgt=${BASH_REMATCH[1]}
      elif [[ $tok =~ $re_ver ]]; then
        op='='
        tgt=${BASH_REMATCH[1]}
      else
        ok=0
        break
      fi
      if ! semver_check "$ver" "$op" "$tgt"; then
        ok=0
        break
      fi
      i=$((i + 1))
    done
    if [ "$ok" -eq 1 ]; then return 0; fi
  done
  return 1
}
# Print the modification time of file $1 as seconds since the epoch.
# Tries GNU stat first, then BSD stat, then perl; prints 0 if perl fails too.
file_mtime() {
  local target="$1"
  local stamp
  # GNU coreutils syntax.
  if stamp=$(stat -c %Y "$target" 2>/dev/null); then
    printf '%s\n' "$stamp"
    return 0
  fi
  # BSD/macOS syntax.
  if stamp=$(stat -f %m "$target" 2>/dev/null); then
    printf '%s\n' "$stamp"
    return 0
  fi
  perl -e 'print((stat shift)[9])' "$target" 2>/dev/null || echo 0
}
# Refresh the advisory cache when it is missing or older than GHA_CACHE_TTL,
# then load it into ADVISORIES.
need_refresh=true
if [ -f "$CACHE_FILE" ]; then
  now=$(date +%s)
  mod=$(file_mtime "$CACHE_FILE")
  # An empty or non-numeric mtime would break the arithmetic below; treat it
  # as "very old" so the cache is refreshed.
  case "$mod" in
    '' | *[!0-9]*) mod=0 ;;
  esac
  age=$((now - mod))
  if [ "$age" -lt "$GHA_CACHE_TTL" ]; then
    need_refresh=false
  fi
fi
if $need_refresh; then
  echo "Fetching latest advisories from GitHub..." >&2
  # Download to a side file so a failed fetch never clobbers a usable cache.
  # curl's own diagnostics (-sS) are left visible so the cause of a failure
  # (rate limit, DNS, ...) is not hidden.
  if ! fetch_advisories >"$CACHE_FILE".tmp; then
    rm -f "$CACHE_FILE".tmp
    echo "Failed to fetch advisories" >&2
    exit 1
  fi
  # Do not cache a response that is not the expected JSON array.
  if ! jq -e 'type == "array"' "$CACHE_FILE".tmp >/dev/null 2>&1; then
    rm -f "$CACHE_FILE".tmp
    echo "Failed to fetch advisories (unexpected response format)" >&2
    exit 1
  fi
  mv "$CACHE_FILE".tmp "$CACHE_FILE"
else
  echo "Using cached advisories ($CACHE_FILE)" >&2
fi
ADVISORIES=$(cat "$CACHE_FILE")
# Reduce the raw advisory list to one record per affected npm package, carrying
# the parent advisory's summary, URL and severity along with each record.
MALWARE_JSON=$(jq '
  [ .[]
    | select(.type == "malware")
    | . as $adv
    | .vulnerabilities[]?
    | select(.package.ecosystem == "npm")
    | { package: .package.name,
        range: .vulnerable_version_range,
        advisory: ($adv.summary // "(no summary)"),
        url: ($adv.html_url // ""),
        severity: (.severity // $adv.severity // "unknown") }
  ] | unique' <<<"$ADVISORIES")
COUNT=$(jq 'length' <<<"$MALWARE_JSON")
# Nothing to look for: finish successfully without scanning.
if [ "$COUNT" -eq 0 ]; then
  echo "No npm malware advisories found." >&2
  exit 0
fi
ADVISORY_COUNT=$COUNT
# The heading goes to stderr, the advisory list itself to stdout.
echo "Malware advisories (npm only):" >&2
jq -r '.[] | "- [" + .advisory + "](" + .url + ") -> " + .package + " (" + .range + ") (severity: " + .severity + ")"' <<<"$MALWARE_JSON"
printf 'Scanning for package.json files under %s (parallel: %s)...\n' "$ROOT_DIR" "$JOBS" >&2
#######################################
# Scan one project for installed dependencies that match an npm malware advisory.
# Runs inside worker shells spawned by xargs, so every function and variable it
# uses must be exported by the parent shell.
# Globals:
#   EXCLUDE_PATHS   (read) comma-separated path substrings; matching dirs are skipped
#   MALWARE_JSON    (read) JSON array of {package, range, advisory, url, severity}
#   FINDINGS_MARKER (read) file that receives one line per finding
#   OUTPUT_JSON     (read) when non-empty, findings are appended to "$OUTPUT_JSON.tmp"
# Arguments:
#   $1 - path to a package.json file
# Outputs:
#   One warning line per finding on stdout.
# Returns:
#   Always 0; a project that cannot be inspected is skipped silently.
#######################################
scan_pkg() {
  local pkgjson="$1"
  local pkgdir
  pkgdir=$(dirname "$pkgjson")

  # Skip some noisy global or extension dirs (adjust via EXCLUDE_PATHS).
  local -a skip_patterns=()
  local pattern
  IFS=',' read -r -a skip_patterns <<<"${EXCLUDE_PATHS:-}"
  for pattern in ${skip_patterns[@]+"${skip_patterns[@]}"}; do
    [ -n "$pattern" ] || continue
    case "$pkgdir" in *"$pattern"*) return 0 ;; esac
  done

  # npm ls exits non-zero whenever the tree has problems (extraneous, missing
  # or invalid packages) yet still prints the JSON tree. Use whatever it
  # printed instead of skipping the project on a non-zero status.
  local npm_json
  npm_json=$(npm ls --all --json --prefix "$pkgdir" 2>/dev/null) || true
  [ -n "$npm_json" ] || return 0

  # Flatten the dependency tree into unique "name|version" lines; entries
  # without a version (e.g. missing packages) are dropped and not descended into.
  local deps
  deps=$(printf '%s\n' "$npm_json" | jq -r '
    def walkdeps: to_entries[]?
      | select(.value.version != null)
      | "\(.key)|\(.value.version)" , ( .value.dependencies? // {} | walkdeps );
    (.dependencies? // {}) | walkdeps
  ' 2>/dev/null | sort -u) || return 0
  [ -n "$deps" ] || return 0

  # Advisory fields are joined with the ASCII unit separator: unlike '|' it
  # cannot occur inside a range such as "<1.0.0 || >=2.0.0". It is not an IFS
  # whitespace character, so empty fields are preserved by read.
  local sep=$'\037'
  local mname mrange murl msum found_ver
  while IFS="$sep" read -r mname mrange murl msum; do
    [ -n "$mname" ] || continue
    # A package can be installed at several versions in one tree; check each.
    while IFS= read -r found_ver; do
      [ -n "$found_ver" ] || continue
      # An empty range (including a null range in the advisory) matches any version.
      if [ -n "$mrange" ] && ! semver_satisfies "$found_ver" "$mrange" 2>/dev/null; then
        continue
      fi
      # Record the finding for the parent's summary (append is parallel-safe enough).
      if [ -n "${FINDINGS_MARKER:-}" ]; then
        echo 1 >>"$FINDINGS_MARKER" 2>/dev/null || true
      fi
      printf '⚠️ %s@%s in %s (advisory: %s | range: %s)\n' "$mname" "$found_ver" "$pkgdir" "$msum" "$mrange"
      if [ -n "${OUTPUT_JSON:-}" ]; then
        # Let jq do the escaping so quotes/backslashes in any field stay valid JSON.
        jq -cn --arg n "$mname" --arg v "$found_ver" --arg r "$mrange" \
          --arg a "$msum" --arg u "$murl" --arg p "$pkgdir" \
          '{package: $n, foundVersion: $v, range: $r, advisory: $a, url: $u, path: $p}' \
          >>"$OUTPUT_JSON".tmp
      fi
    done < <(printf '%s\n' "$deps" | awk -F'|' -v n="$mname" '$1 == n { print $2 }')
  done < <(printf '%s\n' "$MALWARE_JSON" \
    | jq -r '.[] | [(.package // ""), (.range // ""), (.url // ""), (.advisory // "")] | join("\u001f")')
  return 0
}
# scan_pkg runs in fresh "bash -c" worker shells started by xargs. Those shells
# only see exported functions and variables, so the semver helpers and
# EXCLUDE_PATHS must be exported too. Without them semver_satisfies is
# "command not found" in the worker (which drops every finding that has a
# version range) and the default path exclusions are never applied.
export -f scan_pkg semver_cmp semver_check semver_satisfies
export MALWARE_JSON OUTPUT_JSON EXCLUDE_PATHS
# Marker file to detect if any findings were produced (parallel processes append)
FINDINGS_MARKER=${FINDINGS_MARKER:-$(mktemp -t malware_findings_XXXXXX)}
export FINDINGS_MARKER
# Optionally initialize JSON output
if [ -n "$OUTPUT_JSON" ]; then
  : > "$OUTPUT_JSON".tmp
fi
# Find package.json files (excluding node_modules) and scan them in parallel.
# node_modules directories are pruned so find does not walk them at all; the
# -path test is kept so the set of reported files is unchanged.
find_args=(
  "$ROOT_DIR"
  -name node_modules -prune -o
  -type f -name package.json -not -path '*/node_modules/*' -print0
)
# Worker command: the guard stops GNU xargs, which runs the command once even
# on empty input, from scanning a bogus path when nothing was found.
worker='if [ "$#" -gt 0 ]; then scan_pkg "$1"; fi'
# find exits non-zero on any unreadable directory, which is routine when
# scanning from the filesystem root. Under "set -e -o pipefail" that used to
# abort the script before the report and summary; capture the status instead.
scan_status=0
if [ "$FIND_SUPPRESS_ERRORS" -eq 1 ]; then
  find "${find_args[@]}" 2>/dev/null \
    | xargs -0 -n1 -P "$JOBS" bash -c "$worker" _ || scan_status=$?
else
  find "${find_args[@]}" \
    | xargs -0 -n1 -P "$JOBS" bash -c "$worker" _ || scan_status=$?
  if [ "$scan_status" -ne 0 ]; then
    echo "Warning: find/xargs exited with status $scan_status; results may be incomplete." >&2
  fi
fi
if [ -n "$OUTPUT_JSON" ]; then
  # Wrap newline-delimited JSON objects into array
  { echo '['; paste -sd',' "$OUTPUT_JSON".tmp; echo ']'; } > "$OUTPUT_JSON"
  rm -f "$OUTPUT_JSON".tmp
  echo "JSON report written to $OUTPUT_JSON" >&2
fi
# Summarize results
if [ ! -s "$FINDINGS_MARKER" ]; then
  echo "No impacted packages detected (0 matched out of $ADVISORY_COUNT malware advisories)." >&2
else
  # The marker holds one line per finding (package occurrence), not per
  # distinct package name; some wc implementations pad the count with spaces.
  MATCHED=$(wc -l < "$FINDINGS_MARKER" 2>/dev/null || echo 0)
  MATCHED=${MATCHED//[[:space:]]/}
  echo "Matched $MATCHED finding(s) against $ADVISORY_COUNT malware advisory entries." >&2
fi
rm -f "$FINDINGS_MARKER" 2>/dev/null || true
echo "Scan complete." >&2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment