Skip to content

Instantly share code, notes, and snippets.

@afiqiqmal
Last active October 27, 2025 03:12
Show Gist options
  • Select an option

  • Save afiqiqmal/274940cee7f37517ab217fc023514752 to your computer and use it in GitHub Desktop.

Select an option

Save afiqiqmal/274940cee7f37517ab217fc023514752 to your computer and use it in GitHub Desktop.
Monitor Supervisor Zombie Process
# Run the supervisord watchdog once every minute.
* * * * * /etc/supervisor/supervisord_watchdog.sh
# Preventive restart once at 00:00, 03:00 and 06:00. The minute field must be
# 0: with "*" the restart would fire every minute of those three hours.
0 0,3,6 * * * service supervisord restart >> /var/log/supervisord_monitor/monitor.log 2>&1
#!/bin/bash
#
# Watchdog for supervisord, meant to be run from cron once a minute.
#
# Checks are performed in order; the first failing check triggers a recovery
# action, is logged to LOG_FILE, and ends the run with exit status 1:
#   1. a process named "supervisord" exists            -> systemctl restart
#   2. supervisorctl answers `version` within TIMEOUT  -> systemctl restart
#   3. no supervisord process is in zombie (Z) state   -> systemctl restart
#   4. `supervisorctl status` lists a RUNNING program  -> supervisorctl restart all
# Exit status 0 means every check passed.

# No `set -e` / `pipefail`: every command whose failure matters is checked
# explicitly, and several probes are expected to return non-zero.
set -u

# Variables
readonly LOG_DIR="/var/log/supervisord_monitor"
readonly LOG_FILE="$LOG_DIR/monitor.log"
readonly SUPERVISORD_SERVICE="supervisord"
readonly TIMEOUT_CMD="/usr/bin/timeout"
readonly SUPERVISORCTL_CMD="/usr/local/bin/supervisorctl"
readonly TIMEOUT=5 # seconds allowed for read-only supervisorctl probes
# Upper bound for `restart all`. Kept below the one-minute cron interval so a
# hung supervisorctl cannot pile up across successive watchdog runs.
readonly RESTART_TIMEOUT=30 # seconds
readonly DATE_TAG_FILE="$LOG_DIR/.last_purge_date"

# Append one line to LOG_FILE, stamped with the time of the call (so lines
# written after a slow restart do not carry the script's start time).
log() {
  printf '%s - %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >> "$LOG_FILE"
}

# Truncate LOG_FILE on the first run of each calendar day. The date of the
# last purge is remembered in DATE_TAG_FILE.
rotate_log_if_new_day() {
  local today last_purge=""
  today=$(date +%Y-%m-%d)
  if [[ -f "$DATE_TAG_FILE" ]]; then
    last_purge=$(< "$DATE_TAG_FILE")
  fi
  if [[ "$today" != "$last_purge" ]]; then
    : > "$LOG_FILE"
    log "Rotating monitor log (new day detected: $today)"
    printf '%s\n' "$today" > "$DATE_TAG_FILE"
  fi
}

# Restart the supervisord service and log the real outcome instead of
# assuming the restart worked.
restart_supervisord() {
  local rc=0
  systemctl restart "$SUPERVISORD_SERVICE" || rc=$?
  if (( rc == 0 )); then
    log "supervisord restarted."
  else
    log "ERROR: systemctl restart $SUPERVISORD_SERVICE failed (exit status $rc)."
  fi
}

# Succeeds when supervisorctl answers `version` within TIMEOUT seconds.
supervisorctl_responds() {
  "$TIMEOUT_CMD" "$TIMEOUT" "$SUPERVISORCTL_CMD" version > /dev/null 2>&1
}

# Run all checks in order; exits the script with 1 on the first failure.
main() {
  local tool status_output

  # Ensure log directory exists
  if ! mkdir -p "$LOG_DIR"; then
    printf 'cannot create log directory %s\n' "$LOG_DIR" >&2
    exit 1
  fi

  # --- Daily log purge ---
  rotate_log_if_new_day

  # --- Check supervisord process ---
  if ! pgrep -x "$SUPERVISORD_SERVICE" > /dev/null; then
    log "ERROR: supervisord process not found, attempting restart..."
    restart_supervisord
    exit 1
  fi

  # --- Check that the probing tools exist ---
  # Without this guard a missing binary makes every probe below fail, which
  # would restart a perfectly healthy supervisord once a minute.
  for tool in "$TIMEOUT_CMD" "$SUPERVISORCTL_CMD"; do
    if [[ ! -x "$tool" ]]; then
      log "ERROR: required command $tool is missing or not executable; skipping checks."
      exit 1
    fi
  done

  # --- Check supervisord responsiveness ---
  if ! supervisorctl_responds; then
    log "ERROR: supervisord unresponsive (no version response), attempting restart..."
    restart_supervisord
    exit 1
  fi

  # --- Check for zombie process ---
  if ps -o stat= -C "$SUPERVISORD_SERVICE" | grep -q 'Z'; then
    log "ERROR: supervisord is a zombie, attempting restart..."
    restart_supervisord
    exit 1
  fi

  # --- Check if processes are RUNNING ---
  # Only the output is inspected: the exit status of `status` is deliberately
  # ignored because supervisorctl may return non-zero when some programs are
  # not running even though others are.
  status_output=$("$TIMEOUT_CMD" "$TIMEOUT" "$SUPERVISORCTL_CMD" status)
  if ! grep -q RUNNING <<< "$status_output"; then
    log "ERROR: supervisor config process not running under supervisord."
    if "$TIMEOUT_CMD" "$RESTART_TIMEOUT" "$SUPERVISORCTL_CMD" restart all; then
      log "supervisorctl process restarted."
    else
      log "ERROR: supervisorctl restart all failed or timed out (exit status $?)."
    fi
    exit 1
  fi

  # --- Success log ---
  log "OK: supervisord is running and responsive."
  exit 0
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment