Skip to content

Instantly share code, notes, and snippets.

@askb
Created June 26, 2025 04:51
Show Gist options
  • Select an option

  • Save askb/53e8c334a532fcfb7ad850eb2740558d to your computer and use it in GitHub Desktop.

Select an option

Save askb/53e8c334a532fcfb7ad850eb2740558d to your computer and use it in GitHub Desktop.
Gerrit 502 script
#!/bin/bash
# Gerrit 502 Error Diagnostic Script
# Usage: sudo ./gerrit-diagnostic.sh
# This script performs read-only checks to identify the root cause of 502 errors
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
GERRIT_HOME="/opt/gerrit"
GERRIT_SERVICE="gerrit.service"
GERRIT_PORT="8082"
LOG_LINES=100
echo -e "${BLUE}================================${NC}"
echo -e "${BLUE}Gerrit 502 Error Diagnostic Tool${NC}"
echo -e "${BLUE}================================${NC}"
echo "Timestamp: $(date)"
echo "Hostname: $(hostname)"
echo ""
# Print a section banner: a blank line followed by "=== <title> ===" wrapped
# in the BLUE/NC colour sequences.
# Globals:   BLUE, NC (read)
# Arguments: $1 - section title
# Outputs:   the banner on stdout
print_section() {
  local title=$1
  # %b expands backslash escapes in the argument the same way `echo -e` does.
  printf '%b\n' "\n${BLUE}=== ${title} ===${NC}"
}
# Print a one-line status message prefixed with a colour-coded level tag.
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read)
# Arguments: $1 - level: OK | WARN | ERROR | INFO
#            $2 - message text
# Outputs:   "[LEVEL] message" on stdout. An unrecognised level is printed
#            without colour instead of being silently dropped.
print_status() {
  local status=${1:-}
  local message=${2:-}
  case "$status" in
    OK) echo -e "${GREEN}[OK]${NC} $message" ;;
    WARN) echo -e "${YELLOW}[WARN]${NC} $message" ;;
    ERROR) echo -e "${RED}[ERROR]${NC} $message" ;;
    INFO) echo -e "${BLUE}[INFO]${NC} $message" ;;
    *) echo -e "[${status}] $message" ;;
  esac
}
# Emit a warning (without aborting) when the effective user is not root.
# Outputs: a WARN status line via print_status for non-root users; nothing
#          when running as root.
check_privileges() {
  if (( EUID == 0 )); then
    return 0
  fi
  print_status "WARN" "Script not running as root. Some checks may fail."
}
# 1. System Overview
print_section "System Overview"
check_privileges
print_status "INFO" "System: $(uname -a)"
print_status "INFO" "Uptime: $(uptime)"

# Check system load (1-minute average).
# /proc/loadavg has a fixed, locale-independent format. The uptime fallback is
# run in the C locale so the decimal separator is always a dot; with a comma
# separator the old parsing produced values such as "052".
if [[ -r /proc/loadavg ]]; then
  read -r load _ < /proc/loadavg || load=""
else
  load=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',') || load=""
fi
load_int=${load%%.*}

if [[ ! "$load_int" =~ ^[0-9]+$ ]]; then
  # Nothing numeric to compare; report it instead of feeding junk to (( )).
  print_status "WARN" "Could not determine system load"
  load_int=0
elif (( 10#$load_int > 4 )); then # 10# forces base 10 (no octal surprises)
  print_status "ERROR" "High system load: $load"
elif (( 10#$load_int > 2 )); then
  print_status "WARN" "Moderate system load: $load"
else
  print_status "OK" "System load: $load"
fi
# 2. Gerrit Service Status
# Sets GERRIT_PID (a positive PID, or "0" when none could be found) for the
# memory section that follows. GERRIT_PID stays unset when the service is down.
print_section "Gerrit Service Status"
if systemctl is-active --quiet "$GERRIT_SERVICE"; then
  print_status "OK" "Gerrit service is running"

  # Get PID from systemd ("MainPID=<n>"; 0 means systemd tracks no main PID).
  GERRIT_PID=$(systemctl show "$GERRIT_SERVICE" --property MainPID | cut -d'=' -f2) || GERRIT_PID=""

  # Fallback method if systemctl doesn't work. pgrep exits 1 when nothing
  # matches, which must not abort the script under set -e / pipefail.
  if [[ -z "$GERRIT_PID" || "$GERRIT_PID" == "0" ]]; then
    GERRIT_PID=$(pgrep -f "gerrit.war" | head -n 1) || GERRIT_PID=""
  fi

  if [[ "$GERRIT_PID" =~ ^[1-9][0-9]*$ ]]; then
    print_status "INFO" "Gerrit PID: $GERRIT_PID"
    # Process details
    if ps -p "$GERRIT_PID" > /dev/null 2>&1; then
      ps_info=$(ps -p "$GERRIT_PID" -o pid,ppid,user,stat,pcpu,pmem,etime,cmd --no-headers) || ps_info=""
      print_status "INFO" "Process: $ps_info"
    fi
  else
    # Normalise "empty" and "0" to the single sentinel later sections test for.
    GERRIT_PID="0"
    print_status "ERROR" "Gerrit service running but no PID found"
  fi
else
  print_status "ERROR" "Gerrit service is not running"
  # `systemctl status` exits non-zero for an inactive or failed unit. That is
  # expected on this branch and must not stop the remaining diagnostics.
  systemctl status "$GERRIT_SERVICE" --no-pager -l || true
fi
# 3. Memory Analysis
# Sets swap_used (third field of the `free` Swap: row) for the summary section.
print_section "Memory Analysis"

# System memory
mem_info=$(free -h)
echo "$mem_info"

# Check swap usage
swap_used=$(free | awk '/^Swap:/ {print $3}')
if [[ "$swap_used" =~ ^[0-9]+$ ]] && (( 10#$swap_used > 0 )); then
  # Reuse the human-readable table captured above rather than running free again.
  print_status "WARN" "Swap is being used: $(awk '/^Swap:/ {print $3}' <<< "$mem_info")"
else
  print_status "OK" "No swap usage detected"
fi

# Check if Gerrit process exists for memory analysis
if [[ -n "${GERRIT_PID:-}" && "$GERRIT_PID" != "0" ]]; then
  # ps exits non-zero when the PID no longer exists; treat that as "no data"
  # rather than letting set -e / pipefail end the whole run. The C locale
  # keeps the decimal separator a dot.
  gerrit_mem=$(LC_ALL=C ps -p "$GERRIT_PID" -o pmem --no-headers 2> /dev/null | tr -d ' ') || gerrit_mem=""

  if [[ "$gerrit_mem" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    print_status "INFO" "Gerrit memory usage: ${gerrit_mem}%"
    # Check for memory leaks or high usage (integer part of the percentage)
    gerrit_mem_int=${gerrit_mem%%.*}
    if (( 10#$gerrit_mem_int > 80 )); then
      print_status "ERROR" "Gerrit using excessive memory: ${gerrit_mem}%"
    elif (( 10#$gerrit_mem_int > 60 )); then
      print_status "WARN" "Gerrit using high memory: ${gerrit_mem}%"
    fi
  else
    print_status "WARN" "Could not read memory usage for Gerrit PID $GERRIT_PID"
  fi
fi
# 4. Network Connectivity Tests
print_section "Network Connectivity"

# Check if Gerrit port is listening. Prefer ss and fall back to netstat, so a
# host without netstat does not produce a false "not listening" result.
# `|| true` keeps a no-match grep (exit 1) from failing the assignment.
if command -v ss > /dev/null 2>&1; then
  listening_info=$(ss -tlnp 2> /dev/null | grep ":$GERRIT_PORT " || true)
else
  listening_info=$(netstat -tlnp 2> /dev/null | grep ":$GERRIT_PORT " || true)
fi
if [[ -n "$listening_info" ]]; then
  print_status "OK" "Gerrit listening on port $GERRIT_PORT"
  # Show listening details
  print_status "INFO" "Listening: $listening_info"
else
  print_status "ERROR" "Gerrit not listening on port $GERRIT_PORT"
fi

# Test local connectivity
if timeout 5 curl -s -I "http://localhost:$GERRIT_PORT/" > /dev/null 2>&1; then
  print_status "OK" "Local HTTP connectivity working"
else
  print_status "ERROR" "Local HTTP connectivity failed"
fi

# Test Gerrit health endpoint
if timeout 5 curl -s "http://localhost:$GERRIT_PORT/config/server/info" > /dev/null 2>&1; then
  print_status "OK" "Gerrit health endpoint responding"
else
  print_status "ERROR" "Gerrit health endpoint not responding"
fi

# Count active connections
if command -v ss > /dev/null 2>&1; then
  # Count ESTAB sockets whose local address (4th column of `ss -ant`) ends in
  # ":<port>". awk always prints a number, so zero connections no longer trips
  # pipefail the way `grep | grep | wc -l` did, and the suffix comparison
  # avoids matching ports that merely start with the same digits.
  active_conn=$(ss -ant 2> /dev/null | awk -v port=":${GERRIT_PORT}" '
    $1 == "ESTAB" && substr($4, length($4) - length(port) + 1) == port { n++ }
    END { print n + 0 }') || active_conn=0
  print_status "INFO" "Active connections: $active_conn"
  if (( active_conn > 50 )); then
    print_status "WARN" "High number of active connections: $active_conn"
  fi
fi
# 5. Gerrit Configuration Analysis

# Print the value of the first "<key> = <value>" line inside "[<section>]" of
# a git-config style file.
# Arguments: $1 - config file
#            $2 - section name (compared case-insensitively)
#            $3 - key name (compared case-insensitively)
# Outputs:   the value with surrounding blanks removed; nothing when the key
#            is absent. Lines starting with # or ; are ignored, and a trailing
#            comment is stripped only when preceded by whitespace.
gerrit_config_value() {
  awk -v section="$2" -v key="$3" '
    /^[ \t]*[#;]/ { next }
    /^[ \t]*\[/ {
      current = $0
      sub(/^[ \t]*\[[ \t]*/, "", current)
      sub(/[ \t]*\].*$/, "", current)
      in_section = (tolower(current) == tolower(section))
      next
    }
    in_section {
      eq = index($0, "=")
      if (eq == 0) next
      name = substr($0, 1, eq - 1)
      gsub(/[ \t]/, "", name)
      if (tolower(name) != tolower(key)) next
      value = substr($0, eq + 1)
      sub(/[ \t]+[#;].*$/, "", value)
      gsub(/^[ \t]+|[ \t]+$/, "", value)
      print value
      exit
    }' "$1"
}

print_section "Gerrit Configuration"
gerrit_config="$GERRIT_HOME/etc/gerrit.config"
if [[ -f "$gerrit_config" ]]; then
  print_status "OK" "Gerrit config found"

  # Extract HTTP configuration
  echo -e "\n${YELLOW}HTTP Configuration:${NC}"
  grep -A 10 "^\[httpd\]" "$gerrit_config" || print_status "WARN" "No [httpd] section found"

  # Check for thread configuration. Only the [httpd] section is consulted, so
  # commented-out lines and same-named keys elsewhere are not picked up.
  max_threads=$(gerrit_config_value "$gerrit_config" httpd maxThreads) || max_threads=""
  if [[ -z "$max_threads" ]]; then
    print_status "WARN" "maxThreads not explicitly configured (using default: 25)"
  else
    print_status "INFO" "Configured maxThreads: $max_threads"
    if [[ ! "$max_threads" =~ ^[0-9]+$ ]]; then
      print_status "WARN" "maxThreads value is not a plain integer: $max_threads"
    elif (( 10#$max_threads < 30 )); then
      print_status "WARN" "maxThreads ($max_threads) may be too low for high load"
    fi
  fi

  # Check for minThreads
  min_threads=$(gerrit_config_value "$gerrit_config" httpd minThreads) || min_threads=""
  if [[ -n "$min_threads" ]]; then
    print_status "INFO" "Configured minThreads: $min_threads"
  else
    print_status "INFO" "minThreads not configured (using default: 5)"
  fi

  # Check for queue size
  max_queued=$(gerrit_config_value "$gerrit_config" httpd maxQueued) || max_queued=""
  if [[ -n "$max_queued" ]]; then
    print_status "INFO" "Configured maxQueued: $max_queued"
  else
    print_status "WARN" "maxQueued not configured (using default: 200)"
  fi
else
  print_status "ERROR" "Gerrit config not found at $GERRIT_HOME/etc/gerrit.config"
fi
# 6. Log Analysis
# Sets thread_rejections (read again by the summary section) when the error
# log exists.

# Count the lines matching a pattern among the last $LOG_LINES lines of a file.
# Globals:   LOG_LINES (read)
# Arguments: $1 - file to inspect
#            $2 - grep basic-regex pattern
#            $3 - optional "-i" for a case-insensitive match
# Outputs:   a single integer on stdout ("0" when nothing matches or the file
#            cannot be read)
count_recent_matches() {
  local file=$1 pattern=$2 ignore_case=${3:-}
  local -a grep_opts=(-c)
  if [[ "$ignore_case" == "-i" ]]; then
    grep_opts+=(-i)
  fi
  # grep -c already prints "0" when nothing matches but exits 1; `|| true`
  # absorbs that status without printing a second "0".
  tail -n "$LOG_LINES" -- "$file" 2> /dev/null | grep "${grep_opts[@]}" -- "$pattern" || true
}

print_section "Recent Log Analysis"
error_log="$GERRIT_HOME/logs/error_log"

# Check for recent thread pool exhaustion errors
if [[ -f "$error_log" ]]; then
  print_status "OK" "Error log found"

  # Count recent thread pool rejections
  thread_rejections=$(count_recent_matches "$error_log" "rejected.*QueuedThreadPool")
  if (( thread_rejections > 0 )); then
    print_status "ERROR" "Thread pool rejections in last $LOG_LINES lines: $thread_rejections"
    echo -e "${YELLOW}Recent thread rejection sample:${NC}"
    tail -n "$LOG_LINES" -- "$error_log" | grep "rejected.*QueuedThreadPool" | tail -3 || true
  else
    print_status "OK" "No recent thread pool rejections found"
  fi

  # Check for database issues
  db_errors=$(count_recent_matches "$error_log" "database\|sql.*error\|connection.*failed" -i)
  if (( db_errors > 0 )); then
    print_status "ERROR" "Database errors in last $LOG_LINES lines: $db_errors"
  else
    print_status "OK" "No recent database errors"
  fi

  # Check for OutOfMemory errors
  oom_errors=$(count_recent_matches "$error_log" "outofmemory\|java heap space" -i)
  if (( oom_errors > 0 )); then
    print_status "ERROR" "OutOfMemory errors found: $oom_errors"
  else
    print_status "OK" "No OutOfMemory errors"
  fi

  # Check for GC issues
  gc_issues=$(count_recent_matches "$error_log" "gc.*pause\|concurrent mark sweep" -i)
  if (( gc_issues > 0 )); then
    print_status "WARN" "GC activity detected: $gc_issues entries"
  fi
else
  print_status "ERROR" "Error log not found at $GERRIT_HOME/logs/error_log"
fi

# Check systemd journal for recent errors. The journal is read once so the
# count and the sample shown below come from the same snapshot.
print_status "INFO" "Checking systemd journal for recent errors"
journal_text=$(journalctl -u "$GERRIT_SERVICE" --since "10 minutes ago" --no-pager 2> /dev/null || true)
recent_errors=$(grep -i -c "error\|failed\|exception" <<< "$journal_text" || true)
if (( recent_errors > 0 )); then
  print_status "WARN" "Recent systemd errors: $recent_errors"
  echo -e "${YELLOW}Recent errors:${NC}"
  grep -i "error\|failed\|exception" <<< "$journal_text" | tail -5 || true
fi
# 7. Disk Space and I/O
# Sets disk_usage (integer percent, or empty when unavailable) for the summary.
print_section "Disk Analysis"

# Check disk space. df fails when the path is missing; report that as a
# finding instead of letting set -e end the run.
disk_usage=""
if df_output=$(df -h "$GERRIT_HOME" 2>&1); then
  echo "$df_output"
  # -P selects the one-line-per-filesystem POSIX layout, so row 2 / column 5
  # is the capacity even when the device name is long.
  disk_usage=$(df -P "$GERRIT_HOME" 2> /dev/null | awk 'NR==2 {gsub(/%/, "", $5); print $5}') || disk_usage=""

  # Check if disk space is low
  if [[ ! "$disk_usage" =~ ^[0-9]+$ ]]; then
    disk_usage=""
    print_status "WARN" "Disk usage percentage not available for $GERRIT_HOME"
  elif (( disk_usage > 90 )); then
    print_status "ERROR" "Disk usage critical: ${disk_usage}%"
  elif (( disk_usage > 80 )); then
    print_status "WARN" "Disk usage high: ${disk_usage}%"
  else
    print_status "OK" "Disk usage: ${disk_usage}%"
  fi
else
  print_status "ERROR" "Cannot check disk space for $GERRIT_HOME: $df_output"
fi

# Check I/O wait
if command -v iostat > /dev/null; then
  # Take the 4th field (%iowait) of the last all-numeric row. The previous
  # `tail -1` is unreliable if the report ends with a blank line. The C locale
  # keeps the decimal separator a dot.
  io_wait=$(LC_ALL=C iostat -c 1 2 2> /dev/null | awk '
    NF >= 6 && $1 ~ /^[0-9.]+$/ { v = $4 }
    END { sub(/\..*/, "", v); print v }') || io_wait=""
  if [[ "$io_wait" =~ ^[0-9]+$ ]] && (( 10#$io_wait > 20 )); then
    print_status "WARN" "High I/O wait: ${io_wait}%"
  elif [[ "$io_wait" =~ ^[0-9]+$ ]]; then
    print_status "OK" "I/O wait: ${io_wait}%"
  else
    print_status "INFO" "I/O wait data not available"
  fi
fi
# 8. Web Server Configuration (if applicable)
print_section "Web Server Analysis"

# Check for nginx
if systemctl is-active --quiet nginx 2> /dev/null; then
  print_status "INFO" "Nginx is running"

  # Check nginx error log for entries carrying today's date
  if [[ -f "/var/log/nginx/error.log" ]]; then
    nginx_today=$(date '+%Y/%m/%d')
    # grep -c prints "0" itself when nothing matches (and exits 1); `|| true`
    # avoids appending a second "0" that would break the numeric test.
    nginx_errors=$(tail -n 50 /var/log/nginx/error.log 2> /dev/null | grep -c -F -- "$nginx_today" || true)
    nginx_errors=${nginx_errors:-0}
    if (( nginx_errors > 0 )); then
      print_status "WARN" "Nginx errors today: $nginx_errors"
      echo -e "${YELLOW}Recent nginx errors:${NC}"
      tail -n 50 /var/log/nginx/error.log | grep -F -- "$nginx_today" | tail -3 || true
    fi
  fi

  # Test nginx config. The output is captured once so that a failing check can
  # be shown without re-running `nginx -t` as a bare command, which would stop
  # the script under set -e.
  if nginx_test_output=$(nginx -t 2>&1); then
    print_status "OK" "Nginx configuration is valid"
  else
    print_status "ERROR" "Nginx configuration has errors"
    echo "$nginx_test_output"
  fi
fi

# Check for Apache
if systemctl is-active --quiet httpd 2> /dev/null || systemctl is-active --quiet apache2 2> /dev/null; then
  print_status "INFO" "Apache is running"

  # Check Apache error log (first of the known locations that exists)
  for log_path in "/var/log/httpd/error_log" "/var/log/apache2/error.log"; do
    if [[ -f "$log_path" ]]; then
      # C locale, so the day and month names do not vary with the caller's locale.
      apache_today=$(LC_ALL=C date '+%a %b %d')
      apache_errors=$(tail -n 50 "$log_path" 2> /dev/null | grep -c -F -- "$apache_today" || true)
      apache_errors=${apache_errors:-0}
      if (( apache_errors > 0 )); then
        print_status "WARN" "Apache errors today: $apache_errors"
      fi
      break
    fi
  done
fi
# 9. Database Analysis
print_section "Database Analysis"

# Try to identify database type from config
if [[ -f "$GERRIT_HOME/etc/gerrit.config" ]]; then
  # Read the first "type = ..." key inside the [database] section only.
  db_type=$(awk '
    /^[ \t]*[#;]/ { next }
    /^[ \t]*\[/ { in_db = (tolower($0) ~ /^[ \t]*\[database\]/); next }
    in_db && tolower($0) ~ /^[ \t]*type[ \t]*=/ {
      sub(/^[^=]*=[ \t]*/, "")
      sub(/[ \t]+$/, "")
      print
      exit
    }' "$GERRIT_HOME/etc/gerrit.config") || db_type=""
  db_type=${db_type:-unknown}
  print_status "INFO" "Database type: $db_type"

  # Database-specific checks. The comparison is done on a lower-cased copy so
  # spellings such as "H2" or "POSTGRESQL" are recognised too.
  db_type_key=$(printf '%s' "$db_type" | tr '[:upper:]' '[:lower:]')
  case "$db_type_key" in
    postgresql | postgres)
      if command -v psql > /dev/null; then
        print_status "INFO" "PostgreSQL tools available"
        # Note: Actual DB queries would require credentials
        print_status "INFO" "Database connection testing requires manual verification"
      else
        print_status "WARN" "PostgreSQL tools not available for testing"
      fi
      ;;
    h2)
      print_status "INFO" "Using H2 embedded database"
      if [[ -f "$GERRIT_HOME/db/ReviewDB.h2.db" ]]; then
        db_size=$(du -sh "$GERRIT_HOME/db/ReviewDB.h2.db" | cut -f1)
        print_status "INFO" "H2 database size: $db_size"
      fi
      ;;
    mysql)
      if command -v mysql > /dev/null; then
        print_status "INFO" "MySQL tools available"
      else
        print_status "WARN" "MySQL tools not available for testing"
      fi
      ;;
    *)
      # NOTE(review): a site without a [database] section lands here as
      # "unknown"; presumably the case for NoteDb-only installs. Confirm
      # against the Gerrit version in use.
      print_status "INFO" "No database-specific checks available for type: $db_type"
      ;;
  esac
fi
# 10. Summary and Recommendations
#
# Re-evaluates values collected by the earlier sections (thread_rejections,
# swap_used, disk_usage, load) plus a few live probes, prints one finding per
# detected problem, then lists follow-up actions matching those findings.
print_section "Summary and Recommendations"
echo -e "${BLUE}Key Findings:${NC}"

# Analyze the most critical issues
critical_issues=0
warnings=0
# 0/1 flags so the action list below only mentions what was actually found.
high_load=0
tune_thread_pool=0
gerrit_unreachable=0
low_disk=0
swap_in_use=0

# Succeed when $1 is a non-negative integer strictly greater than $2.
# Empty or non-numeric input (a check that was skipped or failed) counts as
# "not greater". 10# forces base 10 so values like "00" or "08" are not octal.
value_exceeds() {
  [[ "${1:-}" =~ ^[0-9]+$ ]] && (( 10#$1 > $2 ))
}

# Counters are bumped with var=$((var + 1)): `((var++))` returns status 1 when
# the old value is 0, which under `set -e` would end the script on the very
# first finding.

# Check for thread pool issues
if value_exceeds "${thread_rejections:-}" 0; then
  echo -e "${RED}CRITICAL:${NC} Thread pool exhaustion detected ($thread_rejections rejections)"
  echo " → Recommendation: Increase httpd.maxThreads and maxQueued in gerrit.config"
  critical_issues=$((critical_issues + 1))
  tune_thread_pool=1
fi

# Check memory issues
if value_exceeds "${swap_used:-}" 0; then
  echo -e "${YELLOW}WARNING:${NC} System is using swap memory"
  echo " → Recommendation: Increase system RAM or reduce Gerrit heap size"
  warnings=$((warnings + 1))
  swap_in_use=1
fi

# Check disk space
if value_exceeds "${disk_usage:-}" 85; then
  echo -e "${RED}CRITICAL:${NC} Low disk space (${disk_usage}%)"
  echo " → Recommendation: Clean up logs and temporary files"
  critical_issues=$((critical_issues + 1))
  low_disk=1
fi

# Check system load (integer part of the 1-minute average from section 1)
load_int_value=${load:-}
load_int_value=${load_int_value%%.*}
if value_exceeds "$load_int_value" 4; then
  echo -e "${RED}CRITICAL:${NC} High system load ($load)"
  echo " → Recommendation: Investigate CPU-intensive processes"
  critical_issues=$((critical_issues + 1))
  high_load=1
fi

# Check HTTP connectivity issues. Test curl's exit status directly; capturing
# its output together with `echo $?` mixed the response headers into the value
# and made the comparison with "0" fail even on success.
if ! timeout 5 curl -s -I -o /dev/null "http://localhost:$GERRIT_PORT/" 2> /dev/null; then
  echo -e "${RED}CRITICAL:${NC} Gerrit HTTP service not responding on localhost:$GERRIT_PORT"
  echo " → Recommendation: Check if Gerrit is fully started and not rejecting connections"
  critical_issues=$((critical_issues + 1))
  gerrit_unreachable=1
fi

# Check thread configuration
if ! grep -q "maxThreads" "$GERRIT_HOME/etc/gerrit.config" 2> /dev/null; then
  echo -e "${YELLOW}WARNING:${NC} Using default thread pool size (25), may be insufficient"
  echo " → Recommendation: Add explicit thread configuration to handle load"
  warnings=$((warnings + 1))
  tune_thread_pool=1
fi

# Service status
if ! systemctl is-active --quiet "$GERRIT_SERVICE"; then
  echo -e "${RED}CRITICAL:${NC} Gerrit service is not running"
  echo " → Recommendation: Check service logs and restart Gerrit"
  critical_issues=$((critical_issues + 1))
  gerrit_unreachable=1
fi

echo ""
echo -e "${BLUE}Recommended Actions:${NC}"
if (( critical_issues == 0 && warnings == 0 )); then
  echo -e "${GREEN}No critical issues detected.${NC}"
  echo "Monitor logs for intermittent issues and consider thread pool tuning for optimization."
else
  # Number the actions dynamically and print only those backed by a finding,
  # using the measured load rather than a fixed figure.
  step=0
  if (( high_load )); then
    step=$((step + 1))
    echo -e "${step}. ${RED}URGENT:${NC} Address high system load ($load) - this is likely contributing to the 502s"
    step=$((step + 1))
    echo "${step}. Investigate what processes are consuming CPU (use 'top' or 'htop')"
  fi
  if (( gerrit_unreachable )); then
    step=$((step + 1))
    echo "${step}. Check the Gerrit service logs and restart the service if it is down or unresponsive"
  fi
  if (( tune_thread_pool )); then
    step=$((step + 1))
    echo "${step}. Configure thread pool settings in gerrit.config:"
    echo " [httpd]"
    echo " maxThreads = 50"
    echo " maxQueued = 500"
    echo " minThreads = 10"
  fi
  if (( low_disk )); then
    step=$((step + 1))
    echo "${step}. Free disk space under $GERRIT_HOME (old logs, temporary files)"
  fi
  if (( swap_in_use )); then
    step=$((step + 1))
    echo "${step}. Review system RAM versus the configured Gerrit heap size to avoid swapping"
  fi
  step=$((step + 1))
  echo "${step}. Monitor system resources during peak usage"
fi

echo ""
echo -e "${BLUE}Next Steps:${NC}"
echo "1. Review the thread pool configuration in $GERRIT_HOME/etc/gerrit.config"
echo "2. Monitor $GERRIT_HOME/logs/error_log for real-time issues"
echo "3. Consider implementing proper monitoring (Prometheus + Grafana)"
echo "4. Set up log rotation if not already configured"
echo ""
echo -e "${BLUE}=== Diagnostic Complete ===${NC}"
echo "Issues found: ${critical_issues} critical, ${warnings} warnings"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment