Created
June 26, 2025 04:51
-
-
Save askb/53e8c334a532fcfb7ad850eb2740558d to your computer and use it in GitHub Desktop.
Gerrit 502 script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Gerrit 502 Error Diagnostic Script
# Usage: sudo ./gerrit-diagnostic.sh
# This script performs read-only checks to identify the root cause of 502 errors.
#
# Optional environment overrides (the value in parentheses is the default):
#   GERRIT_HOME     Gerrit site directory            (/opt/gerrit)
#   GERRIT_SERVICE  systemd unit name                (gerrit.service)
#   GERRIT_PORT     local HTTP port Gerrit listens on (8082)
#   LOG_LINES       number of trailing log lines to scan (100)

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Colors for output. These are stored as literal backslash sequences and are
# only turned into terminal escapes when printed with `echo -e` / `printf %b`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Configuration (each value can be overridden from the environment)
GERRIT_HOME="${GERRIT_HOME:-/opt/gerrit}"
GERRIT_SERVICE="${GERRIT_SERVICE:-gerrit.service}"
GERRIT_PORT="${GERRIT_PORT:-8082}"
LOG_LINES="${LOG_LINES:-100}"

# Report banner
echo -e "${BLUE}================================${NC}"
echo -e "${BLUE}Gerrit 502 Error Diagnostic Tool${NC}"
echo -e "${BLUE}================================${NC}"
echo "Timestamp: $(date)"
echo "Hostname: $(hostname 2>/dev/null || uname -n)"
echo ""
#######################################
# Print a section header preceded by a blank line.
# Globals:   BLUE, NC (read; treated as empty when unset)
# Arguments: $1 - section title (printed verbatim, no escape interpretation)
# Outputs:   writes the header to stdout
#######################################
print_section() {
  # %b expands the colour escape sequences; %s keeps the title literal so a
  # backslash in the title is not interpreted as an escape.
  printf '\n%b=== %s ===%b\n' "${BLUE:-}" "${1:-}" "${NC:-}"
}
#######################################
# Print a single status line of the form "[STATUS] message".
# Globals:   RED, GREEN, YELLOW, BLUE, NC (read; treated as empty when unset)
# Arguments: $1 - status tag: OK, WARN, ERROR or INFO (any other tag is
#                 printed uncoloured instead of being silently dropped)
#            $2 - message text (printed verbatim, no escape interpretation)
# Outputs:   writes the status line to stdout
#######################################
print_status() {
  local status=${1:-INFO}
  local message=${2:-}
  local color

  case "$status" in
    OK) color=${GREEN:-} ;;
    WARN) color=${YELLOW:-} ;;
    ERROR) color=${RED:-} ;;
    INFO) color=${BLUE:-} ;;
    *) color='' ;; # unknown tag: still print the message
  esac

  # %b expands the colour escapes; %s keeps tag and message literal so that
  # backslashes in command output or log excerpts are shown unchanged.
  printf '%b[%s]%b %s\n' "$color" "$status" "${NC:-}" "$message"
}
#######################################
# Warn when the script is not running as root.
# Globals:   EUID (read)
# Outputs:   a WARN status line (via print_status) when EUID is non-zero;
#            nothing when running as root
#######################################
check_privileges() {
  if (( EUID != 0 )); then
    print_status "WARN" "Script not running as root. Some checks may fail."
  fi
}
# 1. System Overview
print_section "System Overview"
check_privileges
print_status "INFO" "System: $(uname -a)"
print_status "INFO" "Uptime: $(uptime)"

# Check system load (1-minute average, kept in $load for the final summary).
# /proc/loadavg is preferred because its format does not depend on the locale;
# the uptime fallback is forced to the C locale so the decimal separator is a
# dot and stripping commas cannot corrupt the number.
load=""
if [[ -r /proc/loadavg ]]; then
  read -r load _ < /proc/loadavg || true
fi
if [[ -z "$load" ]]; then
  load=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk '{print $1}' | tr -d ',' || true)
fi
load=${load:-0}

# Integer part only; anything non-numeric is treated as 0 so the arithmetic
# comparison below cannot fail and abort the script under `set -e`.
load_int=${load%%.*}
[[ "$load_int" =~ ^[0-9]+$ ]] || load_int=0

# 10# forces base 10 so a value with a leading zero is not parsed as octal.
if (( 10#$load_int > 4 )); then
  print_status "ERROR" "High system load: $load"
elif (( 10#$load_int > 2 )); then
  print_status "WARN" "Moderate system load: $load"
else
  print_status "OK" "System load: $load"
fi
# 2. Gerrit Service Status
print_section "Gerrit Service Status"

# Empty means "no PID known". Initialised up front so later sections can test
# it safely under `set -u` even when the service is down.
GERRIT_PID=""

if systemctl is-active --quiet "$GERRIT_SERVICE"; then
  print_status "OK" "Gerrit service is running"

  # Get the main PID from systemd ("MainPID=<n>" -> "<n>").
  # `|| true` keeps a failed query from aborting the script (set -e + pipefail).
  GERRIT_PID=$(systemctl show "$GERRIT_SERVICE" --property MainPID 2>/dev/null | cut -d'=' -f2 || true)

  # Fallback: look the process up by command line. pgrep exits non-zero when
  # nothing matches, which must not terminate the diagnostic run.
  if [[ -z "$GERRIT_PID" || "$GERRIT_PID" == "0" ]]; then
    GERRIT_PID=$(pgrep -f "gerrit.war" | head -1 || true)
  fi

  # An empty PID counts as "not found" too, not only the literal 0.
  if [[ -n "$GERRIT_PID" && "$GERRIT_PID" != "0" ]]; then
    print_status "INFO" "Gerrit PID: $GERRIT_PID"

    # Process details (skipped quietly if the process has already exited)
    if ps_info=$(ps -p "$GERRIT_PID" -o pid,ppid,user,stat,pcpu,pmem,etime,cmd --no-headers 2>/dev/null); then
      print_status "INFO" "Process: $ps_info"
    fi
  else
    GERRIT_PID=""
    print_status "ERROR" "Gerrit service running but no PID found"
  fi
else
  print_status "ERROR" "Gerrit service is not running"
  # `systemctl status` returns non-zero for an inactive unit; without the guard
  # the script would stop here, in exactly the situation it is meant to diagnose.
  systemctl status "$GERRIT_SERVICE" --no-pager -l || true
fi
# 3. Memory Analysis
print_section "Memory Analysis"

# System memory (human-readable table, shown as-is)
mem_info=$(free -h 2>/dev/null || true)
if [[ -n "$mem_info" ]]; then
  echo "$mem_info"
else
  print_status "WARN" "Unable to read memory statistics from 'free'"
fi

# Check swap usage ("used" column of the Swap: row). Non-numeric or missing
# output is treated as 0 so later numeric tests cannot fail.
swap_used=$(free 2>/dev/null | awk '/^Swap:/ {print $3}' || true)
[[ "$swap_used" =~ ^[0-9]+$ ]] || swap_used=0

if (( swap_used > 0 )); then
  # Reuse the human-readable table captured above instead of calling free again.
  swap_human=$(awk '/^Swap:/ {print $3}' <<< "$mem_info")
  print_status "WARN" "Swap is being used: ${swap_human:-$swap_used}"
else
  print_status "OK" "No swap usage detected"
fi

# Check if Gerrit process exists for memory analysis
if [[ -n "${GERRIT_PID:-}" && "${GERRIT_PID:-}" != "0" ]]; then
  # Gerrit process memory usage as a percentage of RAM. ps fails when the
  # process has gone away; that must not abort the script.
  gerrit_mem=$(ps -p "$GERRIT_PID" -o pmem --no-headers 2>/dev/null | tr -d ' ' || true)

  if [[ -n "$gerrit_mem" ]]; then
    print_status "INFO" "Gerrit memory usage: ${gerrit_mem}%"

    # Check for memory leaks or high usage (integer part of the percentage)
    gerrit_mem_int=${gerrit_mem%%.*}
    [[ "$gerrit_mem_int" =~ ^[0-9]+$ ]] || gerrit_mem_int=0
    if (( 10#$gerrit_mem_int > 80 )); then
      print_status "ERROR" "Gerrit using excessive memory: ${gerrit_mem}%"
    elif (( 10#$gerrit_mem_int > 60 )); then
      print_status "WARN" "Gerrit using high memory: ${gerrit_mem}%"
    fi
  else
    print_status "WARN" "Could not read memory usage for Gerrit PID $GERRIT_PID"
  fi
fi
# 4. Network Connectivity Tests
print_section "Network Connectivity"

# Capture the list of listening TCP sockets once. netstat is tried first and ss
# is used when netstat is not installed. The output is stored in a variable and
# searched afterwards, so no `grep -q` sits at the end of a live pipeline
# (under pipefail an early-exiting grep can make the whole pipeline "fail").
socket_tool=""
listening_sockets=""
if command -v netstat > /dev/null 2>&1; then
  socket_tool="netstat"
  listening_sockets=$(netstat -tlnp 2>/dev/null || true)
elif command -v ss > /dev/null 2>&1; then
  socket_tool="ss"
  listening_sockets=$(ss -tlnp 2>/dev/null || true)
fi

# Check if Gerrit port is listening
if [[ -z "$socket_tool" ]]; then
  # Distinguish "cannot check" from "checked and not listening".
  print_status "WARN" "Neither netstat nor ss is available; cannot check listening ports"
else
  # The trailing space after the port avoids matching longer port numbers.
  listening_info=$(grep -- ":${GERRIT_PORT} " <<< "$listening_sockets" || true)
  if [[ -n "$listening_info" ]]; then
    print_status "OK" "Gerrit listening on port $GERRIT_PORT"
    # Show listening details
    print_status "INFO" "Listening: $listening_info"
  else
    print_status "ERROR" "Gerrit not listening on port $GERRIT_PORT"
  fi
fi

# Test local connectivity (any HTTP response counts as "connectable")
if timeout 5 curl -s -I "http://localhost:${GERRIT_PORT}/" > /dev/null 2>&1; then
  print_status "OK" "Local HTTP connectivity working"
else
  print_status "ERROR" "Local HTTP connectivity failed"
fi

# Test Gerrit health endpoint. -f makes curl return non-zero on HTTP error
# statuses (>= 400), so an error page is not reported as a healthy response.
if timeout 5 curl -s -f "http://localhost:${GERRIT_PORT}/config/server/info" > /dev/null 2>&1; then
  print_status "OK" "Gerrit health endpoint responding"
else
  print_status "ERROR" "Gerrit health endpoint not responding"
fi

# Count active connections: established sockets whose *local* address ends in
# the Gerrit port. awk always prints a number, so zero connections no longer
# produces a failing pipeline that would abort the script.
if command -v ss > /dev/null 2>&1; then
  active_conn=$(ss -ant 2>/dev/null \
    | awk -v port="$GERRIT_PORT" '$1 == "ESTAB" && $4 ~ (":" port "$") { n++ } END { print n + 0 }') \
    || active_conn=0
  active_conn=${active_conn:-0}
  print_status "INFO" "Active connections: $active_conn"
  if (( active_conn > 50 )); then
    print_status "WARN" "High number of active connections: $active_conn"
  fi
fi
# 5. Gerrit Configuration Analysis
print_section "Gerrit Configuration"

gerrit_config="$GERRIT_HOME/etc/gerrit.config"

#######################################
# Print the value of the first "<key> = <value>" line in gerrit.config whose
# key (ignoring surrounding whitespace) equals $1 exactly. Commented-out lines
# do not match because their key text starts with the comment character.
# The section the key lives in is not checked.
# Globals:   gerrit_config (read)
# Arguments: $1 - key name, e.g. maxThreads
# Outputs:   the trimmed value on stdout, or nothing when the key is absent
#######################################
gerrit_config_value() {
  awk -F'=' -v key="$1" '
    { name = $1; gsub(/[[:space:]]/, "", name) }
    name == key {
      value = $0
      sub(/^[^=]*=/, "", value)
      gsub(/^[[:space:]]+|[[:space:]]+$/, "", value)
      print value
      exit
    }
  ' "$gerrit_config" 2>/dev/null || true
}

if [[ -f "$gerrit_config" ]]; then
  print_status "OK" "Gerrit config found"

  # Extract HTTP configuration
  echo -e "\n${YELLOW}HTTP Configuration:${NC}"
  grep -A 10 "^\[httpd\]" "$gerrit_config" | head -15 || print_status "WARN" "No [httpd] section found"

  # Check for thread configuration
  max_threads=$(gerrit_config_value "maxThreads")
  if [[ -n "$max_threads" ]]; then
    print_status "INFO" "Configured maxThreads: $max_threads"
    # Only compare when the value is a plain integer; anything else would
    # raise an arithmetic error.
    if [[ "$max_threads" =~ ^[0-9]+$ ]] && (( 10#$max_threads < 30 )); then
      print_status "WARN" "maxThreads ($max_threads) may be too low for high load"
    fi
  else
    print_status "WARN" "maxThreads not explicitly configured (using default: 25)"
  fi

  # Check for minThreads
  min_threads=$(gerrit_config_value "minThreads")
  if [[ -n "$min_threads" ]]; then
    print_status "INFO" "Configured minThreads: $min_threads"
  else
    print_status "INFO" "minThreads not configured (using default: 5)"
  fi

  # Check for queue size
  max_queued=$(gerrit_config_value "maxQueued")
  if [[ -n "$max_queued" ]]; then
    print_status "INFO" "Configured maxQueued: $max_queued"
  else
    print_status "WARN" "maxQueued not configured (using default: 200)"
  fi
else
  print_status "ERROR" "Gerrit config not found at $gerrit_config"
fi
# 6. Log Analysis
print_section "Recent Log Analysis"

error_log="$GERRIT_HOME/logs/error_log"

# Check for recent thread pool exhaustion errors
if [[ -f "$error_log" ]]; then
  print_status "OK" "Error log found"

  # Read the tail once so every check below looks at the same snapshot.
  recent_log=$(tail -n "$LOG_LINES" "$error_log" 2>/dev/null || true)

  # Note on counting: `grep -c` already prints 0 when nothing matches but exits
  # non-zero; `|| true` swallows that status without adding a second "0" to
  # the captured output.

  # Count recent thread pool rejections
  thread_rejections=$(grep -c "rejected.*QueuedThreadPool" <<< "$recent_log" || true)
  thread_rejections=${thread_rejections:-0}
  if (( thread_rejections > 0 )); then
    print_status "ERROR" "Thread pool rejections in last $LOG_LINES lines: $thread_rejections"
    echo -e "${YELLOW}Recent thread rejection sample:${NC}"
    grep "rejected.*QueuedThreadPool" <<< "$recent_log" | tail -3 || true
  else
    print_status "OK" "No recent thread pool rejections found"
  fi

  # Check for database issues
  db_errors=$(grep -i -c "database\|sql.*error\|connection.*failed" <<< "$recent_log" || true)
  db_errors=${db_errors:-0}
  if (( db_errors > 0 )); then
    print_status "ERROR" "Database errors in last $LOG_LINES lines: $db_errors"
  else
    print_status "OK" "No recent database errors"
  fi

  # Check for OutOfMemory errors
  oom_errors=$(grep -i -c "outofmemory\|java heap space" <<< "$recent_log" || true)
  oom_errors=${oom_errors:-0}
  if (( oom_errors > 0 )); then
    print_status "ERROR" "OutOfMemory errors found: $oom_errors"
  else
    print_status "OK" "No OutOfMemory errors"
  fi

  # Check for GC issues
  gc_issues=$(grep -i -c "gc.*pause\|concurrent mark sweep" <<< "$recent_log" || true)
  gc_issues=${gc_issues:-0}
  if (( gc_issues > 0 )); then
    print_status "WARN" "GC activity detected: $gc_issues entries"
  fi
else
  print_status "ERROR" "Error log not found at $error_log"
fi

# Check systemd journal for recent errors. The journal is queried once and the
# matching lines are kept, so the count and the sample cannot disagree.
print_status "INFO" "Checking systemd journal for recent errors"
journal_errors=$(journalctl -u "$GERRIT_SERVICE" --since "10 minutes ago" --no-pager 2>/dev/null \
  | grep -i "error\|failed\|exception" || true)
recent_errors=0
if [[ -n "$journal_errors" ]]; then
  # Count lines of the captured matches.
  recent_errors=$(grep -c '' <<< "$journal_errors" || true)
fi
if (( recent_errors > 0 )); then
  print_status "WARN" "Recent systemd errors: $recent_errors"
  echo -e "${YELLOW}Recent errors:${NC}"
  tail -5 <<< "$journal_errors"
fi
# 7. Disk Space and I/O
print_section "Disk Analysis"

# Check disk space. disk_usage stays empty when it cannot be determined; the
# final summary only evaluates it when it is non-empty.
disk_usage=""
if df_output=$(df -h "$GERRIT_HOME" 2>&1); then
  echo "$df_output"
  # -P selects the POSIX output format, which keeps each filesystem on a
  # single line so the second line is always the data row.
  disk_usage=$(df -P "$GERRIT_HOME" 2>/dev/null | awk 'NR==2 { sub(/%/, "", $5); print $5 }' || true)
else
  # A missing or unreadable GERRIT_HOME must not abort the whole run.
  print_status "ERROR" "Unable to read disk usage for $GERRIT_HOME: $df_output"
fi

# Check if disk space is low
if [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
  if (( disk_usage > 90 )); then
    print_status "ERROR" "Disk usage critical: ${disk_usage}%"
  elif (( disk_usage > 80 )); then
    print_status "WARN" "Disk usage high: ${disk_usage}%"
  else
    print_status "OK" "Disk usage: ${disk_usage}%"
  fi
else
  disk_usage=""
  print_status "WARN" "Disk usage for $GERRIT_HOME could not be determined"
fi

# Check I/O wait
if command -v iostat > /dev/null 2>&1; then
  # Take the 4th column (%iowait in the avg-cpu report) of the LAST all-numeric
  # row, i.e. the second sample. Selecting by content rather than `tail -1`
  # avoids picking up a trailing blank line; LC_ALL=C keeps the decimal
  # separator a dot. NOTE(review): column position assumes sysstat's
  # "%user %nice %system %iowait %steal %idle" layout - confirm on target hosts.
  io_wait=$(LC_ALL=C iostat -c 1 2 2>/dev/null \
    | awk 'NF >= 6 && $1 ~ /^[0-9.]+$/ { v = $4 } END { print v }' \
    | cut -d'.' -f1 || true)
  if [[ "$io_wait" =~ ^[0-9]+$ ]]; then
    if (( 10#$io_wait > 20 )); then
      print_status "WARN" "High I/O wait: ${io_wait}%"
    else
      print_status "OK" "I/O wait: ${io_wait}%"
    fi
  else
    print_status "INFO" "I/O wait data not available"
  fi
fi
# 8. Web Server Configuration (if applicable)
print_section "Web Server Analysis"

# Check for nginx
if systemctl is-active --quiet nginx 2>/dev/null; then
  print_status "INFO" "Nginx is running"

  # Check nginx error log for entries carrying today's date
  if [[ -f "/var/log/nginx/error.log" ]]; then
    nginx_today=$(date '+%Y/%m/%d')
    # Keep the matching lines so the count and the sample come from one read.
    # `|| true`: no match is a normal outcome, not a reason to abort.
    nginx_today_lines=$(tail -n 50 /var/log/nginx/error.log 2>/dev/null \
      | grep -F -- "$nginx_today" || true)
    nginx_errors=0
    if [[ -n "$nginx_today_lines" ]]; then
      nginx_errors=$(grep -c '' <<< "$nginx_today_lines" || true)
    fi
    if (( nginx_errors > 0 )); then
      print_status "WARN" "Nginx errors today: $nginx_errors"
      echo -e "${YELLOW}Recent nginx errors:${NC}"
      tail -3 <<< "$nginx_today_lines"
    fi
  fi

  # Test nginx config. The output is captured so the test runs only once and a
  # failing test cannot terminate the script under `set -e`.
  if nginx_test_output=$(nginx -t 2>&1); then
    print_status "OK" "Nginx configuration is valid"
  else
    print_status "ERROR" "Nginx configuration has errors"
    printf '%s\n' "$nginx_test_output" >&2
  fi
fi

# Check for Apache
if systemctl is-active --quiet httpd 2>/dev/null || systemctl is-active --quiet apache2 2>/dev/null; then
  print_status "INFO" "Apache is running"

  # Check Apache error log (first existing path wins)
  for log_path in "/var/log/httpd/error_log" "/var/log/apache2/error.log"; do
    if [[ -f "$log_path" ]]; then
      # LC_ALL=C yields English day/month abbreviations regardless of the
      # caller's locale. NOTE(review): assumes the log uses the default
      # "Thu Jun 26"-style timestamp - confirm against the ErrorLogFormat in use.
      apache_today=$(LC_ALL=C date '+%a %b %d')
      apache_errors=$(tail -n 50 "$log_path" 2>/dev/null | grep -F -c -- "$apache_today" || true)
      apache_errors=${apache_errors:-0}
      if (( apache_errors > 0 )); then
        print_status "WARN" "Apache errors today: $apache_errors"
      fi
      break
    fi
  done
fi
# 9. Database Analysis
print_section "Database Analysis"

# Try to identify database type from config
if [[ -f "$GERRIT_HOME/etc/gerrit.config" ]]; then
  # Read the `type` key from inside the [database] section only: the section
  # flag is reset at every "[...]" header, and the key must be exactly "type"
  # so other keys that merely contain that word do not match. The value is
  # lower-cased so "H2" and "h2" are treated alike.
  db_type=$(awk -F'=' '
    /^[[:space:]]*\[/ { in_db = ($0 ~ /^\[database\]/); next }
    in_db {
      name = $1; gsub(/[[:space:]]/, "", name)
      if (name == "type") {
        value = $2; gsub(/[[:space:]]/, "", value)
        print tolower(value)
        exit
      }
    }
  ' "$GERRIT_HOME/etc/gerrit.config" 2>/dev/null || true)
  db_type=${db_type:-unknown}
  print_status "INFO" "Database type: $db_type"

  # Database-specific checks
  case "$db_type" in
    postgresql | postgres)
      if command -v psql > /dev/null 2>&1; then
        print_status "INFO" "PostgreSQL tools available"
        # Note: Actual DB queries would require credentials
        print_status "INFO" "Database connection testing requires manual verification"
      else
        print_status "WARN" "PostgreSQL tools not available for testing"
      fi
      ;;
    h2)
      print_status "INFO" "Using H2 embedded database"
      if [[ -f "$GERRIT_HOME/db/ReviewDB.h2.db" ]]; then
        db_size=$(du -sh "$GERRIT_HOME/db/ReviewDB.h2.db" 2>/dev/null | cut -f1 || true)
        print_status "INFO" "H2 database size: ${db_size:-unknown}"
      fi
      ;;
    mysql | mariadb)
      if command -v mysql > /dev/null 2>&1; then
        print_status "INFO" "MySQL tools available"
      else
        print_status "WARN" "MySQL tools not available for testing"
      fi
      ;;
    *)
      # Make it visible that nothing was checked instead of staying silent.
      print_status "INFO" "No database-specific checks for type: $db_type"
      ;;
  esac
fi
# 10. Summary and Recommendations
print_section "Summary and Recommendations"
echo -e "${BLUE}Key Findings:${NC}"

# Analyze the most critical issues
critical_issues=0
warnings=0
high_load=0

#######################################
# Succeed when $1 is a plain non-negative integer strictly greater than $2.
# Empty or non-numeric input fails the test instead of raising an arithmetic
# error; 10# forces base 10 so values with leading zeros are not read as octal.
# Arguments: $1 - value to test (may be empty), $2 - integer threshold
#######################################
summary_value_exceeds() {
  [[ "${1:-}" =~ ^[0-9]+$ ]] && (( 10#$1 > $2 ))
}

# Counters are bumped with x=$((x + 1)) rather than ((x++)): the latter returns
# a non-zero status when the old value is 0, which under `set -e` would abort
# the script on the very first finding.

# Check for thread pool issues
if summary_value_exceeds "${thread_rejections:-}" 0; then
  echo -e "${RED}CRITICAL:${NC} Thread pool exhaustion detected ($thread_rejections rejections)"
  echo " → Recommendation: Increase httpd.maxThreads and maxQueued in gerrit.config"
  critical_issues=$((critical_issues + 1))
fi

# Check memory issues
if summary_value_exceeds "${swap_used:-}" 0; then
  echo -e "${YELLOW}WARNING:${NC} System is using swap memory"
  echo " → Recommendation: Increase system RAM or reduce Gerrit heap size"
  warnings=$((warnings + 1))
fi

# Check disk space
if summary_value_exceeds "${disk_usage:-}" 85; then
  echo -e "${RED}CRITICAL:${NC} Low disk space (${disk_usage}%)"
  echo " → Recommendation: Clean up logs and temporary files"
  critical_issues=$((critical_issues + 1))
fi

# Check system load (integer part of the 1-minute load average)
load_value=${load:-}
load_int_value=${load_value%%.*}
if summary_value_exceeds "$load_int_value" 4; then
  echo -e "${RED}CRITICAL:${NC} High system load ($load_value)"
  echo " → Recommendation: Investigate CPU-intensive processes"
  critical_issues=$((critical_issues + 1))
  high_load=1
fi

# Check HTTP connectivity issues. The exit status of curl is tested directly;
# capturing its output together with "$?" would never equal "0" on success
# because the response headers are part of the captured text.
if ! timeout 5 curl -s -I "http://localhost:${GERRIT_PORT}/" > /dev/null 2>&1; then
  echo -e "${RED}CRITICAL:${NC} Gerrit HTTP service not responding on localhost:$GERRIT_PORT"
  echo " → Recommendation: Check if Gerrit is fully started and not rejecting connections"
  critical_issues=$((critical_issues + 1))
fi

# Check thread configuration
if ! grep -q "maxThreads" "$GERRIT_HOME/etc/gerrit.config" 2>/dev/null; then
  echo -e "${YELLOW}WARNING:${NC} Using default thread pool size (25), may be insufficient"
  echo " → Recommendation: Add explicit thread configuration to handle load"
  warnings=$((warnings + 1))
fi

# Service status
if ! systemctl is-active --quiet "$GERRIT_SERVICE"; then
  echo -e "${RED}CRITICAL:${NC} Gerrit service is not running"
  echo " → Recommendation: Check service logs and restart Gerrit"
  critical_issues=$((critical_issues + 1))
fi

echo ""
echo -e "${BLUE}Recommended Actions:${NC}"
if (( critical_issues == 0 && warnings == 0 )); then
  echo -e "${GREEN}No critical issues detected.${NC}"
  echo "Monitor logs for intermittent issues and consider thread pool tuning for optimization."
else
  # Build the list from what was actually found: load-related advice appears
  # only when high load was measured, and it quotes the measured value.
  recommended_actions=()
  if (( high_load )); then
    recommended_actions+=("${RED}URGENT:${NC} Address high system load (${load_value}) - a likely cause of the 502s")
  fi
  recommended_actions+=("Configure thread pool settings in gerrit.config:\n [httpd]\n maxThreads = 50\n maxQueued = 500\n minThreads = 10")
  if (( high_load )); then
    recommended_actions+=("Investigate what processes are consuming CPU (use 'top' or 'htop')")
    recommended_actions+=("Consider restarting Gerrit service if load remains high")
  fi
  recommended_actions+=("Monitor system resources during peak usage")

  action_number=0
  for action_text in "${recommended_actions[@]}"; do
    action_number=$((action_number + 1))
    # -e so the colour codes and embedded \n sequences are rendered.
    echo -e "${action_number}. ${action_text}"
  done
fi

echo ""
echo -e "${BLUE}Next Steps:${NC}"
echo "1. Review the thread pool configuration in $GERRIT_HOME/etc/gerrit.config"
echo "2. Monitor $GERRIT_HOME/logs/error_log for real-time issues"
echo "3. Consider implementing proper monitoring (Prometheus + Grafana)"
echo "4. Set up log rotation if not already configured"
echo ""
echo -e "${BLUE}=== Diagnostic Complete ===${NC}"
echo "Issues found: ${critical_issues} critical, ${warnings} warnings"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment