Created
July 4, 2025 13:41
-
-
Save atyronesmith/bc348e843fe4b74083e2d2483c4d0a90 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# OpenShift PVC/PV Diagnostic and Auto-Fix Script
# Version: 1.0
# Description: Comprehensive script to diagnose and fix PVC/PV issues in OpenShift
set -euo pipefail

# Configuration.
# True constants are marked readonly; DRY_RUN and FORCE_FIX stay mutable
# because parse_args may overwrite them from the command line.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
LOG_FILE="${SCRIPT_DIR}/pvc_debug_$(date +%Y%m%d_%H%M%S).log"
readonly LOG_FILE
DRY_RUN=${DRY_RUN:-false}
BACKUP_DIR="${SCRIPT_DIR}/backups_$(date +%Y%m%d_%H%M%S)"
readonly BACKUP_DIR
readonly TIMEOUT=300
readonly RETRY_COUNT=3    # NOTE(review): not referenced in the visible code - confirm intended use
FORCE_FIX=${FORCE_FIX:-false}

# Colors for terminal output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Global state populated during diagnosis and consumed by the fix functions
NAMESPACE=""
PVC_NAME=""
PV_NAME=""
ISSUE_TYPE=""
STORAGE_CLASS=""
ISSUES_FOUND=()
FIXES_APPLIED=()
# Log a message at a given level to stdout (colorized) and append it to $LOG_FILE.
# Globals:   LOG_FILE (read), RED/GREEN/YELLOW/BLUE/NC (read)
# Arguments: $1 - level: INFO|WARN|ERROR|DEBUG (anything else falls through
#                 to a plain, uncolored line instead of being dropped)
#            $@ - message words
log() {
    local level=$1
    shift
    local message="$*"
    local timestamp
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')  # split from 'local' so a failure is not masked (SC2155)
    case $level in
        "INFO") echo -e "${GREEN}[INFO]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
        "WARN") echo -e "${YELLOW}[WARN]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
        "ERROR") echo -e "${RED}[ERROR]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
        "DEBUG") echo -e "${BLUE}[DEBUG]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
        *) echo -e "[${level}] ${timestamp} - $message" | tee -a "$LOG_FILE" ;;  # BUGFIX: unknown levels were silently discarded
    esac
}
# Print the help/usage text to stdout.
# The heredoc delimiter is unquoted on purpose so $0 expands to the path
# the script was invoked with.
usage() {
    cat << EOF
Usage: $0 [OPTIONS]

OpenShift PVC/PV Diagnostic and Auto-Fix Script

OPTIONS:
    -n, --namespace NAMESPACE    Target namespace (default: current namespace)
    -p, --pvc PVC_NAME           Specific PVC to debug
    -v, --pv PV_NAME             Specific PV to debug
    -d, --dry-run                Show what would be done without making changes
    -f, --force                  Force fixes without confirmation
    -h, --help                   Show this help message

ENVIRONMENT VARIABLES:
    DRY_RUN=true                 Enable dry-run mode
    FORCE_FIX=true               Force fixes without confirmation

Examples:
    $0 -n myapp -p data-pvc      # Debug specific PVC in namespace
    $0 -n myapp                  # Debug all PVCs in namespace
    $0 --dry-run                 # Show diagnostics without fixing
    DRY_RUN=true $0 -n myapp     # Dry run via environment variable
EOF
}
# Parse command line arguments into the NAMESPACE/PVC_NAME/PV_NAME/DRY_RUN/
# FORCE_FIX globals.
# BUGFIX: options that take a value now verify the value is present. The
# original assigned an empty "$2" and then 'shift 2' failed (leaving the
# positional parameters untouched), which made e.g. 'script -n' loop forever.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case $1 in
            -n|--namespace)
                [[ $# -ge 2 ]] || { log "ERROR" "Option $1 requires an argument"; usage; exit 1; }
                NAMESPACE="$2"
                shift 2
                ;;
            -p|--pvc)
                [[ $# -ge 2 ]] || { log "ERROR" "Option $1 requires an argument"; usage; exit 1; }
                PVC_NAME="$2"
                shift 2
                ;;
            -v|--pv)
                [[ $# -ge 2 ]] || { log "ERROR" "Option $1 requires an argument"; usage; exit 1; }
                PV_NAME="$2"
                shift 2
                ;;
            -d|--dry-run)
                DRY_RUN=true
                shift
                ;;
            -f|--force)
                FORCE_FIX=true
                shift
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                log "ERROR" "Unknown option: $1"
                usage
                exit 1
                ;;
        esac
    done
}
# Verify the environment before any work: oc binary present, a valid login
# session, a usable target namespace, and (outside dry-run) a backup dir.
# Exits the script with status 1 on any missing prerequisite.
check_prerequisites() {
    log "INFO" "Checking prerequisites..."

    command -v oc &> /dev/null || {
        log "ERROR" "oc command not found. Please install OpenShift CLI."
        exit 1
    }

    oc whoami &> /dev/null || {
        log "ERROR" "Not logged in to OpenShift. Please run 'oc login' first."
        exit 1
    }

    # Default to the currently selected project when -n was not given.
    if [[ -z "$NAMESPACE" ]]; then
        NAMESPACE=$(oc project -q)
        log "INFO" "Using current namespace: $NAMESPACE"
    fi

    oc get namespace "$NAMESPACE" &> /dev/null || {
        log "ERROR" "Namespace '$NAMESPACE' does not exist."
        exit 1
    }

    # Backups are only written when changes may actually be made.
    if [[ "$DRY_RUN" != "true" ]]; then
        mkdir -p "$BACKUP_DIR"
        log "INFO" "Created backup directory: $BACKUP_DIR"
    fi
}
# Save a YAML snapshot of a resource into $BACKUP_DIR before modifying it.
# In dry-run mode only logs what would happen.
# Arguments: $1 - resource type, $2 - resource name, $3 - namespace (optional,
#            defaults to $NAMESPACE)
# Returns:   0 on success or dry-run, 1 when the export fails
backup_resource() {
    local kind=$1
    local name=$2
    local ns=${3:-$NAMESPACE}

    if [[ "$DRY_RUN" == "true" ]]; then
        log "INFO" "[DRY RUN] Would backup $kind/$name"
        return 0
    fi

    local dest="${BACKUP_DIR}/${kind}_${name}_$(date +%H%M%S).yaml"
    if ! oc get "$kind" "$name" -n "$ns" -o yaml > "$dest" 2>/dev/null; then
        log "WARN" "Failed to backup $kind/$name"
        return 1
    fi
    log "INFO" "Backed up $kind/$name to $dest"
    return 0
}
# Poll a pvc/pv/pod until its .status.phase equals the requested value.
# Arguments: $1 - resource type (pvc|pv|pod), $2 - name, $3 - desired phase,
#            $4 - timeout seconds (default $TIMEOUT), $5 - namespace
#            (default $NAMESPACE; ignored for pv)
# Returns:   0 once the phase matches, 1 on timeout
wait_for_condition() {
    local kind=$1
    local name=$2
    local want=$3
    local limit=${4:-$TIMEOUT}
    local ns=${5:-$NAMESPACE}

    log "INFO" "Waiting for $kind/$name to be $want (timeout: ${limit}s)"

    local elapsed=0
    local phase
    while (( elapsed < limit )); do
        phase=""
        case $kind in
            "pvc") phase=$(oc get pvc "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
            "pv")  phase=$(oc get pv "$name" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
            "pod") phase=$(oc get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
        esac

        if [[ "$phase" == "$want" ]]; then
            log "INFO" "$kind/$name is now $want"
            return 0
        fi

        # Fixed 5-second polling interval, mirrored in the elapsed counter.
        sleep 5
        elapsed=$(( elapsed + 5 ))
    done

    log "WARN" "Timeout waiting for $kind/$name to be $want"
    return 1
}
# Read a single jsonpath field from a resource, printing "" (empty) on any
# lookup failure so callers can test for emptiness instead of exit codes.
# Arguments: $1 - type (pvc|pv|pod|storageclass), $2 - name,
#            $3 - namespace (used for pvc/pod; defaults to $NAMESPACE),
#            $4 - jsonpath expression without the surrounding braces
get_resource_info() {
    local kind=$1
    local name=$2
    local ns=${3:-$NAMESPACE}
    local field=$4
    local -a query=(oc get "$kind" "$name")

    # pv and storageclass are cluster-scoped; only pvc/pod take -n.
    case $kind in
        pvc|pod)         query+=(-n "$ns") ;;
        pv|storageclass) ;;
        *)               return 0 ;;  # unknown type: print nothing, like the original fall-through
    esac

    "${query[@]}" -o jsonpath="{$field}" 2>/dev/null || echo ""
}
# Diagnose a single PVC: report its key fields, classify its phase into
# ISSUES_FOUND, and dump its recent events. Also publishes the PVC's storage
# class and bound PV name into the STORAGE_CLASS / PV_NAME globals for the
# fix functions that run afterwards.
# Globals:   STORAGE_CLASS, PV_NAME (written); ISSUES_FOUND (appended)
# Arguments: $1 - PVC name, $2 - namespace (defaults to $NAMESPACE)
# Returns:   1 if the PVC does not exist, 0 otherwise
diagnose_pvc() {
    local pvc_name=$1
    local namespace=${2:-$NAMESPACE}
    log "INFO" "Diagnosing PVC: $pvc_name in namespace: $namespace"
    # Check if PVC exists
    if ! oc get pvc "$pvc_name" -n "$namespace" &> /dev/null; then
        log "ERROR" "PVC '$pvc_name' not found in namespace '$namespace'"
        return 1
    fi
    # Pull the fields that drive the diagnosis (each is "" on lookup failure)
    local pvc_status=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.status.phase')
    local pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
    local pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
    local pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
    local bound_pv=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.volumeName')
    log "INFO" "PVC Status: $pvc_status"
    log "INFO" "PVC Capacity: $pvc_capacity"
    log "INFO" "PVC Access Mode: $pvc_access_modes"
    log "INFO" "PVC Storage Class: $pvc_storage_class"
    log "INFO" "Bound PV: $bound_pv"
    # Store values for later use by the fix functions
    STORAGE_CLASS="$pvc_storage_class"
    PV_NAME="$bound_pv"
    # Classify the phase and fan out to the phase-specific diagnosis
    case $pvc_status in
        "Pending")
            log "WARN" "PVC is in Pending state"
            ISSUES_FOUND+=("pvc_pending")
            diagnose_pending_pvc "$pvc_name" "$namespace"
            ;;
        "Bound")
            log "INFO" "PVC is Bound - checking for pod mounting issues"
            diagnose_bound_pvc "$pvc_name" "$namespace"
            ;;
        "Lost")
            log "ERROR" "PVC is in Lost state"
            ISSUES_FOUND+=("pvc_lost")
            ;;
        *)
            log "WARN" "PVC is in unexpected state: $pvc_status"
            ISSUES_FOUND+=("pvc_unknown_state")
            ;;
    esac
    # Surface the five most recent events touching this PVC (log-only; the
    # piped while loop runs in a subshell, which is fine here because it
    # does not mutate any globals)
    local events=$(oc get events -n "$namespace" --field-selector involvedObject.name="$pvc_name" --sort-by='.lastTimestamp' -o custom-columns=REASON:.reason,MESSAGE:.message --no-headers 2>/dev/null | tail -5)
    if [[ -n "$events" ]]; then
        log "INFO" "Recent events for PVC $pvc_name:"
        echo "$events" | while read -r line; do
            log "INFO" "  Event: $line"
        done
    fi
}
# Drill into why a PVC is stuck Pending: verify the storage class exists,
# then decide whether the failure is in dynamic provisioning or in static
# PV matching, appending the corresponding tag(s) to ISSUES_FOUND.
# Globals:   STORAGE_CLASS (read), ISSUES_FOUND (appended)
# Arguments: $1 - PVC name, $2 - namespace
diagnose_pending_pvc() {
    local pvc_name=$1
    local namespace=$2
    log "INFO" "Diagnosing pending PVC: $pvc_name"
    # Check if the storage class referenced by the PVC exists at all
    if [[ -n "$STORAGE_CLASS" ]]; then
        if ! oc get storageclass "$STORAGE_CLASS" &> /dev/null; then
            log "ERROR" "Storage class '$STORAGE_CLASS' not found"
            ISSUES_FOUND+=("missing_storage_class")
            return
        fi
    else
        log "WARN" "No storage class specified in PVC"
        ISSUES_FOUND+=("no_storage_class")
    fi
    # Check for available PVs
    local pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
    local pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
    log "INFO" "Looking for available PVs with capacity >= $pvc_capacity and access mode $pvc_access_modes"
    # NOTE(review): this counts ALL Available PVs - capacity and access mode
    # are logged above but not used to filter here; confirm that is intended.
    local available_pvs=$(oc get pv --field-selector=status.phase=Available -o name 2>/dev/null | wc -l)
    log "INFO" "Found $available_pvs available PVs"
    if [[ $available_pvs -eq 0 ]]; then
        # No static PVs: the claim must be served by dynamic provisioning
        if [[ -n "$STORAGE_CLASS" ]]; then
            local provisioner=$(get_resource_info "storageclass" "$STORAGE_CLASS" "" '.provisioner')
            log "INFO" "Storage class uses provisioner: $provisioner"
            if [[ -n "$provisioner" ]]; then
                ISSUES_FOUND+=("dynamic_provisioning_failed")
                check_provisioner_health "$provisioner"
            else
                ISSUES_FOUND+=("no_provisioner")
            fi
        else
            ISSUES_FOUND+=("no_available_pv")
        fi
    else
        # PVs exist but none bound: selector/capacity/access-mode mismatch
        ISSUES_FOUND+=("pv_selection_failed")
    fi
}
# Diagnose a Bound PVC by inspecting every pod that mounts it.
# BUGFIX: the original piped the pod list into 'while read', which ran the
# loop body in a subshell, so ISSUES_FOUND entries appended by
# diagnose_pod_volume_mount were silently lost. A here-string keeps the
# loop in the current shell.
# Arguments: $1 - PVC name, $2 - namespace
diagnose_bound_pvc() {
    local pvc_name=$1
    local namespace=$2

    log "INFO" "Diagnosing bound PVC: $pvc_name"

    # Pods whose volumes reference this claim (empty when oc/jq fail)
    local pods_using_pvc
    pods_using_pvc=$(oc get pods -n "$namespace" -o json | jq -r --arg pvc "$pvc_name" '.items[] | select(.spec.volumes[]?.persistentVolumeClaim?.claimName == $pvc) | .metadata.name' 2>/dev/null || echo "")

    if [[ -n "$pods_using_pvc" ]]; then
        log "INFO" "Pods using PVC $pvc_name:"
        while read -r pod; do
            [[ -n "$pod" ]] || continue
            log "INFO" "  Pod: $pod"
            diagnose_pod_volume_mount "$pod" "$namespace" "$pvc_name"
        done <<< "$pods_using_pvc"
    else
        log "INFO" "No pods currently using PVC $pvc_name"
    fi
}
# Inspect a pod that mounts the PVC: flag FailedMount/VolumeMount events on
# Pending pods and drill into the scheduled node's storage health.
# Globals:   ISSUES_FOUND (appended)
# Arguments: $1 - pod name, $2 - namespace, $3 - PVC name (reserved)
diagnose_pod_volume_mount() {
    local pod_name=$1
    local namespace=$2
    local pvc_name=$3

    local pod_status
    pod_status=$(get_resource_info "pod" "$pod_name" "$namespace" '.status.phase')
    log "INFO" "Pod $pod_name status: $pod_status"

    # Only Pending pods are interesting here; anything else mounted fine.
    [[ "$pod_status" == "Pending" ]] || return 0

    # Last three events involving this pod
    local pod_events
    pod_events=$(oc get events -n "$namespace" --field-selector involvedObject.name="$pod_name" --sort-by='.lastTimestamp' -o custom-columns=REASON:.reason,MESSAGE:.message --no-headers 2>/dev/null | tail -3) || pod_events=""

    if grep -q "FailedMount\|VolumeMount" <<< "$pod_events"; then
        log "WARN" "Pod has volume mount issues"
        ISSUES_FOUND+=("pod_mount_failed")

        local node_name
        node_name=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.nodeName')
        if [[ -n "$node_name" ]]; then
            log "INFO" "Pod scheduled on node: $node_name"
            check_node_storage_health "$node_name"
        fi
    fi
    return 0
}
# Dispatch a provisioner-specific health check based on its name.
# Globals:   ISSUES_FOUND (appended for unknown provisioners)
# Arguments: $1 - provisioner name (e.g. rook-ceph.rbd.csi.ceph.com)
check_provisioner_health() {
    local provisioner=$1
    log "INFO" "Checking provisioner health: $provisioner"

    # Order matters: a name containing "csi" is handled as CSI even if it
    # also mentions ceph/rook (preserves the original case precedence).
    if [[ "$provisioner" == *csi* ]]; then
        check_csi_driver_health "$provisioner"
    elif [[ "$provisioner" == *rook* || "$provisioner" == *ceph* ]]; then
        check_ceph_health
    elif [[ "$provisioner" == *nfs* ]]; then
        check_nfs_health
    else
        log "WARN" "Unknown provisioner type: $provisioner"
        ISSUES_FOUND+=("unknown_provisioner")
    fi
}
# Check CSI driver pod health cluster-wide.
# BUGFIXES vs original:
#  - the 'oc ... | while read' loop ran in a subshell, so ISSUES_FOUND
#    entries appended inside it were lost; process substitution keeps the
#    loop in the current shell.
#  - the not-Running listing lacked --no-headers, so the column header line
#    was mistaken for an unhealthy pod.
# Globals:   ISSUES_FOUND (appended)
# Arguments: $1 - provisioner name (informational only)
check_csi_driver_health() {
    local provisioner=$1
    log "INFO" "Checking CSI driver health for: $provisioner"

    # Count CSI driver pods across all namespaces (0 when oc fails)
    local csi_pods
    csi_pods=$(oc get pods -A -l app.kubernetes.io/name=csi-driver -o name 2>/dev/null | wc -l) || csi_pods=0

    if [[ $csi_pods -eq 0 ]]; then
        log "WARN" "No CSI driver pods found"
        ISSUES_FOUND+=("csi_driver_missing")
    else
        log "INFO" "Found $csi_pods CSI driver pods"
    fi

    # Flag every CSI driver pod that is not in the Running phase
    while read -r line; do
        if [[ "$line" != "No resources found." ]] && [[ -n "$line" ]]; then
            log "WARN" "CSI driver pod not running: $line"
            ISSUES_FOUND+=("csi_driver_unhealthy")
        fi
    done < <(oc get pods -A -l app.kubernetes.io/name=csi-driver --field-selector=status.phase!=Running --no-headers 2>/dev/null)
}
# Check Ceph/ODF cluster health via the CephCluster CR in openshift-storage.
# Records ocs_missing / ceph_missing / ceph_unhealthy in ISSUES_FOUND.
check_ceph_health() {
    log "INFO" "Checking Ceph cluster health"

    # Without the openshift-storage namespace there is no OCS/ODF install.
    if ! oc get namespace openshift-storage &> /dev/null; then
        log "WARN" "OpenShift Storage namespace not found"
        ISSUES_FOUND+=("ocs_missing")
        return 0
    fi

    local cluster
    cluster=$(oc get cephcluster -n openshift-storage -o name 2>/dev/null | head -1) || cluster=""
    if [[ -z "$cluster" ]]; then
        log "WARN" "Ceph cluster not found"
        ISSUES_FOUND+=("ceph_missing")
        return 0
    fi

    local health
    health=$(oc get "$cluster" -n openshift-storage -o jsonpath='{.status.ceph.health}' 2>/dev/null || echo "Unknown")
    log "INFO" "Ceph cluster health: $health"
    if [[ "$health" != "HEALTH_OK" ]]; then
        ISSUES_FOUND+=("ceph_unhealthy")
    fi
    return 0
}
# Placeholder NFS health check. NFS setups are environment-specific, so this
# only records that a manual verification is required.
check_nfs_health() {
    log "INFO" "Checking NFS health"
    # A real check would need knowledge of the specific NFS deployment.
    log "INFO" "NFS health check not implemented - manual verification needed"
    ISSUES_FOUND+=("nfs_check_needed")
}
# Check a node for storage-related pressure conditions.
# Globals:   ISSUES_FOUND (appended when DiskPressure is active)
# Arguments: $1 - node name
check_node_storage_health() {
    local node_name=$1
    log "INFO" "Checking storage health on node: $node_name"

    # Condition types currently True on the node; "" when the lookup fails.
    local active_conditions
    active_conditions=$(oc get node "$node_name" -o jsonpath='{.status.conditions[?(@.status=="True")].type}' 2>/dev/null || echo "")

    if grep -q "DiskPressure" <<< "$active_conditions"; then
        log "WARN" "Node $node_name has disk pressure"
        ISSUES_FOUND+=("node_disk_pressure")
    fi

    # NOTE(review): kubelet log inspection would need node debug access.
    log "INFO" "Node storage check completed for $node_name"
}
# Diagnose a PersistentVolume: report its key spec fields and record an issue
# tag for Released/Failed/unknown phases.
# Globals:   ISSUES_FOUND (appended)
# Arguments: $1 - PV name
# Returns:   1 if the PV does not exist, 0 otherwise
diagnose_pv() {
    local pv_name=$1

    log "INFO" "Diagnosing PV: $pv_name"

    if ! oc get pv "$pv_name" &> /dev/null; then
        log "ERROR" "PV '$pv_name' not found"
        return 1
    fi

    # Gather the fields that matter for triage ("" on lookup failure)
    local pv_status pv_capacity pv_access_modes pv_reclaim_policy pv_claim_ref
    pv_status=$(get_resource_info "pv" "$pv_name" "" '.status.phase')
    pv_capacity=$(get_resource_info "pv" "$pv_name" "" '.spec.capacity.storage')
    pv_access_modes=$(get_resource_info "pv" "$pv_name" "" '.spec.accessModes[0]')
    pv_reclaim_policy=$(get_resource_info "pv" "$pv_name" "" '.spec.persistentVolumeReclaimPolicy')
    pv_claim_ref=$(get_resource_info "pv" "$pv_name" "" '.spec.claimRef.name')

    log "INFO" "PV Status: $pv_status"
    log "INFO" "PV Capacity: $pv_capacity"
    log "INFO" "PV Access Mode: $pv_access_modes"
    log "INFO" "PV Reclaim Policy: $pv_reclaim_policy"
    log "INFO" "PV Claim Reference: $pv_claim_ref"

    case $pv_status in
        "Released")
            log "WARN" "PV is in Released state"
            ISSUES_FOUND+=("pv_released")
            ;;
        "Failed")
            log "ERROR" "PV is in Failed state"
            ISSUES_FOUND+=("pv_failed")
            ;;
        "Available")
            log "INFO" "PV is Available"
            ;;
        "Bound")
            log "INFO" "PV is Bound"
            ;;
        *)
            log "WARN" "PV is in unexpected state: $pv_status"
            ISSUES_FOUND+=("pv_unknown_state")
            ;;
    esac
}
# Apply fix strategies for a Pending PVC based on the issues diagnosed so far.
# Globals:   ISSUES_FOUND (read)
# Arguments: $1 - PVC name, $2 - namespace
fix_pvc_pending() {
    local pvc_name=$1
    local namespace=$2

    log "INFO" "Attempting to fix pending PVC: $pvc_name"

    # Space-padded join so each tag can be matched as a whole word.
    local issues=" ${ISSUES_FOUND[*]} "

    # Strategy 1: point the PVC at a valid storage class
    if [[ "$issues" == *" missing_storage_class "* ]]; then
        fix_missing_storage_class "$pvc_name" "$namespace"
    fi

    # Strategy 2: recover from failed dynamic provisioning
    if [[ "$issues" == *" dynamic_provisioning_failed "* ]]; then
        fix_dynamic_provisioning "$pvc_name" "$namespace"
    fi

    # Strategy 3: resolve PV selection mismatches
    if [[ "$issues" == *" pv_selection_failed "* ]]; then
        fix_pv_selection "$pvc_name" "$namespace"
    fi

    # Strategy 4: no PV exists at all - create one or relax the claim
    if [[ "$issues" == *" no_available_pv "* ]]; then
        fix_no_available_pv "$pvc_name" "$namespace"
    fi
}
# Repair a PVC whose storage class is missing: prefer the cluster default
# storage class, otherwise fall back to the first available one. Each patch
# is gated on confirm_action (defined elsewhere in this file - presumably a
# prompt honoring FORCE_FIX; verify) and preceded by a backup.
# Globals:   FIXES_APPLIED (appended), DRY_RUN (read)
# Arguments: $1 - PVC name, $2 - namespace
# Returns:   0 once the PVC binds, 1 otherwise
fix_missing_storage_class() {
    local pvc_name=$1
    local namespace=$2
    log "INFO" "Fixing missing storage class for PVC: $pvc_name"
    # The annotation marks the cluster default storage class; dots in the
    # annotation key must be escaped inside the jsonpath filter.
    local default_sc=$(oc get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}' 2>/dev/null || echo "")
    if [[ -n "$default_sc" ]]; then
        log "INFO" "Found default storage class: $default_sc"
        if confirm_action "Update PVC $pvc_name to use default storage class $default_sc"; then
            backup_resource "pvc" "$pvc_name" "$namespace"
            if [[ "$DRY_RUN" == "true" ]]; then
                log "INFO" "[DRY RUN] Would update PVC storage class to: $default_sc"
            else
                # NOTE(review): spec.storageClassName is immutable on most
                # clusters once set; this patch may be rejected - confirm.
                if oc patch pvc "$pvc_name" -n "$namespace" -p "{\"spec\":{\"storageClassName\":\"$default_sc\"}}" 2>/dev/null; then
                    log "INFO" "Updated PVC storage class to: $default_sc"
                    FIXES_APPLIED+=("updated_storage_class")
                    # Wait for PVC to be bound
                    if wait_for_condition "pvc" "$pvc_name" "Bound" 120 "$namespace"; then
                        log "INFO" "PVC is now bound"
                        return 0
                    else
                        log "WARN" "PVC still not bound after storage class update"
                    fi
                else
                    log "ERROR" "Failed to update PVC storage class"
                fi
            fi
        fi
    else
        log "WARN" "No default storage class found"
        # List up to three available storage classes as candidates
        local available_sc=$(oc get storageclass -o name 2>/dev/null | head -3)
        if [[ -n "$available_sc" ]]; then
            log "INFO" "Available storage classes:"
            echo "$available_sc" | while read -r sc; do
                log "INFO" "  $sc"
            done
            # Fall back to the first candidate ("storageclass/<name>" -> "<name>")
            local first_sc=$(echo "$available_sc" | head -1 | cut -d'/' -f2)
            if confirm_action "Update PVC $pvc_name to use storage class $first_sc"; then
                backup_resource "pvc" "$pvc_name" "$namespace"
                if [[ "$DRY_RUN" == "true" ]]; then
                    log "INFO" "[DRY RUN] Would update PVC storage class to: $first_sc"
                else
                    if oc patch pvc "$pvc_name" -n "$namespace" -p "{\"spec\":{\"storageClassName\":\"$first_sc\"}}" 2>/dev/null; then
                        log "INFO" "Updated PVC storage class to: $first_sc"
                        FIXES_APPLIED+=("updated_storage_class")
                        if wait_for_condition "pvc" "$pvc_name" "Bound" 120 "$namespace"; then
                            log "INFO" "PVC is now bound"
                            return 0
                        fi
                    else
                        log "ERROR" "Failed to update PVC storage class"
                    fi
                fi
            fi
        fi
    fi
    return 1
}
# Recover from failed dynamic provisioning: restart unhealthy provisioner
# components, then optionally recreate the PVC to re-trigger provisioning.
# Globals:   ISSUES_FOUND (read)
# Arguments: $1 - PVC name, $2 - namespace
fix_dynamic_provisioning() {
    local pvc_name=$1
    local namespace=$2

    log "INFO" "Fixing dynamic provisioning issues for PVC: $pvc_name"

    local issues=" ${ISSUES_FOUND[*]} "

    # Strategy 1: restart unhealthy CSI driver pods
    if [[ "$issues" == *" csi_driver_unhealthy "* ]]; then
        fix_csi_driver_issues
    fi

    # Strategy 2: nudge the Ceph operator when the cluster is unhealthy
    if [[ "$issues" == *" ceph_unhealthy "* ]]; then
        fix_ceph_issues
    fi

    # Strategy 3: recreate the PVC so the provisioner starts from scratch
    if confirm_action "Delete and recreate PVC $pvc_name to trigger fresh provisioning"; then
        recreate_pvc "$pvc_name" "$namespace"
    fi
}
# Restart CSI driver pods that are not Running so the provisioner recovers.
# BUGFIX: the original listed pod names without namespaces and then ran
# 'oc get pod <name> -A' to discover the namespace - but a resource name
# cannot be combined with --all-namespaces, so the lookup always failed.
# The namespace/name pairs are now fetched in a single query.
# Globals: FIXES_APPLIED (appended), DRY_RUN (read)
fix_csi_driver_issues() {
    log "INFO" "Fixing CSI driver issues"

    # One "namespace/name" entry per non-running CSI driver pod.
    local entries
    entries=$(oc get pods -A -l app.kubernetes.io/name=csi-driver --field-selector=status.phase!=Running -o jsonpath='{range .items[*]}{.metadata.namespace}/{.metadata.name}{"\n"}{end}' 2>/dev/null) || entries=""

    if [[ -z "$entries" ]]; then
        log "INFO" "No unhealthy CSI driver pods found"
        return 0
    fi

    local entry pod pod_namespace
    while IFS= read -r entry; do
        [[ -n "$entry" ]] || continue
        pod_namespace=${entry%%/*}
        pod=${entry#*/}
        log "INFO" "Restarting CSI driver pod: $pod in namespace: $pod_namespace"
        if [[ "$DRY_RUN" == "true" ]]; then
            log "INFO" "[DRY RUN] Would delete pod: $pod"
            continue
        fi
        if oc delete pod "$pod" -n "$pod_namespace" 2>/dev/null; then
            log "INFO" "Deleted CSI driver pod: $pod"
            FIXES_APPLIED+=("restarted_csi_driver")
            # Give the owning controller a moment to recreate the pod.
            sleep 10
            # NOTE(review): a Deployment/DaemonSet replacement pod usually
            # gets a NEW name, so waiting on the old name may time out even
            # when the restart succeeded (behavior kept from the original).
            if wait_for_condition "pod" "$pod" "Running" 120 "$pod_namespace"; then
                log "INFO" "CSI driver pod is now running"
            else
                log "WARN" "CSI driver pod not running after restart"
            fi
        else
            log "ERROR" "Failed to delete CSI driver pod: $pod"
        fi
    done <<< "$entries"
}
# Attempt a basic Ceph recovery by bouncing non-running rook-ceph-operator
# pods. Real Ceph problems need specialized handling beyond this script.
# Globals: FIXES_APPLIED (appended), DRY_RUN (read)
fix_ceph_issues() {
    log "INFO" "Fixing Ceph issues"

    if ! oc get namespace openshift-storage &> /dev/null; then
        log "WARN" "OpenShift Storage not available - cannot fix Ceph issues"
        return 0
    fi

    # Operator pods stuck outside the Running phase, in "pod/<name>" form.
    local operator_pods
    operator_pods=$(oc get pods -n openshift-storage -l app=rook-ceph-operator --field-selector=status.phase!=Running -o name 2>/dev/null) || operator_pods=""

    if [[ -z "$operator_pods" ]]; then
        log "INFO" "Ceph operator pods are running"
        return 0
    fi

    local pod
    for pod in $operator_pods; do
        log "INFO" "Restarting Ceph operator pod: $pod"
        if [[ "$DRY_RUN" == "true" ]]; then
            log "INFO" "[DRY RUN] Would delete pod: $pod"
            continue
        fi
        # "$pod" is "pod/<name>", which oc delete accepts directly.
        if oc delete "$pod" -n openshift-storage 2>/dev/null; then
            log "INFO" "Deleted Ceph operator pod: $pod"
            FIXES_APPLIED+=("restarted_ceph_operator")
            sleep 15
        else
            log "ERROR" "Failed to delete Ceph operator pod: $pod"
        fi
    done
}
# Try to resolve a claim/volume selection mismatch: find Available PVs that
# satisfy the PVC's capacity, access mode and storage class, and either bind
# the first match manually or create a new PV.
# Arguments: $1 - PVC name, $2 - namespace
fix_pv_selection() {
    local pvc_name=$1
    local namespace=$2
    log "INFO" "Fixing PV selection issues for PVC: $pvc_name"
    # Get PVC requirements ("" on lookup failure)
    local pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
    local pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
    local pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
    # Filter Available PVs by capacity/access-mode/storage-class with jq.
    # NOTE(review): 'tonumber' will fail on unit-suffixed Kubernetes
    # quantities such as "10Gi", in which case jq errors out and this list
    # is always empty - verify against real PV capacities.
    local compatible_pvs=$(oc get pv --field-selector=status.phase=Available -o json | jq -r --arg capacity "$pvc_capacity" --arg access_mode "$pvc_access_modes" --arg storage_class "$pvc_storage_class" '
        .items[] |
        select(
            (.spec.capacity.storage | tonumber) >= ($capacity | tonumber) and
            (.spec.accessModes | contains([$access_mode])) and
            (if $storage_class != "" then .spec.storageClassName == $storage_class else true end)
        ) |
        .metadata.name' 2>/dev/null)
    if [[ -n "$compatible_pvs" ]]; then
        log "INFO" "Found compatible PVs:"
        echo "$compatible_pvs" | while read -r pv; do
            log "INFO" "  PV: $pv"
        done
        # Try to manually bind to the first compatible PV
        local first_pv=$(echo "$compatible_pvs" | head -1)
        if confirm_action "Manually bind PVC $pvc_name to PV $first_pv"; then
            manual_bind_pvc_to_pv "$pvc_name" "$namespace" "$first_pv"
        fi
    else
        log "WARN" "No compatible PVs found"
        # Fall back to creating a PV that matches the claim
        if confirm_action "Create a new PV with matching requirements"; then
            create_manual_pv "$pvc_name" "$namespace"
        fi
    fi
}
# Manually pre-bind a PVC to a PV by cross-referencing spec.volumeName on
# the claim and spec.claimRef on the volume.
# BUGFIX: the embedded 'oc get pvc' that fetched the claim UID used unquoted
# $pvc_name/$namespace (word-splitting/globbing hazard); it is now quoted
# and hoisted into its own variable.
# Globals:   FIXES_APPLIED (appended), DRY_RUN (read)
# Arguments: $1 - PVC name, $2 - namespace, $3 - PV name
# Returns:   0 once the PVC binds, 1 on any failure
manual_bind_pvc_to_pv() {
    local pvc_name=$1
    local namespace=$2
    local pv_name=$3

    log "INFO" "Manually binding PVC $pvc_name to PV $pv_name"

    backup_resource "pvc" "$pvc_name" "$namespace"
    backup_resource "pv" "$pv_name"

    if [[ "$DRY_RUN" == "true" ]]; then
        log "INFO" "[DRY RUN] Would bind PVC $pvc_name to PV $pv_name"
        return 0
    fi

    # Step 1: point the claim at the volume
    if ! oc patch pvc "$pvc_name" -n "$namespace" -p "{\"spec\":{\"volumeName\":\"$pv_name\"}}" 2>/dev/null; then
        log "ERROR" "Failed to add PV reference to PVC"
        return 1
    fi
    log "INFO" "Added PV reference to PVC"

    # Step 2: point the volume's claimRef back at the claim. The UID must
    # match the live PVC or the binder will reject the pairing.
    local pvc_uid
    pvc_uid=$(oc get pvc "$pvc_name" -n "$namespace" -o jsonpath='{.metadata.uid}' 2>/dev/null || echo "")
    if ! oc patch pv "$pv_name" -p "{\"spec\":{\"claimRef\":{\"name\":\"$pvc_name\",\"namespace\":\"$namespace\",\"uid\":\"$pvc_uid\"}}}"; then
        log "ERROR" "Failed to add PVC reference to PV"
        return 1
    fi
    log "INFO" "Added PVC reference to PV"
    FIXES_APPLIED+=("manual_bind_pvc_pv")

    if wait_for_condition "pvc" "$pvc_name" "Bound" 60 "$namespace"; then
        log "INFO" "PVC is now bound to PV"
        return 0
    fi
    log "WARN" "PVC still not bound after manual binding"
    return 1
}
# Create a hostPath PV sized to match the PVC's requirements and attempt to
# bind the claim to it. Explicitly flagged as unsuitable for production -
# hostPath volumes are node-local.
# Globals:   FIXES_APPLIED (appended), DRY_RUN (read)
# Arguments: $1 - PVC name, $2 - namespace
create_manual_pv() {
    local pvc_name=$1
    local namespace=$2
    log "INFO" "Creating manual PV for PVC: $pvc_name"
    # Mirror the claim's capacity, access mode and storage class
    local pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
    local pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
    local pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
    # Unix-timestamp suffix keeps generated names unique per run
    local pv_name="manual-pv-$(date +%s)"
    if [[ "$DRY_RUN" == "true" ]]; then
        log "INFO" "[DRY RUN] Would create PV: $pv_name"
        return 0
    fi
    # Create hostPath PV (for testing - not recommended for production)
    if confirm_action "Create hostPath PV $pv_name (WARNING: Not suitable for production)"; then
        # NOTE(review): under 'set -euo pipefail' a failing 'oc apply' ends
        # this pipeline non-zero and exits the script before the $? check
        # below ever runs - the error branch may be unreachable; verify.
        cat << EOF | oc apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: $pv_name
  labels:
    type: hostPath
spec:
  capacity:
    storage: $pvc_capacity
  accessModes:
    - $pvc_access_modes
  persistentVolumeReclaimPolicy: Retain
  storageClassName: $pvc_storage_class
  hostPath:
    path: /tmp/pv-data/$pv_name
    type: DirectoryOrCreate
EOF
        if [[ $? -eq 0 ]]; then
            log "INFO" "Created manual PV: $pv_name"
            FIXES_APPLIED+=("created_manual_pv")
            # Wait for PV to be available
            if wait_for_condition "pv" "$pv_name" "Available" 30; then
                log "INFO" "PV is now available"
                # Try to bind PVC to new PV
                manual_bind_pvc_to_pv "$pvc_name" "$namespace" "$pv_name"
            else
                log "WARN" "PV not available after creation"
            fi
        else
            log "ERROR" "Failed to create manual PV"
        fi
    fi
}
# Handle the case where no PV exists for the claim: create one manually,
# then optionally relax the claim's requirements.
# Arguments: $1 - PVC name, $2 - namespace
fix_no_available_pv() {
    local pvc_name=$1
    local namespace=$2

    log "INFO" "Fixing no available PV issue for PVC: $pvc_name"

    # Strategy 1: provision a PV by hand
    create_manual_pv "$pvc_name" "$namespace"

    # Strategy 2: loosen the claim so an existing PV might satisfy it
    if confirm_action "Reduce PVC storage requirements to find available PV"; then
        modify_pvc_requirements "$pvc_name" "$namespace"
    fi
}
# Relax a PVC's requirements so it can bind an existing PV. Currently the
# only relaxation implemented is downgrading ReadWriteMany to ReadWriteOnce.
# Globals:   FIXES_APPLIED (appended), DRY_RUN (read)
# Arguments: $1 - PVC name, $2 - namespace
# Returns:   0 when the PVC binds after the change, falls through otherwise
modify_pvc_requirements() {
    local pvc_name=$1
    local namespace=$2
    log "INFO" "Modifying PVC requirements for: $pvc_name"
    # Current requirements ("" on lookup failure)
    local current_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
    local current_access_mode=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
    # Show up to five Available PVs as candidates (informational)
    local available_pvs=$(oc get pv --field-selector=status.phase=Available -o custom-columns=NAME:.metadata.name,CAPACITY:.spec.capacity.storage,ACCESS:.spec.accessModes[0] --no-headers 2>/dev/null | head -5)
    if [[ -n "$available_pvs" ]]; then
        log "INFO" "Available PVs:"
        echo "$available_pvs" | while read -r line; do
            log "INFO" "  $line"
        done
        # RWX claims are hard to satisfy; try RWO, which most PVs support
        if [[ "$current_access_mode" == "ReadWriteMany" ]]; then
            if confirm_action "Change PVC access mode from ReadWriteMany to ReadWriteOnce"; then
                backup_resource "pvc" "$pvc_name" "$namespace"
                if [[ "$DRY_RUN" == "true" ]]; then
                    log "INFO" "[DRY RUN] Would change PVC access mode to ReadWriteOnce"
                else
                    # NOTE(review): spec.accessModes is immutable on most
                    # clusters after creation; this patch may be rejected.
                    if oc patch pvc "$pvc_name" -n "$namespace" -p '{"spec":{"accessModes":["ReadWriteOnce"]}}' 2>/dev/null; then
                        log "INFO" "Changed PVC access mode to ReadWriteOnce"
                        FIXES_APPLIED+=("changed_access_mode")
                        if wait_for_condition "pvc" "$pvc_name" "Bound" 60 "$namespace"; then
                            log "INFO" "PVC is now bound"
                            return 0
                        fi
                    else
                        log "ERROR" "Failed to change PVC access mode"
                    fi
                fi
            fi
        fi
    fi
}
# Delete and recreate a PVC to re-trigger dynamic provisioning.
# ROBUSTNESS FIX: verify yq is available BEFORE deleting the PVC. The
# original only hit the yq dependency after the delete, so a missing yq
# left the claim gone with only the backup file as recovery.
# Globals:   FIXES_APPLIED (appended), DRY_RUN (read)
# Arguments: $1 - PVC name, $2 - namespace
# Returns:   0 once the recreated PVC binds, 1 on any failure
recreate_pvc() {
    local pvc_name=$1
    local namespace=$2

    log "INFO" "Recreating PVC: $pvc_name"

    backup_resource "pvc" "$pvc_name" "$namespace"

    if [[ "$DRY_RUN" == "true" ]]; then
        log "INFO" "[DRY RUN] Would recreate PVC: $pvc_name"
        return 0
    fi

    # Fail fast while the PVC still exists.
    if ! command -v yq &> /dev/null; then
        log "ERROR" "yq is required to recreate PVCs but was not found - aborting before deletion"
        return 1
    fi

    local pvc_def
    pvc_def=$(oc get pvc "$pvc_name" -n "$namespace" -o yaml 2>/dev/null) || pvc_def=""
    if [[ -z "$pvc_def" ]]; then
        log "ERROR" "Could not get PVC definition for recreation"
        return 1
    fi

    if ! oc delete pvc "$pvc_name" -n "$namespace" --timeout=60s 2>/dev/null; then
        log "ERROR" "Failed to delete PVC for recreation"
        return 1
    fi
    log "INFO" "Deleted PVC: $pvc_name"

    # Give the API server a moment to finish finalizer processing.
    sleep 5

    # Strip server-populated metadata/status so the object can be re-created.
    if ! echo "$pvc_def" | yq eval 'del(.metadata.uid, .metadata.resourceVersion, .metadata.creationTimestamp, .metadata.selfLink, .status)' - | oc apply -f -; then
        log "ERROR" "Failed to recreate PVC"
        return 1
    fi
    log "INFO" "Recreated PVC: $pvc_name"
    FIXES_APPLIED+=("recreated_pvc")

    if wait_for_condition "pvc" "$pvc_name" "Bound" 120 "$namespace"; then
        log "INFO" "Recreated PVC is now bound"
        return 0
    fi
    log "WARN" "Recreated PVC still not bound"
    return 1
}
# Fix PV released issues
#######################################
# Try to return a Released PV to service.
# Strategy 1: drop the stale claimRef so the PV becomes Available again.
# Strategy 2: switch the reclaim policy to Retain to protect the data.
# Globals:   DRY_RUN (read), FIXES_APPLIED (appended)
# Arguments: $1 - PV name
# Outputs:   log messages; backup of the PV via backup_resource
# Returns:   0 always (fix failures are logged, not propagated, so an
#            unchecked call does not trip 'set -e' in the caller)
#######################################
fix_pv_released() {
    local pv_name=$1
    log "INFO" "Fixing released PV: $pv_name"
    # Split decl/assign so the lookup status is not masked (SC2155); the
    # || guard keeps 'set -e' from aborting on a failed lookup.
    local reclaim_policy
    reclaim_policy=$(get_resource_info "pv" "$pv_name" "" '.spec.persistentVolumeReclaimPolicy') || reclaim_policy=""
    log "INFO" "PV reclaim policy: $reclaim_policy"
    # Strategy 1: remove the claim reference to make the PV available
    if confirm_action "Remove claim reference from PV $pv_name to make it available"; then
        backup_resource "pv" "$pv_name"
        if [[ "$DRY_RUN" == "true" ]]; then
            log "INFO" "[DRY RUN] Would remove claim reference from PV"
        else
            if oc patch pv "$pv_name" --type json -p '[{"op": "remove", "path": "/spec/claimRef"}]' 2>/dev/null; then
                log "INFO" "Removed claim reference from PV"
                FIXES_APPLIED+=("removed_claim_ref")
                # Wait for the PV to report Available
                if wait_for_condition "pv" "$pv_name" "Available" 30; then
                    log "INFO" "PV is now available"
                    return 0
                else
                    log "WARN" "PV still not available after removing claim reference"
                fi
            else
                log "ERROR" "Failed to remove claim reference from PV"
            fi
        fi
    fi
    # Strategy 2: change the reclaim policy to Retain (skip if already Retain)
    if [[ "$reclaim_policy" != "Retain" ]]; then
        if confirm_action "Change PV reclaim policy to Retain"; then
            backup_resource "pv" "$pv_name"
            if [[ "$DRY_RUN" == "true" ]]; then
                log "INFO" "[DRY RUN] Would change PV reclaim policy to Retain"
            else
                if oc patch pv "$pv_name" -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' 2>/dev/null; then
                    log "INFO" "Changed PV reclaim policy to Retain"
                    FIXES_APPLIED+=("changed_reclaim_policy")
                else
                    log "ERROR" "Failed to change PV reclaim policy"
                fi
            fi
        fi
    fi
    # Explicit success: the original fell through with the status of the last
    # command, which could abort the script under 'set -e' at the call site.
    return 0
}
# Fix pod mount issues
#######################################
# Attempt to resolve volume-mount failures for a pod.
# Strategy 1: cordon the pod's node if diagnosis flagged disk pressure.
# Strategy 2: delete the pod so it is recreated and the mount is retried.
# Strategy 3: flag SELinux context for manual review (not automated).
# Globals:   DRY_RUN, ISSUES_FOUND (read); FIXES_APPLIED (appended)
# Arguments: $1 - pod name
#            $2 - namespace
#            $3 - PVC name (currently unused; kept for interface stability)
# Returns:   0 always (individual failures are logged, not propagated)
#######################################
fix_pod_mount_issues() {
    local pod_name=$1
    local namespace=$2
    local pvc_name=$3
    log "INFO" "Fixing pod mount issues for pod: $pod_name"
    # Strategy 1: check node storage. Decl/assign split (SC2155) with a
    # guard so a failed lookup does not abort under 'set -e'.
    local node_name
    node_name=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.nodeName') || node_name=""
    if [[ -n "$node_name" ]]; then
        log "INFO" "Pod is on node: $node_name"
        # Check if diagnosis recorded disk pressure on a node
        if [[ " ${ISSUES_FOUND[*]} " =~ " node_disk_pressure " ]]; then
            if confirm_action "Cordon node $node_name due to disk pressure"; then
                if [[ "$DRY_RUN" == "true" ]]; then
                    log "INFO" "[DRY RUN] Would cordon node: $node_name"
                else
                    if oc cordon "$node_name" 2>/dev/null; then
                        log "INFO" "Cordoned node: $node_name"
                        FIXES_APPLIED+=("cordoned_node")
                    else
                        log "ERROR" "Failed to cordon node: $node_name"
                    fi
                fi
            fi
        fi
    fi
    # Strategy 2: restart pod to retry the mount
    if confirm_action "Restart pod $pod_name to retry mount"; then
        if [[ "$DRY_RUN" == "true" ]]; then
            log "INFO" "[DRY RUN] Would restart pod: $pod_name"
        else
            if oc delete pod "$pod_name" -n "$namespace" --timeout=60s 2>/dev/null; then
                log "INFO" "Restarted pod: $pod_name"
                FIXES_APPLIED+=("restarted_pod")
                # Give the controller time to recreate the pod before polling
                sleep 10
                if wait_for_condition "pod" "$pod_name" "Running" 180 "$namespace"; then
                    log "INFO" "Pod is now running"
                    return 0
                else
                    log "WARN" "Pod still not running after restart"
                fi
            else
                log "ERROR" "Failed to restart pod"
            fi
        fi
    fi
    # Strategy 3: check SELinux context (manual follow-up only)
    local pod_security_context
    pod_security_context=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.securityContext') || pod_security_context=""
    if [[ -n "$pod_security_context" ]]; then
        log "INFO" "Pod has security context - checking SELinux"
        # NOTE(review): real SELinux remediation is intentionally not automated
        log "INFO" "SELinux context check not implemented - manual verification needed"
    fi
    return 0
}
# Confirm action
#######################################
# Interactively confirm a proposed action with the operator.
# Globals:   FORCE_FIX (read) - auto-approve without prompting when "true"
#            DRY_RUN  (read) - auto-approve when "true" (nothing is changed anyway)
# Arguments: $1 - human-readable description of the action
# Outputs:   prompt on stdout
# Returns:   0 if approved, 1 otherwise (EOF or anything but y/yes declines)
#######################################
confirm_action() {
    local action=$1
    # Non-interactive modes approve automatically
    if [[ "${FORCE_FIX:-false}" == "true" ]]; then
        return 0
    fi
    if [[ "${DRY_RUN:-false}" == "true" ]]; then
        return 0
    fi
    echo -n "Do you want to $action? [y/N]: "
    # Guard the read: on EOF it returns non-zero, which would abort the whole
    # script under 'set -e' when this is ever called outside an 'if'. Treat
    # EOF as "no". -r keeps backslashes literal.
    local response
    if ! read -r response; then
        return 1
    fi
    # Quote the selector so the answer is never word-split or globbed
    case "$response" in
        [yY][eE][sS]|[yY])
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}
# Main diagnosis function
#######################################
# Diagnose the requested PVC or PV, or every PVC in the target namespace.
# Globals:   PVC_NAME, PV_NAME, NAMESPACE (read)
#            ISSUES_FOUND (appended by the diagnose_* helpers; read for summary)
# Outputs:   log messages summarizing any issues found
#######################################
run_diagnosis() {
    log "INFO" "Starting PVC/PV diagnosis"
    if [[ -n "$PVC_NAME" ]]; then
        # A specific PVC was requested
        diagnose_pvc "$PVC_NAME" "$NAMESPACE"
    elif [[ -n "$PV_NAME" ]]; then
        # A specific PV was requested
        diagnose_pv "$PV_NAME"
    else
        # Diagnose every PVC in the namespace
        log "INFO" "Diagnosing all PVCs in namespace: $NAMESPACE"
        # Decl/assign split (SC2155); the || guard preserves the original
        # best-effort behavior under 'set -e -o pipefail'.
        local pvcs
        pvcs=$(oc get pvc -n "$NAMESPACE" -o name 2>/dev/null | cut -d'/' -f2) || pvcs=""
        if [[ -n "$pvcs" ]]; then
            # Feed the loop from a here-string instead of a pipe: a piped
            # 'while' runs in a subshell, so ISSUES_FOUND entries appended by
            # diagnose_pvc were silently discarded (the original bug).
            local pvc
            while IFS= read -r pvc; do
                diagnose_pvc "$pvc" "$NAMESPACE"
            done <<< "$pvcs"
        else
            log "INFO" "No PVCs found in namespace: $NAMESPACE"
        fi
    fi
    # Show summary of issues found
    if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
        log "INFO" "Issues found:"
        local issue
        for issue in "${ISSUES_FOUND[@]}"; do
            log "INFO" "  - $issue"
        done
    else
        log "INFO" "No issues found"
    fi
}
# Main fix function
#######################################
# Apply automatic fixes for the issues recorded by run_diagnosis.
# Globals:   ISSUES_FOUND (read), FIXES_APPLIED (appended by fix helpers,
#            read for summary), PVC_NAME, PV_NAME, NAMESPACE (read)
# Outputs:   log messages summarizing the fixes applied
# Returns:   0
#######################################
run_fixes() {
    if [[ ${#ISSUES_FOUND[@]} -eq 0 ]]; then
        log "INFO" "No issues to fix"
        return 0
    fi
    log "INFO" "Starting automatic fixes"
    # Fix PVC issues
    if [[ -n "$PVC_NAME" ]]; then
        if [[ " ${ISSUES_FOUND[*]} " =~ " pvc_pending " ]]; then
            fix_pvc_pending "$PVC_NAME" "$NAMESPACE"
        fi
        if [[ " ${ISSUES_FOUND[*]} " =~ " pod_mount_failed " ]]; then
            # Find pods that mount this PVC. Decl/assign split (SC2155) with
            # a guard so a failed query stays best-effort under 'set -e'.
            local pods_using_pvc
            pods_using_pvc=$(oc get pods -n "$NAMESPACE" -o json | jq -r --arg pvc "$PVC_NAME" '.items[] | select(.spec.volumes[]?.persistentVolumeClaim?.claimName == $pvc) | .metadata.name' 2>/dev/null) || pods_using_pvc=""
            if [[ -n "$pods_using_pvc" ]]; then
                # Here-string instead of a pipe: a piped 'while' runs in a
                # subshell and discarded FIXES_APPLIED updates made by
                # fix_pod_mount_issues (the original bug).
                local pod
                while IFS= read -r pod; do
                    fix_pod_mount_issues "$pod" "$NAMESPACE" "$PVC_NAME"
                done <<< "$pods_using_pvc"
            fi
        fi
    fi
    # Fix PV issues
    if [[ -n "$PV_NAME" ]]; then
        if [[ " ${ISSUES_FOUND[*]} " =~ " pv_released " ]]; then
            fix_pv_released "$PV_NAME"
        fi
    fi
    # Show summary of fixes applied
    if [[ ${#FIXES_APPLIED[@]} -gt 0 ]]; then
        log "INFO" "Fixes applied:"
        local fix
        for fix in "${FIXES_APPLIED[@]}"; do
            log "INFO" "  - $fix"
        done
    else
        log "INFO" "No fixes were applied"
    fi
}
# Generate report
#######################################
# Write a human-readable summary report of the diagnostic run.
# Globals:   SCRIPT_DIR, NAMESPACE, PVC_NAME, PV_NAME, LOG_FILE, BACKUP_DIR,
#            ISSUES_FOUND, FIXES_APPLIED (all read)
# Outputs:   report file under SCRIPT_DIR; path logged at the end
#######################################
generate_report() {
    # Decl/assign split so the date substitution status is not masked (SC2155)
    local report_file
    report_file="${SCRIPT_DIR}/pvc_debug_report_$(date +%Y%m%d_%H%M%S).txt"
    # Build the whole report inside one grouped redirection instead of
    # reopening the file for every append.
    {
        cat << EOF
OpenShift PVC/PV Diagnostic Report
Generated: $(date)
Namespace: $NAMESPACE
PVC: $PVC_NAME
PV: $PV_NAME
=== Issues Found ===
EOF
        if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
            printf -- '- %s\n' "${ISSUES_FOUND[@]}"
        else
            echo "No issues found"
        fi
        echo "=== Fixes Applied ==="
        if [[ ${#FIXES_APPLIED[@]} -gt 0 ]]; then
            printf -- '- %s\n' "${FIXES_APPLIED[@]}"
        else
            echo "No fixes applied"
        fi
        cat << EOF
=== Log File ===
$LOG_FILE
=== Backup Directory ===
$BACKUP_DIR
=== Recommendations ===
EOF
        # Issue-specific recommendations
        if [[ " ${ISSUES_FOUND[*]} " =~ " ceph_unhealthy " ]]; then
            echo "- Monitor Ceph cluster health regularly"
            echo "- Consider increasing Ceph cluster resources"
        fi
        if [[ " ${ISSUES_FOUND[*]} " =~ " node_disk_pressure " ]]; then
            echo "- Monitor node disk usage"
            echo "- Consider adding storage capacity to nodes"
        fi
        if [[ " ${ISSUES_FOUND[*]} " =~ " dynamic_provisioning_failed " ]]; then
            echo "- Verify storage class configuration"
            echo "- Check provisioner pod health"
        fi
        echo "- Review backup files before deleting: $BACKUP_DIR"
        echo "- Monitor fixed resources for stability"
    } > "$report_file"
    log "INFO" "Report generated: $report_file"
}
# Main function
#######################################
# Entry point: parse arguments, verify prerequisites, run the diagnosis,
# apply fixes (unless in dry-run mode), and write the report.
# Exits 0 when no issues were detected, 1 otherwise.
#######################################
main() {
    parse_args "$@"
    log "INFO" "Starting OpenShift PVC/PV diagnostic script"
    log "INFO" "Log file: $LOG_FILE"
    if [[ "$DRY_RUN" == "true" ]]; then
        log "INFO" "Running in DRY RUN mode - no changes will be made"
    fi
    check_prerequisites
    # Diagnose first; fixes are applied only outside dry-run mode
    run_diagnosis
    if [[ "$DRY_RUN" != "true" ]]; then
        run_fixes
    fi
    # Always produce the report, even when nothing was fixed
    generate_report
    log "INFO" "Script completed"
    # Exit code mirrors whether any issue was detected
    if [[ ${#ISSUES_FOUND[@]} -eq 0 ]]; then
        log "INFO" "No issues found - all resources are healthy"
        exit 0
    fi
    log "WARN" "Issues were found - check the report for details"
    exit 1
}
# Run main only when the script is executed directly; sourcing the file
# just loads the functions without side effects.
[[ "${BASH_SOURCE[0]}" != "${0}" ]] || main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment