Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save atyronesmith/bc348e843fe4b74083e2d2483c4d0a90 to your computer and use it in GitHub Desktop.

Select an option

Save atyronesmith/bc348e843fe4b74083e2d2483c4d0a90 to your computer and use it in GitHub Desktop.
#!/bin/bash
# OpenShift PVC/PV Diagnostic and Auto-Fix Script
# Version: 1.0
# Description: Comprehensive script to diagnose and fix PVC/PV issues in OpenShift
# Usage: run with -h for options. DRY_RUN / FORCE_FIX may be preset in the
# environment before invocation (see usage()).
set -euo pipefail
# Configuration
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Log and backup locations are timestamped per run so reruns never collide.
LOG_FILE="${SCRIPT_DIR}/pvc_debug_$(date +%Y%m%d_%H%M%S).log"
DRY_RUN=${DRY_RUN:-false}
BACKUP_DIR="${SCRIPT_DIR}/backups_$(date +%Y%m%d_%H%M%S)"
# Seconds wait_for_condition polls before giving up (overridable per call).
TIMEOUT=300
# NOTE(review): RETRY_COUNT appears unused in this file - confirm before removing.
RETRY_COUNT=3
FORCE_FIX=${FORCE_FIX:-false}
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Global variables
# Populated by parse_args and the diagnose_* functions; consumed by the
# fix_* functions and generate_report.
NAMESPACE=""
PVC_NAME=""
PV_NAME=""
ISSUE_TYPE=""
STORAGE_CLASS=""
# Issue markers appended during diagnosis drive run_fixes and the report.
ISSUES_FOUND=()
FIXES_APPLIED=()
# Logging function
# Log a timestamped, colorized message to stdout and append it to LOG_FILE.
# Arguments: $1 - level (INFO|WARN|ERROR|DEBUG); remaining args - message text.
# Globals:   reads RED/GREEN/YELLOW/BLUE/NC and LOG_FILE.
log() {
  local level=$1
  shift
  local message="$*"
  # Declare and assign separately so a failing date(1) is not masked (SC2155).
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  case $level in
    "INFO") echo -e "${GREEN}[INFO]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
    "WARN") echo -e "${YELLOW}[WARN]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
    "ERROR") echo -e "${RED}[ERROR]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
    "DEBUG") echo -e "${BLUE}[DEBUG]${NC} ${timestamp} - $message" | tee -a "$LOG_FILE" ;;
  esac
}
# Usage function
# Print the command-line help text to stdout.
usage() {
# Unquoted EOF: $0 expands so the examples show the invoked script path.
cat << EOF
Usage: $0 [OPTIONS]
OpenShift PVC/PV Diagnostic and Auto-Fix Script
OPTIONS:
-n, --namespace NAMESPACE Target namespace (default: current namespace)
-p, --pvc PVC_NAME Specific PVC to debug
-v, --pv PV_NAME Specific PV to debug
-d, --dry-run Show what would be done without making changes
-f, --force Force fixes without confirmation
-h, --help Show this help message
ENVIRONMENT VARIABLES:
DRY_RUN=true Enable dry-run mode
FORCE_FIX=true Force fixes without confirmation
Examples:
$0 -n myapp -p data-pvc # Debug specific PVC in namespace
$0 -n myapp # Debug all PVCs in namespace
$0 --dry-run # Show diagnostics without fixing
DRY_RUN=true $0 -n myapp # Dry run via environment variable
EOF
}
# Parse command line arguments
# Parse command-line options into the NAMESPACE / PVC_NAME / PV_NAME /
# DRY_RUN / FORCE_FIX globals.
# Exits 1 (via usage) on unknown options or on a value-taking option with
# no value; under `set -u` the original would instead die with an opaque
# "unbound variable" error when the value was missing.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case $1 in
      -n|--namespace|-p|--pvc|-v|--pv)
        # These all require a value argument - fail with a clear message.
        if [[ $# -lt 2 ]]; then
          log "ERROR" "Option $1 requires an argument"
          usage
          exit 1
        fi
        case $1 in
          -n|--namespace) NAMESPACE="$2" ;;
          -p|--pvc) PVC_NAME="$2" ;;
          -v|--pv) PV_NAME="$2" ;;
        esac
        shift 2
        ;;
      -d|--dry-run)
        DRY_RUN=true
        shift
        ;;
      -f|--force)
        FORCE_FIX=true
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        log "ERROR" "Unknown option: $1"
        usage
        exit 1
        ;;
    esac
  done
}
# Check prerequisites
# Verify the environment before doing anything: oc present, logged in,
# namespace resolved and existing, backup directory ready.
# Exits 1 on any unmet prerequisite.
check_prerequisites() {
  log "INFO" "Checking prerequisites..."
  # OpenShift CLI must be on PATH.
  command -v oc &> /dev/null || {
    log "ERROR" "oc command not found. Please install OpenShift CLI."
    exit 1
  }
  # Must have an active login session.
  oc whoami &> /dev/null || {
    log "ERROR" "Not logged in to OpenShift. Please run 'oc login' first."
    exit 1
  }
  # Fall back to the current project when no namespace was given.
  if [[ -z "$NAMESPACE" ]]; then
    NAMESPACE=$(oc project -q)
    log "INFO" "Using current namespace: $NAMESPACE"
  fi
  # The target namespace has to exist.
  oc get namespace "$NAMESPACE" &> /dev/null || {
    log "ERROR" "Namespace '$NAMESPACE' does not exist."
    exit 1
  }
  # Backups are only taken when we may actually change things.
  if [[ "$DRY_RUN" != "true" ]]; then
    mkdir -p "$BACKUP_DIR"
    log "INFO" "Created backup directory: $BACKUP_DIR"
  fi
}
# Backup resource
# Save a resource's YAML into BACKUP_DIR before mutating it.
# Arguments: $1 - resource type; $2 - name; $3 - namespace (default $NAMESPACE).
# Returns: 0 on success or in dry-run mode, 1 when the export fails.
backup_resource() {
  local kind=$1
  local name=$2
  local ns=${3:-$NAMESPACE}
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "[DRY RUN] Would backup $kind/$name"
    return 0
  fi
  # Per-file timestamp avoids clobbering earlier backups of the same object.
  local backup_file="${BACKUP_DIR}/${kind}_${name}_$(date +%H%M%S).yaml"
  if ! oc get "$kind" "$name" -n "$ns" -o yaml > "$backup_file" 2>/dev/null; then
    log "WARN" "Failed to backup $kind/$name"
    return 1
  fi
  log "INFO" "Backed up $kind/$name to $backup_file"
  return 0
}
# Wait for resource condition
# Poll a pvc/pv/pod every 5 seconds until its .status.phase matches the
# desired value or the timeout elapses.
# Arguments: $1 type, $2 name, $3 target phase, $4 timeout seconds
#            (default $TIMEOUT), $5 namespace (default $NAMESPACE).
# Returns: 0 once matched, 1 on timeout.
wait_for_condition() {
  local kind=$1
  local name=$2
  local want=$3
  local timeout=${4:-$TIMEOUT}
  local ns=${5:-$NAMESPACE}
  log "INFO" "Waiting for $kind/$name to be $want (timeout: ${timeout}s)"
  local elapsed=0
  local phase
  while (( elapsed < timeout )); do
    phase=""
    # Missing resources report "NotFound" so the loop keeps polling.
    case $kind in
      "pvc") phase=$(oc get pvc "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
      "pv") phase=$(oc get pv "$name" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
      "pod") phase=$(oc get pod "$name" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || echo "NotFound") ;;
    esac
    if [[ "$phase" == "$want" ]]; then
      log "INFO" "$kind/$name is now $want"
      return 0
    fi
    sleep 5
    elapsed=$(( elapsed + 5 ))
  done
  log "WARN" "Timeout waiting for $kind/$name to be $want"
  return 1
}
# Get resource info
# Fetch a single jsonpath field from a pvc/pv/pod/storageclass.
# Arguments: $1 type, $2 name, $3 namespace (default $NAMESPACE; ignored for
#            cluster-scoped types), $4 jsonpath expression without braces.
# Outputs: the field value, or an empty string when the lookup fails.
get_resource_info() {
  local kind=$1
  local name=$2
  local ns=${3:-$NAMESPACE}
  local field=$4
  case $kind in
    "pvc")
      oc get pvc "$name" -n "$ns" -o jsonpath="{$field}" 2>/dev/null || echo ""
      ;;
    "pv")
      oc get pv "$name" -o jsonpath="{$field}" 2>/dev/null || echo ""
      ;;
    "pod")
      oc get pod "$name" -n "$ns" -o jsonpath="{$field}" 2>/dev/null || echo ""
      ;;
    "storageclass")
      oc get storageclass "$name" -o jsonpath="{$field}" 2>/dev/null || echo ""
      ;;
  esac
}
# Diagnose PVC issues
# Diagnose one PVC: log its key fields, classify its phase into
# ISSUES_FOUND markers, and stash STORAGE_CLASS / PV_NAME for the fixers.
# Arguments: $1 - PVC name; $2 - namespace (defaults to $NAMESPACE).
# Returns:   1 if the PVC does not exist, 0 otherwise.
diagnose_pvc() {
local pvc_name=$1
local namespace=${2:-$NAMESPACE}
log "INFO" "Diagnosing PVC: $pvc_name in namespace: $namespace"
# Check if PVC exists
if ! oc get pvc "$pvc_name" -n "$namespace" &> /dev/null; then
log "ERROR" "PVC '$pvc_name' not found in namespace '$namespace'"
return 1
fi
# Get PVC status
local pvc_status=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.status.phase')
local pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
local pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
local pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
local bound_pv=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.volumeName')
log "INFO" "PVC Status: $pvc_status"
log "INFO" "PVC Capacity: $pvc_capacity"
log "INFO" "PVC Access Mode: $pvc_access_modes"
log "INFO" "PVC Storage Class: $pvc_storage_class"
log "INFO" "Bound PV: $bound_pv"
# Store values for later use
# (globals consumed by diagnose_pending_pvc and the fix_* functions)
STORAGE_CLASS="$pvc_storage_class"
PV_NAME="$bound_pv"
# Check PVC status and identify issues
case $pvc_status in
"Pending")
log "WARN" "PVC is in Pending state"
ISSUES_FOUND+=("pvc_pending")
diagnose_pending_pvc "$pvc_name" "$namespace"
;;
"Bound")
log "INFO" "PVC is Bound - checking for pod mounting issues"
diagnose_bound_pvc "$pvc_name" "$namespace"
;;
"Lost")
log "ERROR" "PVC is in Lost state"
ISSUES_FOUND+=("pvc_lost")
;;
*)
log "WARN" "PVC is in unexpected state: $pvc_status"
ISSUES_FOUND+=("pvc_unknown_state")
;;
esac
# Check for events
# Show the five most recent events attached to this PVC.
local events=$(oc get events -n "$namespace" --field-selector involvedObject.name="$pvc_name" --sort-by='.lastTimestamp' -o custom-columns=REASON:.reason,MESSAGE:.message --no-headers 2>/dev/null | tail -5)
if [[ -n "$events" ]]; then
log "INFO" "Recent events for PVC $pvc_name:"
# Piped while runs in a subshell - harmless here, the body only logs.
echo "$events" | while read -r line; do
log "INFO" " Event: $line"
done
fi
}
# Diagnose pending PVC
# Work out why a PVC is stuck Pending: missing/empty storage class, no
# available PVs, or broken dynamic provisioning.  Appends issue markers to
# ISSUES_FOUND for run_fixes to act on.
diagnose_pending_pvc() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Diagnosing pending PVC: $pvc_name"
  # Verify the referenced storage class actually exists.
  if [[ -n "$STORAGE_CLASS" ]]; then
    if ! oc get storageclass "$STORAGE_CLASS" &> /dev/null; then
      log "ERROR" "Storage class '$STORAGE_CLASS' not found"
      ISSUES_FOUND+=("missing_storage_class")
      return
    fi
  else
    log "WARN" "No storage class specified in PVC"
    ISSUES_FOUND+=("no_storage_class")
  fi
  local pvc_capacity pvc_access_modes
  pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
  pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
  log "INFO" "Looking for available PVs with capacity >= $pvc_capacity and access mode $pvc_access_modes"
  local available_pvs
  available_pvs=$(oc get pv --field-selector=status.phase=Available -o name 2>/dev/null | wc -l) || true
  log "INFO" "Found $available_pvs available PVs"
  if (( available_pvs > 0 )); then
    # PVs exist but none was picked - a matching problem, not a supply one.
    ISSUES_FOUND+=("pv_selection_failed")
    return
  fi
  # No PVs at all: without a storage class there is nothing to provision.
  if [[ -z "$STORAGE_CLASS" ]]; then
    ISSUES_FOUND+=("no_available_pv")
    return
  fi
  local provisioner
  provisioner=$(get_resource_info "storageclass" "$STORAGE_CLASS" "" '.provisioner')
  log "INFO" "Storage class uses provisioner: $provisioner"
  if [[ -n "$provisioner" ]]; then
    ISSUES_FOUND+=("dynamic_provisioning_failed")
    check_provisioner_health "$provisioner"
  else
    ISSUES_FOUND+=("no_provisioner")
  fi
}
# Diagnose bound PVC
# Diagnose a Bound PVC by inspecting each pod that mounts it.
# Arguments: $1 - PVC name; $2 - namespace.
diagnose_bound_pvc() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Diagnosing bound PVC: $pvc_name"
  # Find pods that reference this PVC in .spec.volumes.
  local pods_using_pvc
  pods_using_pvc=$(oc get pods -n "$namespace" -o json | jq -r --arg pvc "$pvc_name" '.items[] | select(.spec.volumes[]?.persistentVolumeClaim?.claimName == $pvc) | .metadata.name' 2>/dev/null || echo "")
  if [[ -n "$pods_using_pvc" ]]; then
    log "INFO" "Pods using PVC $pvc_name:"
    # Here-string instead of `echo | while`: a piped loop runs in a
    # subshell, so the ISSUES_FOUND entries appended by
    # diagnose_pod_volume_mount were silently discarded.
    while read -r pod; do
      log "INFO" " Pod: $pod"
      diagnose_pod_volume_mount "$pod" "$namespace" "$pvc_name"
    done <<< "$pods_using_pvc"
  else
    log "INFO" "No pods currently using PVC $pvc_name"
  fi
}
# Diagnose pod volume mount issues
# Inspect a pod that mounts the PVC for FailedMount/VolumeMount events and,
# when present, check the hosting node's storage health.
# Arguments: $1 - pod name; $2 - namespace; $3 - PVC name.
diagnose_pod_volume_mount() {
  local pod_name=$1
  local namespace=$2
  local pvc_name=$3
  local pod_status
  pod_status=$(get_resource_info "pod" "$pod_name" "$namespace" '.status.phase')
  log "INFO" "Pod $pod_name status: $pod_status"
  # Only Pending pods are interesting here - anything else mounted fine.
  [[ "$pod_status" == "Pending" ]] || return 0
  local pod_events
  pod_events=$(oc get events -n "$namespace" --field-selector involvedObject.name="$pod_name" --sort-by='.lastTimestamp' -o custom-columns=REASON:.reason,MESSAGE:.message --no-headers 2>/dev/null | tail -3) || true
  if echo "$pod_events" | grep -q "FailedMount\|VolumeMount"; then
    log "WARN" "Pod has volume mount issues"
    ISSUES_FOUND+=("pod_mount_failed")
    # Mount failures are often node-local; check where the pod landed.
    local node_name
    node_name=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.nodeName')
    if [[ -n "$node_name" ]]; then
      log "INFO" "Pod scheduled on node: $node_name"
      check_node_storage_health "$node_name"
    fi
  fi
}
# Check provisioner health
# Route to the health check matching the provisioner implementation.
# Unknown provisioners are recorded but not inspected further.
check_provisioner_health() {
  local provisioner=$1
  log "INFO" "Checking provisioner health: $provisioner"
  case $provisioner in
    *csi*) check_csi_driver_health "$provisioner" ;;
    *rook*|*ceph*) check_ceph_health ;;
    *nfs*) check_nfs_health ;;
    *)
      log "WARN" "Unknown provisioner type: $provisioner"
      ISSUES_FOUND+=("unknown_provisioner")
      ;;
  esac
}
# Check CSI driver health
# Check that CSI driver pods exist and are running; record issue markers
# when they are missing or unhealthy.
# Arguments: $1 - provisioner name (informational).
check_csi_driver_health() {
  local provisioner=$1
  log "INFO" "Checking CSI driver health for: $provisioner"
  local csi_pods
  csi_pods=$(oc get pods -A -l app.kubernetes.io/name=csi-driver -o name 2>/dev/null | wc -l) || true
  if [[ $csi_pods -eq 0 ]]; then
    log "WARN" "No CSI driver pods found"
    ISSUES_FOUND+=("csi_driver_missing")
  else
    log "INFO" "Found $csi_pods CSI driver pods"
  fi
  # Process substitution keeps the loop in the current shell: the original
  # piped into `while`, so its ISSUES_FOUND append ran in a subshell and
  # the "csi_driver_unhealthy" marker was always lost.
  while read -r line; do
    if [[ "$line" != "No resources found." ]] && [[ -n "$line" ]]; then
      log "WARN" "CSI driver pod not running: $line"
      ISSUES_FOUND+=("csi_driver_unhealthy")
    fi
  done < <(oc get pods -A -l app.kubernetes.io/name=csi-driver --field-selector=status.phase!=Running 2>/dev/null)
}
# Check Ceph health
# Check ODF/OCS Ceph health via the CephCluster CR status, recording a
# marker for each failure mode (no namespace, no cluster, unhealthy).
check_ceph_health() {
  log "INFO" "Checking Ceph cluster health"
  # ODF installs everything under openshift-storage.
  if ! oc get namespace openshift-storage &> /dev/null; then
    log "WARN" "OpenShift Storage namespace not found"
    ISSUES_FOUND+=("ocs_missing")
    return
  fi
  local ceph_cluster
  ceph_cluster=$(oc get cephcluster -n openshift-storage -o name 2>/dev/null | head -1) || true
  if [[ -z "$ceph_cluster" ]]; then
    log "WARN" "Ceph cluster not found"
    ISSUES_FOUND+=("ceph_missing")
    return
  fi
  local ceph_health
  ceph_health=$(oc get "$ceph_cluster" -n openshift-storage -o jsonpath='{.status.ceph.health}' 2>/dev/null || echo "Unknown")
  log "INFO" "Ceph cluster health: $ceph_health"
  # Anything other than HEALTH_OK (including Unknown) counts as unhealthy.
  if [[ "$ceph_health" != "HEALTH_OK" ]]; then
    ISSUES_FOUND+=("ceph_unhealthy")
  fi
}
# Check NFS health
# Placeholder: NFS health depends entirely on the specific server setup,
# so this only records a marker telling the operator to verify by hand.
check_nfs_health() {
  log "INFO" "Checking NFS health"
  log "INFO" "NFS health check not implemented - manual verification needed"
  ISSUES_FOUND+=("nfs_check_needed")
}
# Check node storage health
# Flag DiskPressure on the node a pod is scheduled to.
# Arguments: $1 - node name.
check_node_storage_health() {
  local node_name=$1
  log "INFO" "Checking storage health on node: $node_name"
  # Collect the condition types currently reporting True.
  local conditions
  conditions=$(oc get node "$node_name" -o jsonpath='{.status.conditions[?(@.status=="True")].type}' 2>/dev/null || echo "")
  if grep -q "DiskPressure" <<< "$conditions"; then
    log "WARN" "Node $node_name has disk pressure"
    ISSUES_FOUND+=("node_disk_pressure")
  fi
  # Kubelet volume-error log inspection would require `oc debug node` access.
  log "INFO" "Node storage check completed for $node_name"
}
# Diagnose PV issues
# Report a PV's status/capacity/reclaim info and record problem phases
# (Released, Failed, anything unexpected) in ISSUES_FOUND.
# Arguments: $1 - PV name.  Returns 1 when the PV does not exist.
diagnose_pv() {
  local pv_name=$1
  log "INFO" "Diagnosing PV: $pv_name"
  if ! oc get pv "$pv_name" &> /dev/null; then
    log "ERROR" "PV '$pv_name' not found"
    return 1
  fi
  local pv_status pv_capacity pv_access_modes pv_reclaim_policy pv_claim_ref
  pv_status=$(get_resource_info "pv" "$pv_name" "" '.status.phase')
  pv_capacity=$(get_resource_info "pv" "$pv_name" "" '.spec.capacity.storage')
  pv_access_modes=$(get_resource_info "pv" "$pv_name" "" '.spec.accessModes[0]')
  pv_reclaim_policy=$(get_resource_info "pv" "$pv_name" "" '.spec.persistentVolumeReclaimPolicy')
  pv_claim_ref=$(get_resource_info "pv" "$pv_name" "" '.spec.claimRef.name')
  log "INFO" "PV Status: $pv_status"
  log "INFO" "PV Capacity: $pv_capacity"
  log "INFO" "PV Access Mode: $pv_access_modes"
  log "INFO" "PV Reclaim Policy: $pv_reclaim_policy"
  log "INFO" "PV Claim Reference: $pv_claim_ref"
  # Only Released/Failed/unknown phases are actionable problems.
  case $pv_status in
    "Released")
      log "WARN" "PV is in Released state"
      ISSUES_FOUND+=("pv_released")
      ;;
    "Failed")
      log "ERROR" "PV is in Failed state"
      ISSUES_FOUND+=("pv_failed")
      ;;
    "Available")
      log "INFO" "PV is Available"
      ;;
    "Bound")
      log "INFO" "PV is Bound"
      ;;
    *)
      log "WARN" "PV is in unexpected state: $pv_status"
      ISSUES_FOUND+=("pv_unknown_state")
      ;;
  esac
}
# Fix PVC pending issues
# Apply fix strategies for a Pending PVC, one per diagnosis marker found
# in ISSUES_FOUND during run_diagnosis.
# Arguments: $1 - PVC name; $2 - namespace.
fix_pvc_pending() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Attempting to fix pending PVC: $pvc_name"
  # Pad with spaces so substring matches cannot hit partial marker names.
  local issues=" ${ISSUES_FOUND[*]} "
  if [[ "$issues" == *" missing_storage_class "* ]]; then
    fix_missing_storage_class "$pvc_name" "$namespace"
  fi
  if [[ "$issues" == *" dynamic_provisioning_failed "* ]]; then
    fix_dynamic_provisioning "$pvc_name" "$namespace"
  fi
  if [[ "$issues" == *" pv_selection_failed "* ]]; then
    fix_pv_selection "$pvc_name" "$namespace"
  fi
  if [[ "$issues" == *" no_available_pv "* ]]; then
    fix_no_available_pv "$pvc_name" "$namespace"
  fi
  return 0
}
# Fix missing storage class
# Point a PVC at a usable storage class.  Prefers the cluster default; when
# none exists, falls back to the first storage class found.
# Arguments: $1 - PVC name; $2 - namespace.
# Returns: 0 once the PVC binds, 1 otherwise.
fix_missing_storage_class() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Fixing missing storage class for PVC: $pvc_name"
  # The cluster default carries the is-default-class annotation.
  local default_sc
  default_sc=$(oc get storageclass -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}' 2>/dev/null || echo "")
  if [[ -n "$default_sc" ]]; then
    log "INFO" "Found default storage class: $default_sc"
    if _apply_storage_class_to_pvc "$pvc_name" "$namespace" "$default_sc" "default storage class $default_sc"; then
      return 0
    fi
  else
    log "WARN" "No default storage class found"
    # List up to three candidates and try the first one.
    local available_sc
    available_sc=$(oc get storageclass -o name 2>/dev/null | head -3) || true
    if [[ -n "$available_sc" ]]; then
      log "INFO" "Available storage classes:"
      while read -r sc; do
        log "INFO" " $sc"
      done <<< "$available_sc"
      local first_sc
      first_sc=$(echo "$available_sc" | head -1 | cut -d'/' -f2)
      if _apply_storage_class_to_pvc "$pvc_name" "$namespace" "$first_sc" "storage class $first_sc"; then
        return 0
      fi
    fi
  fi
  return 1
}

# Helper for fix_missing_storage_class: confirm, back up, patch the PVC's
# storageClassName, and wait for binding.  De-duplicates the two identical
# confirm/backup/patch/wait sequences the original carried.
# Arguments: $1 - PVC name; $2 - namespace; $3 - storage class to set;
#            $4 - human-readable label used in the confirmation prompt.
# Returns: 0 only if the PVC reaches Bound.
_apply_storage_class_to_pvc() {
  local pvc_name=$1
  local namespace=$2
  local sc=$3
  local label=$4
  confirm_action "Update PVC $pvc_name to use $label" || return 1
  backup_resource "pvc" "$pvc_name" "$namespace"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "[DRY RUN] Would update PVC storage class to: $sc"
    return 1
  fi
  if ! oc patch pvc "$pvc_name" -n "$namespace" -p "{\"spec\":{\"storageClassName\":\"$sc\"}}" 2>/dev/null; then
    log "ERROR" "Failed to update PVC storage class"
    return 1
  fi
  log "INFO" "Updated PVC storage class to: $sc"
  FIXES_APPLIED+=("updated_storage_class")
  if wait_for_condition "pvc" "$pvc_name" "Bound" 120 "$namespace"; then
    log "INFO" "PVC is now bound"
    return 0
  fi
  log "WARN" "PVC still not bound after storage class update"
  return 1
}
# Fix dynamic provisioning issues
# Attempt recovery when dynamic provisioning fails to produce a PV:
# bounce unhealthy CSI drivers, poke Ceph, then offer a PVC recreate.
# Arguments: $1 - PVC name; $2 - namespace.
fix_dynamic_provisioning() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Fixing dynamic provisioning issues for PVC: $pvc_name"
  local issues=" ${ISSUES_FOUND[*]} "
  # Restart non-running CSI driver pods first.
  if [[ "$issues" == *" csi_driver_unhealthy "* ]]; then
    fix_csi_driver_issues
  fi
  # Then try the Ceph operator if the cluster was reported unhealthy.
  if [[ "$issues" == *" ceph_unhealthy "* ]]; then
    fix_ceph_issues
  fi
  # Last resort: recreate the PVC to trigger a fresh provisioning cycle.
  if confirm_action "Delete and recreate PVC $pvc_name to trigger fresh provisioning"; then
    recreate_pvc "$pvc_name" "$namespace"
  fi
}
# Fix CSI driver issues
# Restart CSI driver pods that are not Running.
# The original looked up each pod's namespace with `oc get pod <name> -A`,
# which is invalid (a resource name cannot be combined with
# --all-namespaces), so the restart never happened.  Fetch the
# namespace/name pairs in a single call instead.
fix_csi_driver_issues() {
  log "INFO" "Fixing CSI driver issues"
  local failing
  failing=$(oc get pods -A -l app.kubernetes.io/name=csi-driver --field-selector=status.phase!=Running -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}' 2>/dev/null) || true
  if [[ -z "$failing" ]]; then
    log "INFO" "No unhealthy CSI driver pods found"
    return 0
  fi
  local pod_namespace pod
  # Here-string loop (not a pipe) so FIXES_APPLIED survives the loop.
  while read -r pod_namespace pod; do
    [[ -n "$pod" ]] || continue
    log "INFO" "Restarting CSI driver pod: $pod in namespace: $pod_namespace"
    if [[ "$DRY_RUN" == "true" ]]; then
      log "INFO" "[DRY RUN] Would delete pod: $pod"
      continue
    fi
    if oc delete pod "$pod" -n "$pod_namespace" 2>/dev/null; then
      log "INFO" "Deleted CSI driver pod: $pod"
      FIXES_APPLIED+=("restarted_csi_driver")
      # Give the controller a moment to recreate the pod.
      sleep 10
      # NOTE(review): DaemonSet replacements keep the pod name; Deployment
      # replacements get a new name and this wait may time out - confirm
      # how the CSI driver is deployed on the target cluster.
      if wait_for_condition "pod" "$pod" "Running" 120 "$pod_namespace"; then
        log "INFO" "CSI driver pod is now running"
      else
        log "WARN" "CSI driver pod not running after restart"
      fi
    else
      log "ERROR" "Failed to delete CSI driver pod: $pod"
    fi
  done <<< "$failing"
}
# Fix Ceph issues
# Best-effort Ceph recovery: bounce any rook-ceph-operator pods that are
# not Running.  Real Ceph failures need specialized investigation.
fix_ceph_issues() {
  log "INFO" "Fixing Ceph issues"
  if ! oc get namespace openshift-storage &> /dev/null; then
    log "WARN" "OpenShift Storage not available - cannot fix Ceph issues"
    return
  fi
  local operator_pods
  operator_pods=$(oc get pods -n openshift-storage -l app=rook-ceph-operator --field-selector=status.phase!=Running -o name 2>/dev/null) || true
  if [[ -z "$operator_pods" ]]; then
    log "INFO" "Ceph operator pods are running"
    return
  fi
  local pod
  for pod in $operator_pods; do
    log "INFO" "Restarting Ceph operator pod: $pod"
    if [[ "$DRY_RUN" == "true" ]]; then
      log "INFO" "[DRY RUN] Would delete pod: $pod"
      continue
    fi
    # $pod is already a type/name pair from `-o name`, so no kind argument.
    if oc delete "$pod" -n openshift-storage 2>/dev/null; then
      log "INFO" "Deleted Ceph operator pod: $pod"
      FIXES_APPLIED+=("restarted_ceph_operator")
      sleep 15
    else
      log "ERROR" "Failed to delete Ceph operator pod: $pod"
    fi
  done
}
# Fix PV selection issues
# Find Available PVs compatible with the PVC and offer to bind to the
# first match, or create one when nothing fits.
# Fix: Kubernetes quantities carry unit suffixes ("10Gi"), so the
# original's bare `tonumber` made jq error out and the capacity test
# never matched.  Both sides are now converted to bytes first.
# Arguments: $1 - PVC name; $2 - namespace.
fix_pv_selection() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Fixing PV selection issues for PVC: $pvc_name"
  # Gather the PVC's requirements.
  local pvc_capacity pvc_access_modes pvc_storage_class
  pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
  pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
  pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
  local compatible_pvs
  compatible_pvs=$(oc get pv --field-selector=status.phase=Available -o json | jq -r --arg capacity "$pvc_capacity" --arg access_mode "$pvc_access_modes" --arg storage_class "$pvc_storage_class" '
    def to_bytes:
      capture("^(?<n>[0-9.]+)(?<u>[A-Za-z]*)$") as $q
      | ($q.n | tonumber) * ({"": 1,
          "K": 1000, "M": 1000000, "G": 1000000000, "T": 1000000000000,
          "Ki": 1024, "Mi": 1048576, "Gi": 1073741824,
          "Ti": 1099511627776, "Pi": 1125899906842624}[$q.u] // 1);
    .items[]
    | select(
        ((.spec.capacity.storage | to_bytes) >= ($capacity | to_bytes)) and
        (.spec.accessModes | contains([$access_mode])) and
        (if $storage_class != "" then .spec.storageClassName == $storage_class else true end)
      )
    | .metadata.name' 2>/dev/null) || true
  if [[ -n "$compatible_pvs" ]]; then
    log "INFO" "Found compatible PVs:"
    while read -r pv; do
      log "INFO" " PV: $pv"
    done <<< "$compatible_pvs"
    # Offer to bind to the first match.
    local first_pv
    first_pv=$(echo "$compatible_pvs" | head -1)
    if confirm_action "Manually bind PVC $pvc_name to PV $first_pv"; then
      manual_bind_pvc_to_pv "$pvc_name" "$namespace" "$first_pv"
    fi
  else
    log "WARN" "No compatible PVs found"
    if confirm_action "Create a new PV with matching requirements"; then
      create_manual_pv "$pvc_name" "$namespace"
    fi
  fi
}
# Manual bind PVC to PV
# Bind a PVC to a specific PV by cross-patching volumeName and claimRef.
# Fixes: the nested oc call expanded $pvc_name/$namespace unquoted, and a
# failed UID lookup silently embedded an empty uid in the claimRef patch.
# Arguments: $1 - PVC name; $2 - namespace; $3 - PV name.
# Returns: 0 once the PVC reports Bound, 1 otherwise.
manual_bind_pvc_to_pv() {
  local pvc_name=$1
  local namespace=$2
  local pv_name=$3
  log "INFO" "Manually binding PVC $pvc_name to PV $pv_name"
  backup_resource "pvc" "$pvc_name" "$namespace"
  backup_resource "pv" "$pv_name"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "[DRY RUN] Would bind PVC $pvc_name to PV $pv_name"
    return 0
  fi
  # Resolve the PVC UID up front so a lookup failure is detected instead
  # of being patched into the PV as an empty string.
  local pvc_uid
  pvc_uid=$(oc get pvc "$pvc_name" -n "$namespace" -o jsonpath='{.metadata.uid}' 2>/dev/null) || true
  if [[ -z "$pvc_uid" ]]; then
    log "ERROR" "Could not determine UID of PVC $pvc_name"
    return 1
  fi
  if oc patch pvc "$pvc_name" -n "$namespace" -p "{\"spec\":{\"volumeName\":\"$pv_name\"}}" 2>/dev/null; then
    log "INFO" "Added PV reference to PVC"
    if oc patch pv "$pv_name" -p "{\"spec\":{\"claimRef\":{\"name\":\"$pvc_name\",\"namespace\":\"$namespace\",\"uid\":\"$pvc_uid\"}}}"; then
      log "INFO" "Added PVC reference to PV"
      FIXES_APPLIED+=("manual_bind_pvc_pv")
      if wait_for_condition "pvc" "$pvc_name" "Bound" 60 "$namespace"; then
        log "INFO" "PVC is now bound to PV"
        return 0
      fi
      log "WARN" "PVC still not bound after manual binding"
    else
      log "ERROR" "Failed to add PVC reference to PV"
    fi
  else
    log "ERROR" "Failed to add PV reference to PVC"
  fi
  return 1
}
# Create manual PV
# Create a hostPath PV matching a PVC's capacity/access mode/storage class
# and attempt to bind the PVC to it.  hostPath is for test clusters only;
# the confirmation prompt warns accordingly.
# Arguments: $1 - PVC name; $2 - namespace.
create_manual_pv() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Creating manual PV for PVC: $pvc_name"
  # Mirror the PVC's requirements onto the new PV.
  local pvc_capacity pvc_access_modes pvc_storage_class
  pvc_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
  pvc_access_modes=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
  pvc_storage_class=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.storageClassName')
  # Epoch-second suffix keeps generated names unique per run.
  local pv_name="manual-pv-$(date +%s)"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "[DRY RUN] Would create PV: $pv_name"
    return 0
  fi
  if confirm_action "Create hostPath PV $pv_name (WARNING: Not suitable for production)"; then
    # Test oc apply directly in the `if`: under `set -e` the original's
    # post-pipeline `$?` check was unreachable when the apply failed.
    if oc apply -f - << EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: $pv_name
  labels:
    type: hostPath
spec:
  capacity:
    storage: $pvc_capacity
  accessModes:
    - $pvc_access_modes
  persistentVolumeReclaimPolicy: Retain
  storageClassName: $pvc_storage_class
  hostPath:
    path: /tmp/pv-data/$pv_name
    type: DirectoryOrCreate
EOF
    then
      log "INFO" "Created manual PV: $pv_name"
      FIXES_APPLIED+=("created_manual_pv")
      if wait_for_condition "pv" "$pv_name" "Available" 30; then
        log "INFO" "PV is now available"
        # Hand the fresh PV straight to the waiting claim.
        manual_bind_pvc_to_pv "$pvc_name" "$namespace" "$pv_name"
      else
        log "WARN" "PV not available after creation"
      fi
    else
      log "ERROR" "Failed to create manual PV"
    fi
  fi
}
# Fix no available PV
# When no PV exists at all: create one manually, then optionally relax the
# PVC's requirements so an existing PV can match.
# Arguments: $1 - PVC name; $2 - namespace.
fix_no_available_pv() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Fixing no available PV issue for PVC: $pvc_name"
  # Strategy 1: provision a hostPath PV by hand.
  create_manual_pv "$pvc_name" "$namespace"
  # Strategy 2: loosen the claim's requirements.
  if confirm_action "Reduce PVC storage requirements to find available PV"; then
    modify_pvc_requirements "$pvc_name" "$namespace"
  fi
}
# Modify PVC requirements
# Try to relax a Pending PVC's requirements so an existing PV can match.
# Currently only handles downgrading the access mode from ReadWriteMany to
# ReadWriteOnce; capacity is listed but never changed.
# Arguments: $1 - PVC name; $2 - namespace.
# Returns:   0 if the PVC binds after the change; non-zero otherwise.
modify_pvc_requirements() {
local pvc_name=$1
local namespace=$2
log "INFO" "Modifying PVC requirements for: $pvc_name"
# Get current requirements
# NOTE(review): current_capacity is read but never used below - confirm
# whether a capacity-reduction strategy was intended here.
local current_capacity=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.resources.requests.storage')
local current_access_mode=$(get_resource_info "pvc" "$pvc_name" "$namespace" '.spec.accessModes[0]')
# Try to find smaller available PVs
# (only the first five are shown; the listing is purely informational)
local available_pvs=$(oc get pv --field-selector=status.phase=Available -o custom-columns=NAME:.metadata.name,CAPACITY:.spec.capacity.storage,ACCESS:.spec.accessModes[0] --no-headers 2>/dev/null | head -5)
if [[ -n "$available_pvs" ]]; then
log "INFO" "Available PVs:"
echo "$available_pvs" | while read -r line; do
log "INFO" " $line"
done
# Try ReadWriteOnce instead of ReadWriteMany
if [[ "$current_access_mode" == "ReadWriteMany" ]]; then
if confirm_action "Change PVC access mode from ReadWriteMany to ReadWriteOnce"; then
backup_resource "pvc" "$pvc_name" "$namespace"
if [[ "$DRY_RUN" == "true" ]]; then
log "INFO" "[DRY RUN] Would change PVC access mode to ReadWriteOnce"
else
# NOTE(review): spec.accessModes is immutable on most cluster versions,
# so this patch may be rejected by the API server - confirm on target.
if oc patch pvc "$pvc_name" -n "$namespace" -p '{"spec":{"accessModes":["ReadWriteOnce"]}}' 2>/dev/null; then
log "INFO" "Changed PVC access mode to ReadWriteOnce"
FIXES_APPLIED+=("changed_access_mode")
if wait_for_condition "pvc" "$pvc_name" "Bound" 60 "$namespace"; then
log "INFO" "PVC is now bound"
return 0
fi
else
log "ERROR" "Failed to change PVC access mode"
fi
fi
fi
fi
fi
}
# Recreate PVC
# Delete and re-apply a PVC to trigger a fresh provisioning cycle.
# Fixes: yq availability is now checked before the PVC is deleted (the
# original deleted first and only then discovered whether the yq pipeline
# worked, risking an unrecoverable lost claim), and pipeline results are
# tested directly in `if` rather than via a `$?` check that is dead code
# under `set -e`.
# Arguments: $1 - PVC name; $2 - namespace.
# Returns: 0 once the recreated PVC is Bound, 1 otherwise.
recreate_pvc() {
  local pvc_name=$1
  local namespace=$2
  log "INFO" "Recreating PVC: $pvc_name"
  backup_resource "pvc" "$pvc_name" "$namespace"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "[DRY RUN] Would recreate PVC: $pvc_name"
    return 0
  fi
  # yq strips the server-managed metadata before re-apply; refuse to
  # delete anything without it.
  if ! command -v yq &> /dev/null; then
    log "ERROR" "yq command not found - cannot safely recreate PVC"
    return 1
  fi
  local pvc_def
  pvc_def=$(oc get pvc "$pvc_name" -n "$namespace" -o yaml 2>/dev/null) || true
  if [[ -z "$pvc_def" ]]; then
    log "ERROR" "Could not get PVC definition for recreation"
    return 1
  fi
  if ! oc delete pvc "$pvc_name" -n "$namespace" --timeout=60s 2>/dev/null; then
    log "ERROR" "Failed to delete PVC for recreation"
    return 1
  fi
  log "INFO" "Deleted PVC: $pvc_name"
  # Give the API server a moment to finish the deletion.
  sleep 5
  # Drop uid/resourceVersion/creationTimestamp/selfLink/status so the
  # object can be re-created cleanly.
  if echo "$pvc_def" | yq eval 'del(.metadata.uid, .metadata.resourceVersion, .metadata.creationTimestamp, .metadata.selfLink, .status)' - | oc apply -f -; then
    log "INFO" "Recreated PVC: $pvc_name"
    FIXES_APPLIED+=("recreated_pvc")
    if wait_for_condition "pvc" "$pvc_name" "Bound" 120 "$namespace"; then
      log "INFO" "Recreated PVC is now bound"
      return 0
    fi
    log "WARN" "Recreated PVC still not bound"
  else
    log "ERROR" "Failed to recreate PVC"
  fi
  return 1
}
# Fix PV released issues
# Recover a PV stuck in Released: first clear its claimRef so it becomes
# Available again, then optionally switch the reclaim policy to Retain so
# a future release does not destroy the backing data.
# Arguments: $1 - PV name.
fix_pv_released() {
local pv_name=$1
log "INFO" "Fixing released PV: $pv_name"
# Get PV reclaim policy
local reclaim_policy=$(get_resource_info "pv" "$pv_name" "" '.spec.persistentVolumeReclaimPolicy')
log "INFO" "PV reclaim policy: $reclaim_policy"
# Strategy 1: Remove claim reference to make PV available
# NOTE(review): this makes any data left on the volume claimable by a new
# PVC - confirm that is acceptable before running without --dry-run.
if confirm_action "Remove claim reference from PV $pv_name to make it available"; then
backup_resource "pv" "$pv_name"
if [[ "$DRY_RUN" == "true" ]]; then
log "INFO" "[DRY RUN] Would remove claim reference from PV"
else
if oc patch pv "$pv_name" --type json -p '[{"op": "remove", "path": "/spec/claimRef"}]' 2>/dev/null; then
log "INFO" "Removed claim reference from PV"
FIXES_APPLIED+=("removed_claim_ref")
# Wait for PV to be available
if wait_for_condition "pv" "$pv_name" "Available" 30; then
log "INFO" "PV is now available"
return 0
else
log "WARN" "PV still not available after removing claim reference"
fi
else
log "ERROR" "Failed to remove claim reference from PV"
fi
fi
fi
# Strategy 2: Change reclaim policy to Retain
# Only reached when strategy 1 did not fully succeed (no early return).
if [[ "$reclaim_policy" != "Retain" ]]; then
if confirm_action "Change PV reclaim policy to Retain"; then
backup_resource "pv" "$pv_name"
if [[ "$DRY_RUN" == "true" ]]; then
log "INFO" "[DRY RUN] Would change PV reclaim policy to Retain"
else
if oc patch pv "$pv_name" -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' 2>/dev/null; then
log "INFO" "Changed PV reclaim policy to Retain"
FIXES_APPLIED+=("changed_reclaim_policy")
else
log "ERROR" "Failed to change PV reclaim policy"
fi
fi
fi
fi
}
# Fix pod mount issues
# Try to get a pod past FailedMount/VolumeMount problems: cordon a node
# under disk pressure, restart the pod, and flag SELinux for manual review.
# Arguments: $1 - pod name; $2 - namespace; $3 - PVC name (informational).
fix_pod_mount_issues() {
local pod_name=$1
local namespace=$2
local pvc_name=$3
log "INFO" "Fixing pod mount issues for pod: $pod_name"
# Strategy 1: Check node storage
local node_name=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.nodeName')
if [[ -n "$node_name" ]]; then
log "INFO" "Pod is on node: $node_name"
# Check if node has disk pressure
# (marker recorded earlier by check_node_storage_health)
if [[ " ${ISSUES_FOUND[*]} " =~ " node_disk_pressure " ]]; then
if confirm_action "Cordon node $node_name due to disk pressure"; then
if [[ "$DRY_RUN" == "true" ]]; then
log "INFO" "[DRY RUN] Would cordon node: $node_name"
else
if oc cordon "$node_name" 2>/dev/null; then
log "INFO" "Cordoned node: $node_name"
FIXES_APPLIED+=("cordoned_node")
else
log "ERROR" "Failed to cordon node: $node_name"
fi
fi
fi
fi
fi
# Strategy 2: Restart pod
if confirm_action "Restart pod $pod_name to retry mount"; then
if [[ "$DRY_RUN" == "true" ]]; then
log "INFO" "[DRY RUN] Would restart pod: $pod_name"
else
if oc delete pod "$pod_name" -n "$namespace" --timeout=60s 2>/dev/null; then
log "INFO" "Restarted pod: $pod_name"
FIXES_APPLIED+=("restarted_pod")
# Wait for pod to be recreated and running
# NOTE(review): pods owned by Deployments/ReplicaSets come back with a
# new name, so waiting on the old name can time out - confirm the owning
# controller type (StatefulSet pods do keep their names).
sleep 10
if wait_for_condition "pod" "$pod_name" "Running" 180 "$namespace"; then
log "INFO" "Pod is now running"
return 0
else
log "WARN" "Pod still not running after restart"
fi
else
log "ERROR" "Failed to restart pod"
fi
fi
fi
# Strategy 3: Check and fix SELinux context
local pod_security_context=$(get_resource_info "pod" "$pod_name" "$namespace" '.spec.securityContext')
if [[ -n "$pod_security_context" ]]; then
log "INFO" "Pod has security context - checking SELinux"
# This would need more sophisticated SELinux handling
log "INFO" "SELinux context check not implemented - manual verification needed"
fi
}
# Confirm action
# Ask the operator to approve an action; FORCE_FIX or DRY_RUN auto-approve.
# Arguments: $1 - description of the action.
# Returns: 0 on approval (y/yes, any case), 1 otherwise.
confirm_action() {
  local action=$1
  # Non-interactive modes approve everything.
  if [[ "$FORCE_FIX" == "true" || "$DRY_RUN" == "true" ]]; then
    return 0
  fi
  echo -n "Do you want to $action? [y/N]: "
  read -r response
  case $response in
    [yY][eE][sS]|[yY]) return 0 ;;
    *) return 1 ;;
  esac
}
# Main diagnosis function
# Diagnose the requested PVC, PV, or every PVC in the namespace, then log
# a summary of the ISSUES_FOUND markers collected.
run_diagnosis() {
  log "INFO" "Starting PVC/PV diagnosis"
  if [[ -n "$PVC_NAME" ]]; then
    diagnose_pvc "$PVC_NAME" "$NAMESPACE"
  elif [[ -n "$PV_NAME" ]]; then
    diagnose_pv "$PV_NAME"
  else
    log "INFO" "Diagnosing all PVCs in namespace: $NAMESPACE"
    local pvcs
    pvcs=$(oc get pvc -n "$NAMESPACE" -o name 2>/dev/null | cut -d'/' -f2) || true
    if [[ -n "$pvcs" ]]; then
      # Here-string instead of `echo | while`: the piped loop ran in a
      # subshell, so every ISSUES_FOUND entry recorded by diagnose_pvc was
      # discarded and both the summary and run_fixes saw nothing.
      local pvc
      while read -r pvc; do
        diagnose_pvc "$pvc" "$NAMESPACE"
      done <<< "$pvcs"
    else
      log "INFO" "No PVCs found in namespace: $NAMESPACE"
    fi
  fi
  # Summary of everything the diagnosis recorded.
  if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
    log "INFO" "Issues found:"
    local issue
    for issue in "${ISSUES_FOUND[@]}"; do
      log "INFO" " - $issue"
    done
  else
    log "INFO" "No issues found"
  fi
}
# Main fix function
# Apply fixes for the issues recorded by run_diagnosis, then log a summary
# of the FIXES_APPLIED markers.
run_fixes() {
  if [[ ${#ISSUES_FOUND[@]} -eq 0 ]]; then
    log "INFO" "No issues to fix"
    return 0
  fi
  log "INFO" "Starting automatic fixes"
  local issues=" ${ISSUES_FOUND[*]} "
  # PVC-targeted fixes.
  if [[ -n "$PVC_NAME" ]]; then
    if [[ "$issues" == *" pvc_pending "* ]]; then
      fix_pvc_pending "$PVC_NAME" "$NAMESPACE"
    fi
    if [[ "$issues" == *" pod_mount_failed "* ]]; then
      local pods_using_pvc
      pods_using_pvc=$(oc get pods -n "$NAMESPACE" -o json | jq -r --arg pvc "$PVC_NAME" '.items[] | select(.spec.volumes[]?.persistentVolumeClaim?.claimName == $pvc) | .metadata.name' 2>/dev/null) || true
      if [[ -n "$pods_using_pvc" ]]; then
        # Here-string instead of `echo | while`: the piped loop ran in a
        # subshell, so FIXES_APPLIED entries from fix_pod_mount_issues
        # never reached the summary below (or the final report).
        local pod
        while read -r pod; do
          fix_pod_mount_issues "$pod" "$NAMESPACE" "$PVC_NAME"
        done <<< "$pods_using_pvc"
      fi
    fi
  fi
  # PV-targeted fixes.
  if [[ -n "$PV_NAME" ]]; then
    if [[ "$issues" == *" pv_released "* ]]; then
      fix_pv_released "$PV_NAME"
    fi
  fi
  # Summary of everything that was actually changed.
  if [[ ${#FIXES_APPLIED[@]} -gt 0 ]]; then
    log "INFO" "Fixes applied:"
    local fix
    for fix in "${FIXES_APPLIED[@]}"; do
      log "INFO" " - $fix"
    done
  else
    log "INFO" "No fixes were applied"
  fi
}
# Generate report
# Write a plain-text summary report: issues found, fixes applied, log and
# backup locations, plus recommendations derived from the issue markers.
# Globals: reads SCRIPT_DIR, NAMESPACE, PVC_NAME, PV_NAME, ISSUES_FOUND,
#          FIXES_APPLIED, LOG_FILE, BACKUP_DIR.
generate_report() {
local report_file="${SCRIPT_DIR}/pvc_debug_report_$(date +%Y%m%d_%H%M%S).txt"
# Header section; the here-doc expands the runtime values.
cat > "$report_file" << EOF
OpenShift PVC/PV Diagnostic Report
Generated: $(date)
Namespace: $NAMESPACE
PVC: $PVC_NAME
PV: $PV_NAME
=== Issues Found ===
EOF
if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
for issue in "${ISSUES_FOUND[@]}"; do
echo "- $issue" >> "$report_file"
done
else
echo "No issues found" >> "$report_file"
fi
cat >> "$report_file" << EOF
=== Fixes Applied ===
EOF
if [[ ${#FIXES_APPLIED[@]} -gt 0 ]]; then
for fix in "${FIXES_APPLIED[@]}"; do
echo "- $fix" >> "$report_file"
done
else
echo "No fixes applied" >> "$report_file"
fi
cat >> "$report_file" << EOF
=== Log File ===
$LOG_FILE
=== Backup Directory ===
$BACKUP_DIR
=== Recommendations ===
EOF
# Add recommendations based on issues found
if [[ " ${ISSUES_FOUND[*]} " =~ " ceph_unhealthy " ]]; then
echo "- Monitor Ceph cluster health regularly" >> "$report_file"
echo "- Consider increasing Ceph cluster resources" >> "$report_file"
fi
if [[ " ${ISSUES_FOUND[*]} " =~ " node_disk_pressure " ]]; then
echo "- Monitor node disk usage" >> "$report_file"
echo "- Consider adding storage capacity to nodes" >> "$report_file"
fi
if [[ " ${ISSUES_FOUND[*]} " =~ " dynamic_provisioning_failed " ]]; then
echo "- Verify storage class configuration" >> "$report_file"
echo "- Check provisioner pod health" >> "$report_file"
fi
# Always-on reminders regardless of what was found.
echo "- Review backup files before deleting: $BACKUP_DIR" >> "$report_file"
echo "- Monitor fixed resources for stability" >> "$report_file"
log "INFO" "Report generated: $report_file"
}
# Main function
# Entry point: parse args, verify the environment, diagnose, optionally
# apply fixes, and always emit a report.  Exits 1 when issues were found.
main() {
  parse_args "$@"
  log "INFO" "Starting OpenShift PVC/PV diagnostic script"
  log "INFO" "Log file: $LOG_FILE"
  if [[ "$DRY_RUN" == "true" ]]; then
    log "INFO" "Running in DRY RUN mode - no changes will be made"
  fi
  check_prerequisites
  run_diagnosis
  # Fixes are skipped entirely in dry-run mode.
  if [[ "$DRY_RUN" != "true" ]]; then
    run_fixes
  fi
  generate_report
  log "INFO" "Script completed"
  # Non-zero exit signals that something needs attention.
  if [[ ${#ISSUES_FOUND[@]} -gt 0 ]]; then
    log "WARN" "Issues were found - check the report for details"
    exit 1
  fi
  log "INFO" "No issues found - all resources are healthy"
  exit 0
}
# Check if script is being sourced or executed
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
# Run main only when executed directly; sourcing the file just loads the
# functions (useful for testing individual helpers).
main "$@"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment