Last active
July 22, 2025 10:00
-
-
Save shanduur/2335afc28a67fbf1edf2a8428157fade to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# reproduce https://github.com/etcd-io/etcd/issues/20340
#
# Repeatedly cycles a local three-member etcd cluster through snapshot/restore,
# member removal/re-add and a full restart, and aborts as soon as a panic or a
# 'failed to nodeToMember' message shows up in the member logs.
#
# All paths are relative to the current working directory: binaries are looked
# up in ./${VERSION} first, and data directories and logs live under ./tmp.
set -eEuo pipefail

# Define versions
VERSION="v3.6.2"
export PATH="./${VERSION}:${PATH}"

# Define member names
MEMBER_1="etcd-1"
MEMBER_2="etcd-2"
MEMBER_3="etcd-3"

# Retry budget for `etcdctl member promote`
MAX_RETRIES=10
RETRY_DELAY=5

# Cleanup any existing data
echo "- [*] Cleaning up old etcd data..."
# Stop leftovers from a previous run BEFORE deleting their data directories,
# so no live process is still writing into a directory while it is removed.
# pkill returns non-zero when nothing matched, which is fine here.
pkill -f "etcd --name" || true
sleep 2
# Second pass, in case a process was still alive after the first signal.
pkill -f "etcd --name" || true
sleep 2
rm -rf ./tmp/etcd-*
mkdir -p ./tmp/
| ############################################################ | |
| # Helpers | |
| ############################################################ | |
#######################################
# Scan every ./tmp/*.log for the two failure signatures of the bug.
# grep prints any matching lines; as soon as one signature is found the
# whole script is terminated with status 1.
# Outputs: progress and result lines on stdout.
# Returns: 0 when neither signature is present; exits 1 otherwise.
#######################################
function check_logs_for_panic() {
  echo "- [*] Checking panic..."
  if grep -i panic ./tmp/*.log; then
    echo "=== Panic found in logs!"
    exit 1
  fi
  echo " - No panic found in logs."

  echo "- [*] Checking failure log..."
  if grep -i 'failed to nodeToMember' ./tmp/*.log; then
    echo "=== 'failed to nodeToMember' found in logs!"
    exit 1
  fi
  echo " - No 'failed to nodeToMember' found in logs."
}
#######################################
# Run the log check, then query `endpoint status` and `endpoint health`
# on the client endpoint of each of the three members, in member order.
# Globals: MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs: progress lines and the etcdctl output on stdout.
#######################################
function healthcheck() {
  local -a labels=("$MEMBER_1" "$MEMBER_2" "$MEMBER_3")
  # Client ports, index-aligned with the member labels above.
  local -a client_ports=(2379 2378 2377)
  local idx probe

  check_logs_for_panic

  echo "- [*] Performing healthchecks..."
  for idx in 0 1 2; do
    echo " - ${labels[idx]}"
    for probe in status health; do
      etcdctl --endpoints=http://127.0.0.1:"${client_ports[idx]}" endpoint "$probe"
    done
  done
}
#######################################
# Read a member ID from `endpoint status` on 127.0.0.1:2379 and issue
# `move-leader` with that ID against the same endpoint.
# Globals: LEADER_ID (written)
# Outputs: progress lines on stdout, errors on stderr.
# Returns: exits 1 when no ID could be parsed from the status output.
#
# NOTE(review): the ID is taken from the second comma/space-separated field
# of the default `endpoint status` output. That looks like the ID of the
# member behind the queried endpoint rather than necessarily the current
# leader, and move-leader is then asked to transfer to that same ID.
# Confirm this is the intended trigger for the reproduction.
#######################################
function forfeit_leadership() {
  echo "- [*] Changing leadership..."
  LEADER_ID=$(etcdctl --endpoints=http://127.0.0.1:2379 endpoint status \
    | awk -F'[ ,]+' '{print $2}')
  if [[ -z "$LEADER_ID" ]]; then
    # Fail with a clear message instead of passing an empty ID to move-leader.
    echo "=== Could not determine member ID from endpoint status." >&2
    exit 1
  fi
  sleep 5
  echo " - Forfeiting leadership for member with ID: $LEADER_ID"
  etcdctl --endpoints=http://127.0.0.1:2379 move-leader "$LEADER_ID"
}
| ############################################################ | |
| # Starting Cluster | |
| ############################################################ | |
#######################################
# Bootstrap a fresh three-member cluster.
# Starts MEMBER_1 as a single-node cluster, then for MEMBER_2 and MEMBER_3 in
# turn: registers the member as a learner, starts its etcd process, and
# retries `member promote` until it succeeds or MAX_RETRIES is used up.
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID, MEMBER3_ID (written)
# Outputs: progress lines on stdout; each etcd process logs to
#   ./tmp/etcd-<name>.log
# Returns: exits 1 when a learner cannot be added or promoted.
#######################################
function phase_prep() {
  # Index-aligned description of the members that join after MEMBER_1.
  local -a names=("$MEMBER_2" "$MEMBER_3")
  local -a peer_ports=(2382 2384)
  local -a client_ports=(2378 2377)
  local -a id_vars=(MEMBER2_ID MEMBER3_ID)
  local cluster="${MEMBER_1}=http://127.0.0.1:2380"
  local i name peer_url client_url member_id attempt

  # Start first etcd member
  echo "- [*] Starting first etcd member ($MEMBER_1)..."
  nohup etcd --name "$MEMBER_1" \
    --data-dir "./tmp/etcd-${MEMBER_1}" \
    --initial-advertise-peer-urls http://127.0.0.1:2380 \
    --listen-peer-urls http://127.0.0.1:2380 \
    --advertise-client-urls http://127.0.0.1:2379 \
    --listen-client-urls http://127.0.0.1:2379 \
    --initial-cluster "$cluster" \
    --initial-cluster-state new > "./tmp/etcd-${MEMBER_1}.log" 2>&1 &
  sleep 5

  echo "- [*] add members"
  for i in "${!names[@]}"; do
    name=${names[i]}
    peer_url="http://127.0.0.1:${peer_ports[i]}"
    client_url="http://127.0.0.1:${client_ports[i]}"
    # A joining member lists every member added so far, itself included.
    cluster+=",${name}=${peer_url}"

    # Add the learner under its own name and capture the ID from the
    # "Member <id> added ..." line.
    echo " - Adding learner ($name)..."
    if ! member_id=$(etcdctl --endpoints=http://127.0.0.1:2379 member add "$name" \
        --peer-urls="$peer_url" --learner | grep 'Member' | awk '{print $2}') \
        || [[ -z "$member_id" ]]; then
      echo "=== Failed to add learner ${name}." >&2
      exit 1
    fi
    # Publish the ID under its global name (MEMBER2_ID / MEMBER3_ID).
    printf -v "${id_vars[i]}" '%s' "$member_id"

    nohup etcd --name "$name" \
      --data-dir "./tmp/etcd-${name}" \
      --initial-advertise-peer-urls "$peer_url" \
      --listen-peer-urls "$peer_url" \
      --advertise-client-urls "$client_url" \
      --listen-client-urls "$client_url" \
      --initial-cluster "$cluster" \
      --initial-cluster-state existing > "./tmp/etcd-${name}.log" 2>&1 &

    # Promotion is retried because it can fail until the new process is ready.
    attempt=1
    while (( attempt <= MAX_RETRIES )); do
      echo " - Promoting learner ($name) with ID: ${member_id}... (attempt $attempt)"
      if etcdctl --endpoints=http://127.0.0.1:2379 member promote "$member_id"; then
        echo " - Successfully promoted learner ${name}."
        break
      fi
      echo "=== Failed to promote learner ${name}. Retrying in ${RETRY_DELAY}s..."
      sleep "$RETRY_DELAY"
      attempt=$(( attempt + 1 ))
    done
    if (( attempt > MAX_RETRIES )); then
      echo "=== Failed to promote learner ${name} after $MAX_RETRIES attempts."
      exit 1
    fi
  done
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
#######################################
# Snapshot the cluster, tear it down, restore MEMBER_1 from the snapshot as
# a single-node cluster, then grow it back to three members by adding
# MEMBER_2 and MEMBER_3 as learners and promoting each.
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID, MEMBER3_ID (written)
# Outputs: progress lines on stdout; each etcd process logs to
#   ./tmp/etcd-<name>.log
# Returns: exits 1 when a learner cannot be added or promoted.
#######################################
function phase_snap_restore() {
  # Index-aligned description of the members that join after MEMBER_1.
  local -a names=("$MEMBER_2" "$MEMBER_3")
  local -a peer_ports=(2382 2384)
  local -a client_ports=(2378 2377)
  local -a id_vars=(MEMBER2_ID MEMBER3_ID)
  local cluster="${MEMBER_1}=http://127.0.0.1:2380"
  local member i name peer_url client_url member_id attempt

  # Create a snapshot
  echo "- [*] Creating a snapshot"
  rm -f ./tmp/snapshot.db
  etcdctl snapshot save ./tmp/snapshot.db

  # Stop all members and cleanup data
  echo "- [*] Stop all members and cleanup data"
  for member in "$MEMBER_1" "$MEMBER_2" "$MEMBER_3"; do
    echo "Stopping $member..."
    # A member that is already gone must not abort the script (set -e +
    # pipefail would do so with `pgrep | xargs kill`), so report and go on.
    pkill -9 -f "etcd --name ${member}" \
      || echo " - no running process found for ${member}"
    sleep 2
  done
  # Removes the data directories and also the per-member log files, which
  # share the ./tmp/etcd- prefix.
  rm -rf ./tmp/etcd-*

  # Restore single-node cluster
  echo "- [*] Restoring single-node cluster"
  etcdutl snapshot restore ./tmp/snapshot.db \
    --data-dir="./tmp/etcd-${MEMBER_1}" \
    --name="$MEMBER_1" \
    --initial-cluster="$cluster" \
    --initial-advertise-peer-urls=http://127.0.0.1:2380

  echo "- [*] Starting single-node etcd cluster: ($MEMBER_1)..."
  nohup etcd --name "$MEMBER_1" \
    --data-dir "./tmp/etcd-${MEMBER_1}" \
    --initial-advertise-peer-urls http://127.0.0.1:2380 \
    --listen-peer-urls http://127.0.0.1:2380 \
    --advertise-client-urls http://127.0.0.1:2379 \
    --listen-client-urls http://127.0.0.1:2379 \
    --initial-cluster "$cluster" \
    --initial-cluster-state new > "./tmp/etcd-${MEMBER_1}.log" 2>&1 &
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
  sleep 5

  echo "- [*] add members"
  for i in "${!names[@]}"; do
    name=${names[i]}
    peer_url="http://127.0.0.1:${peer_ports[i]}"
    client_url="http://127.0.0.1:${client_ports[i]}"
    # A joining member lists every member added so far, itself included.
    cluster+=",${name}=${peer_url}"

    # Add the learner under its own name and capture the ID from the
    # "Member <id> added ..." line.
    echo " - Adding learner ($name)..."
    if ! member_id=$(etcdctl --endpoints=http://127.0.0.1:2379 member add "$name" \
        --peer-urls="$peer_url" --learner | grep 'Member' | awk '{print $2}') \
        || [[ -z "$member_id" ]]; then
      echo "=== Failed to add learner ${name}." >&2
      exit 1
    fi
    # Publish the ID under its global name (MEMBER2_ID / MEMBER3_ID).
    printf -v "${id_vars[i]}" '%s' "$member_id"

    nohup etcd --name "$name" \
      --data-dir "./tmp/etcd-${name}" \
      --initial-advertise-peer-urls "$peer_url" \
      --listen-peer-urls "$peer_url" \
      --advertise-client-urls "$client_url" \
      --listen-client-urls "$client_url" \
      --initial-cluster "$cluster" \
      --initial-cluster-state existing > "./tmp/etcd-${name}.log" 2>&1 &

    # Promotion is retried because it can fail until the new process is ready.
    attempt=1
    while (( attempt <= MAX_RETRIES )); do
      echo " - Promoting learner ($name) with ID: ${member_id}... (attempt $attempt)"
      if etcdctl --endpoints=http://127.0.0.1:2379 member promote "$member_id"; then
        echo " - Successfully promoted learner ${name}."
        break
      fi
      echo "=== Failed to promote learner ${name}. Retrying in ${RETRY_DELAY}s..."
      sleep "$RETRY_DELAY"
      attempt=$(( attempt + 1 ))
    done
    if (( attempt > MAX_RETRIES )); then
      echo "=== Failed to promote learner ${name} after $MAX_RETRIES attempts."
      exit 1
    fi
  done
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
| ############################################################ | |
| # Removing Members | |
| ############################################################ | |
#######################################
# Remove MEMBER_2 from the cluster, wipe its data directory, then add it
# back as a learner, start it and promote it.
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID (read: ID to remove; written: ID of the re-added member)
#   LEADER_ID (written)
# Outputs: progress lines on stdout; the etcd process logs to
#   ./tmp/etcd-<MEMBER_2>.log
# Returns: exits 1 when the learner cannot be re-added or promoted.
#
# The same remove/re-add cycle for MEMBER_3 (MEMBER3_ID, peer port 2384,
# client port 2377) existed here as commented-out code and is intentionally
# disabled.
#######################################
function phase_remove() {
  local peer_url="http://127.0.0.1:2382"
  local client_url="http://127.0.0.1:2378"
  local cluster="${MEMBER_1}=http://127.0.0.1:2380,${MEMBER_2}=${peer_url},${MEMBER_3}=http://127.0.0.1:2384"
  local member_id attempt

  echo "- [*] Checking if leadership change is needed..."
  # NOTE(review): field 2 of the default `endpoint status` output looks like
  # the ID of the member behind 127.0.0.1:2379, not necessarily the current
  # leader. Confirm this comparison can ever be true.
  LEADER_ID=$(etcdctl --endpoints=http://127.0.0.1:2379 endpoint status \
    | awk -F'[ ,]+' '{print $2}')
  sleep 5
  if [[ "$MEMBER2_ID" == "$LEADER_ID" ]]; then
    echo " - Forfeiting leadership for member with ID: $LEADER_ID"
    etcdctl --endpoints=http://127.0.0.1:2379 move-leader "$LEADER_ID"
  fi

  # Remove etcd-2
  echo "- [*] Removing member ($MEMBER_2)..."
  etcdctl member remove "${MEMBER2_ID}"
  sleep 5
  rm -rf "./tmp/etcd-${MEMBER_2}"

  echo "- [*] Adding member ($MEMBER_2)..."
  echo " - Adding learner ($MEMBER_2)..."
  # Capture the new ID from the "Member <id> added ..." line and fail with a
  # message before starting a process for a member that was never registered.
  if ! member_id=$(etcdctl --endpoints=http://127.0.0.1:2379 member add "$MEMBER_2" \
      --peer-urls="$peer_url" --learner | grep 'Member' | awk '{print $2}') \
      || [[ -z "$member_id" ]]; then
    echo "=== Failed to add learner ${MEMBER_2}." >&2
    exit 1
  fi
  MEMBER2_ID=$member_id

  nohup etcd --name "$MEMBER_2" \
    --data-dir "./tmp/etcd-${MEMBER_2}" \
    --initial-advertise-peer-urls "$peer_url" \
    --listen-peer-urls "$peer_url" \
    --advertise-client-urls "$client_url" \
    --listen-client-urls "$client_url" \
    --initial-cluster "$cluster" \
    --initial-cluster-state existing > "./tmp/etcd-${MEMBER_2}.log" 2>&1 &

  # Promotion is retried because it can fail until the new process is ready.
  attempt=1
  while (( attempt <= MAX_RETRIES )); do
    echo " - Promoting learner ($MEMBER_2) with ID: ${MEMBER2_ID}... (attempt $attempt)"
    if etcdctl --endpoints=http://127.0.0.1:2379 member promote "${MEMBER2_ID}"; then
      echo " - Successfully promoted learner ${MEMBER_2}."
      break
    fi
    echo "=== Failed to promote learner ${MEMBER_2}. Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
    attempt=$(( attempt + 1 ))
  done
  if (( attempt > MAX_RETRIES )); then
    echo "=== Failed to promote learner ${MEMBER_2} after $MAX_RETRIES attempts."
    exit 1
  fi
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
| ############################################################ | |
| # Cluster reboot | |
| ############################################################ | |
#######################################
# Hard-stop all three members, drop their log files, and start them again
# on their existing data directories.
# Globals: MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs: progress lines on stdout; each etcd process logs to a fresh
#   ./tmp/etcd-<name>.log
#######################################
function phase_reboot() {
  # Index-aligned per-member settings.
  local -a names=("$MEMBER_1" "$MEMBER_2" "$MEMBER_3")
  local -a peer_ports=(2380 2382 2384)
  local -a client_ports=(2379 2378 2377)
  # MEMBER_1 is started with "new", the others with "existing".
  local -a states=(new existing existing)
  local cluster="${MEMBER_1}=http://127.0.0.1:2380,${MEMBER_2}=http://127.0.0.1:2382,${MEMBER_3}=http://127.0.0.1:2384"
  local member i peer_url client_url

  # Stop all members
  echo "- [*] Stop all members"
  for member in "${names[@]}"; do
    echo " - Stopping $member..."
    # A member that is already gone must not abort the script (set -e +
    # pipefail would do so with `pgrep | xargs kill`), so report and go on.
    pkill -9 -f "etcd --name ${member}" \
      || echo " - no running process found for ${member}"
    rm -f "./tmp/etcd-${member}.log"
    sleep 2
  done
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1

  # Start all members again
  echo "- [*] start all members again"
  for i in "${!names[@]}"; do
    peer_url="http://127.0.0.1:${peer_ports[i]}"
    client_url="http://127.0.0.1:${client_ports[i]}"
    echo " - start ${names[i]}"
    nohup etcd --name "${names[i]}" \
      --data-dir "./tmp/etcd-${names[i]}" \
      --initial-advertise-peer-urls "$peer_url" \
      --listen-peer-urls "$peer_url" \
      --advertise-client-urls "$client_url" \
      --listen-client-urls "$client_url" \
      --initial-cluster "$cluster" \
      --initial-cluster-state "${states[i]}" > "./tmp/etcd-${names[i]}.log" 2>&1 &
  done
  sleep 5
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
| ############################################################ | |
| # Cleanup | |
| ############################################################ | |
#######################################
# Hard-stop the etcd processes of all three members.
# Globals: MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs: progress lines on stdout.
#######################################
function phase_cleanup() {
  local member

  # Stop all members
  echo "- [*] Stop all members"
  for member in "$MEMBER_1" "$MEMBER_2" "$MEMBER_3"; do
    echo " - Stopping $member..."
    # Cleanup is best-effort: a member that is not running must not abort
    # the loop (set -e + pipefail would do so with `pgrep | xargs kill`).
    pkill -9 -f "etcd --name ${member}" \
      || echo " - no running process found for ${member}"
    sleep 2
  done
  echo "- [*] Done"
}
# Stop the etcd processes whenever the script exits. The round loop below
# never ends on its own, so a phase_cleanup call placed after it would be
# unreachable; the only ways out are the `exit 1` failure paths, a failing
# command under `set -e`, or a signal.
trap phase_cleanup EXIT

# Build the initial three-member cluster and verify it is healthy.
phase_prep
forfeit_leadership
healthcheck

# Repeat snapshot/restore, member remove/re-add and full reboot until a
# healthcheck detects the failure and exits.
while true; do
  echo "=== Starting new round ==="

  phase_snap_restore
  forfeit_leadership
  healthcheck

  phase_remove
  forfeit_leadership
  healthcheck

  phase_reboot
  forfeit_leadership
  healthcheck

  echo "=== Round completed successfully ==="
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment