Skip to content

Instantly share code, notes, and snippets.

@shanduur
Last active July 22, 2025 10:00
Show Gist options
  • Select an option

  • Save shanduur/2335afc28a67fbf1edf2a8428157fade to your computer and use it in GitHub Desktop.

Select an option

Save shanduur/2335afc28a67fbf1edf2a8428157fade to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Reproducer for https://github.com/etcd-io/etcd/issues/20340
#
# ./${VERSION} is put first on PATH, so etcd/etcdctl/etcdutl binaries found in
# that directory take precedence. All data and logs are kept under ./tmp/.
set -eEuo pipefail

# Release under test; also the name of the directory prepended to PATH.
VERSION="v3.6.2"
export PATH=./${VERSION}:${PATH}

# Names of the three cluster members.
MEMBER_1="etcd-1"
MEMBER_2="etcd-2"
MEMBER_3="etcd-3"

# Retry policy used by the learner-promotion loops.
MAX_RETRIES=10
RETRY_DELAY=5

# Start from a clean slate: drop old data/logs, then stop leftover processes.
echo "- [*] Cleaning up old etcd data..."
rm -rf ./tmp/etcd-*
sleep 2
for _ in 1 2; do
  # pkill exits non-zero when nothing matched; that must not abort the script.
  pkill -f "etcd --name" || true
  sleep 2
done
mkdir -p ./tmp/
############################################################
# Helpers
############################################################
#######################################
# Scan ./tmp/*.log for the failure signatures this reproducer is looking for.
# Matching log lines are printed by grep itself.
# Outputs: progress messages and any matching log lines on stdout
# Returns: 0 when neither pattern is present; otherwise the whole script
#          is terminated with exit status 1
#######################################
check_logs_for_panic() {
  echo "- [*] Checking panic..."
  if grep -i panic ./tmp/*.log; then
    echo "=== Panic found in logs!"
    exit 1
  fi
  echo " - No panic found in logs."

  echo "- [*] Checking failure log..."
  if grep -i 'failed to nodeToMember' ./tmp/*.log; then
    echo "=== 'failed to nodeToMember' found in logs!"
    exit 1
  fi
  echo " - No 'failed to nodeToMember' found in logs."
}
#######################################
# Check the logs for failure signatures, then query every member's client
# endpoint for its status and health.
# Globals:  MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs:  progress messages and etcdctl output on stdout
# Returns:  status of the last etcdctl call
#######################################
healthcheck() {
  check_logs_for_panic

  echo "- [*] Performing healthchecks..."
  # Member names and their client ports, index-aligned.
  local -a names=("$MEMBER_1" "$MEMBER_2" "$MEMBER_3")
  local -a client_ports=(2379 2378 2377)
  local idx url
  for idx in 0 1 2; do
    url="http://127.0.0.1:${client_ports[idx]}"
    echo " - ${names[idx]}"
    etcdctl --endpoints="${url}" endpoint status
    etcdctl --endpoints="${url}" endpoint health
  done
}
#######################################
# Read an ID from the status of the member listening on port 2379 and issue
# `move-leader` with that ID against the same endpoint.
# Globals:  LEADER_ID (written)
# Outputs:  progress messages and etcdctl output on stdout
# Returns:  status of the move-leader call
#
# NOTE(review): the ID is simply the second column of `endpoint status`.
# Confirm that this column is the leader's ID and not the queried member's
# own ID, and that `move-leader` expects the ID to hand leadership *to*.
#######################################
forfeit_leadership() {
  local -r endpoint="http://127.0.0.1:2379"

  echo "- [*] Changing leadership..."
  LEADER_ID=$(
    etcdctl --endpoints="${endpoint}" endpoint status \
      | awk -F'[ ,]+' '{print $2}'
  )
  sleep 5

  echo " - Forfeiting leadership for member with ID: $LEADER_ID"
  # ID left unquoted so the invocation is word-for-word what it was before.
  etcdctl --endpoints="${endpoint}" move-leader $LEADER_ID
}
############################################################
# Starting Cluster
############################################################
#######################################
# Bootstrap the cluster: start MEMBER_1 as a new single-node cluster, then
# add MEMBER_2 and MEMBER_3 as learners one after the other, start each of
# them, and promote it (retrying until the promotion is accepted).
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID, MEMBER3_ID (written: IDs parsed from `member add` output)
# Outputs:  progress on stdout; every etcd process logs to
#           ./tmp/etcd-<name>.log
# Returns:  exits the script with status 1 when a learner could not be
#           promoted within MAX_RETRIES attempts
#######################################
phase_prep() {
  local -r ctl="http://127.0.0.1:2379"
  local -r peer_1="http://127.0.0.1:2380"
  local -r peer_2="http://127.0.0.1:2382"
  local -r peer_3="http://127.0.0.1:2384"
  local attempt

  # Start first etcd member
  echo "- [*] Starting first etcd member ($MEMBER_1)..."
  nohup etcd --name "$MEMBER_1" \
    --data-dir "./tmp/etcd-$MEMBER_1" \
    --initial-advertise-peer-urls "$peer_1" \
    --listen-peer-urls "$peer_1" \
    --advertise-client-urls "$ctl" \
    --listen-client-urls "$ctl" \
    --initial-cluster "$MEMBER_1=$peer_1" \
    --initial-cluster-state new > "./tmp/etcd-$MEMBER_1.log" 2>&1 &
  sleep 5

  echo "- [*] add members"

  # Add and promote the first learner
  echo " - Adding learner ($MEMBER_2)..."
  # grep is kept as its own stage: under pipefail an unexpected reply (no
  # "Member" line) makes the assignment fail instead of yielding an empty ID.
  MEMBER2_ID=$(etcdctl --endpoints="$ctl" member add "$MEMBER_2" \
    --peer-urls="$peer_2" --learner | grep 'Member' | awk '{print $2}')
  nohup etcd --name "$MEMBER_2" \
    --data-dir "./tmp/etcd-$MEMBER_2" \
    --initial-advertise-peer-urls "$peer_2" \
    --listen-peer-urls "$peer_2" \
    --advertise-client-urls http://127.0.0.1:2378 \
    --listen-client-urls http://127.0.0.1:2378 \
    --initial-cluster "$MEMBER_1=$peer_1,$MEMBER_2=$peer_2" \
    --initial-cluster-state existing > "./tmp/etcd-$MEMBER_2.log" 2>&1 &
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    echo " - Promoting learner ($MEMBER_2) with ID: ${MEMBER2_ID}... (attempt $attempt)"
    if etcdctl --endpoints="$ctl" member promote "${MEMBER2_ID}"; then
      echo " - Successfully promoted learner ${MEMBER_2}."
      break
    fi
    echo "=== Failed to promote learner ${MEMBER_2}. Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  done
  # The counter only runs past MAX_RETRIES when no attempt succeeded.
  if ((attempt > MAX_RETRIES)); then
    echo "=== Failed to promote learner ${MEMBER_2} after $MAX_RETRIES attempts."
    exit 1
  fi

  # Add and promote the second learner
  echo " - Adding learner ($MEMBER_3)..."
  # Fixed: the member used to be registered under $MEMBER_2's name.
  MEMBER3_ID=$(etcdctl --endpoints="$ctl" member add "$MEMBER_3" \
    --peer-urls="$peer_3" --learner | grep 'Member' | awk '{print $2}')
  nohup etcd --name "$MEMBER_3" \
    --data-dir "./tmp/etcd-$MEMBER_3" \
    --initial-advertise-peer-urls "$peer_3" \
    --listen-peer-urls "$peer_3" \
    --advertise-client-urls http://127.0.0.1:2377 \
    --listen-client-urls http://127.0.0.1:2377 \
    --initial-cluster "$MEMBER_1=$peer_1,$MEMBER_2=$peer_2,$MEMBER_3=$peer_3" \
    --initial-cluster-state existing > "./tmp/etcd-$MEMBER_3.log" 2>&1 &
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    echo " - Promoting learner ($MEMBER_3) with ID: ${MEMBER3_ID}... (attempt $attempt)"
    if etcdctl --endpoints="$ctl" member promote "${MEMBER3_ID}"; then
      echo " - Successfully promoted learner ${MEMBER_3}."
      break
    fi
    echo "=== Failed to promote learner ${MEMBER_3}. Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  done
  if ((attempt > MAX_RETRIES)); then
    echo "=== Failed to promote learner ${MEMBER_3} after $MAX_RETRIES attempts."
    exit 1
  fi
  # Uncomment to pause here when stepping through the reproduction by hand:
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
#######################################
# Take a snapshot, SIGKILL every member, wipe ./tmp/etcd-* (data directories
# and the log files matching that glob), restore the snapshot into a fresh
# single-node data directory for MEMBER_1, start it, and then grow the
# cluster back to three members by adding and promoting MEMBER_2 and
# MEMBER_3 as learners.
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID, MEMBER3_ID (written: IDs parsed from `member add` output)
# Outputs:  progress on stdout; every etcd process logs to
#           ./tmp/etcd-<name>.log
# Returns:  exits the script with status 1 when a learner could not be
#           promoted within MAX_RETRIES attempts
#######################################
phase_snap_restore() {
  local -r ctl="http://127.0.0.1:2379"
  local -r peer_1="http://127.0.0.1:2380"
  local -r peer_2="http://127.0.0.1:2382"
  local -r peer_3="http://127.0.0.1:2384"
  local -r snapshot="./tmp/snapshot.db"
  local member attempt

  # Create a snapshot
  echo "- [*] Creating a snapshot"
  rm -f "$snapshot"
  etcdctl snapshot save "$snapshot"

  # Stop all members and cleanup data
  echo "- [*] Stop all members and cleanup data"
  for member in "$MEMBER_1" "$MEMBER_2" "$MEMBER_3"; do
    echo "Stopping $member..."
    # Deliberately strict: with pipefail a member that is no longer running
    # makes this pipeline fail, so a crashed member is not silently skipped.
    pgrep -f "etcd --name $member" | xargs kill -9
    sleep 2
  done
  rm -rf ./tmp/etcd-*

  # Restore single-node cluster
  echo "- [*] Restoring single-node cluster"
  # Uses $MEMBER_1 instead of a hard-coded "etcd-1" so that the restored data
  # directory always matches the one the member is started with below.
  etcdutl snapshot restore "$snapshot" \
    --data-dir="./tmp/etcd-$MEMBER_1" \
    --name="$MEMBER_1" \
    --initial-cluster="$MEMBER_1=$peer_1" \
    --initial-advertise-peer-urls="$peer_1"

  echo "- [*] Starting single-node etcd cluster: ($MEMBER_1)..."
  nohup etcd --name "$MEMBER_1" \
    --data-dir "./tmp/etcd-$MEMBER_1" \
    --initial-advertise-peer-urls "$peer_1" \
    --listen-peer-urls "$peer_1" \
    --advertise-client-urls "$ctl" \
    --listen-client-urls "$ctl" \
    --initial-cluster "$MEMBER_1=$peer_1" \
    --initial-cluster-state new > "./tmp/etcd-$MEMBER_1.log" 2>&1 &
  # Uncomment to pause here when stepping through the reproduction by hand:
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
  sleep 5

  echo "- [*] add members"

  # Add and promote the first learner
  echo " - Adding learner ($MEMBER_2)..."
  # grep is kept as its own stage: under pipefail an unexpected reply (no
  # "Member" line) makes the assignment fail instead of yielding an empty ID.
  MEMBER2_ID=$(etcdctl --endpoints="$ctl" member add "$MEMBER_2" \
    --peer-urls="$peer_2" --learner | grep 'Member' | awk '{print $2}')
  nohup etcd --name "$MEMBER_2" \
    --data-dir "./tmp/etcd-$MEMBER_2" \
    --initial-advertise-peer-urls "$peer_2" \
    --listen-peer-urls "$peer_2" \
    --advertise-client-urls http://127.0.0.1:2378 \
    --listen-client-urls http://127.0.0.1:2378 \
    --initial-cluster "$MEMBER_1=$peer_1,$MEMBER_2=$peer_2" \
    --initial-cluster-state existing > "./tmp/etcd-$MEMBER_2.log" 2>&1 &
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    echo " - Promoting learner ($MEMBER_2) with ID: ${MEMBER2_ID}... (attempt $attempt)"
    if etcdctl --endpoints="$ctl" member promote "${MEMBER2_ID}"; then
      echo " - Successfully promoted learner ${MEMBER_2}."
      break
    fi
    echo "=== Failed to promote learner ${MEMBER_2}. Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  done
  # The counter only runs past MAX_RETRIES when no attempt succeeded.
  if ((attempt > MAX_RETRIES)); then
    echo "=== Failed to promote learner ${MEMBER_2} after $MAX_RETRIES attempts."
    exit 1
  fi

  # Add and promote the second learner
  echo " - Adding learner ($MEMBER_3)..."
  # Fixed: the member used to be registered under $MEMBER_2's name.
  MEMBER3_ID=$(etcdctl --endpoints="$ctl" member add "$MEMBER_3" \
    --peer-urls="$peer_3" --learner | grep 'Member' | awk '{print $2}')
  nohup etcd --name "$MEMBER_3" \
    --data-dir "./tmp/etcd-$MEMBER_3" \
    --initial-advertise-peer-urls "$peer_3" \
    --listen-peer-urls "$peer_3" \
    --advertise-client-urls http://127.0.0.1:2377 \
    --listen-client-urls http://127.0.0.1:2377 \
    --initial-cluster "$MEMBER_1=$peer_1,$MEMBER_2=$peer_2,$MEMBER_3=$peer_3" \
    --initial-cluster-state existing > "./tmp/etcd-$MEMBER_3.log" 2>&1 &
  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    echo " - Promoting learner ($MEMBER_3) with ID: ${MEMBER3_ID}... (attempt $attempt)"
    if etcdctl --endpoints="$ctl" member promote "${MEMBER3_ID}"; then
      echo " - Successfully promoted learner ${MEMBER_3}."
      break
    fi
    echo "=== Failed to promote learner ${MEMBER_3}. Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  done
  if ((attempt > MAX_RETRIES)); then
    echo "=== Failed to promote learner ${MEMBER_3} after $MAX_RETRIES attempts."
    exit 1
  fi
  # Uncomment to pause here when stepping through the reproduction by hand:
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
############################################################
# Removing Members
############################################################
#######################################
# Remove MEMBER_2 from the cluster, delete its data directory, and bring it
# back as a learner that is promoted again.
# Globals:
#   MEMBER_1, MEMBER_2, MEMBER_3, MAX_RETRIES, RETRY_DELAY (read)
#   MEMBER2_ID (read, then overwritten with the ID of the re-added member)
#   LEADER_ID, ATTEMPT (written)
# Outputs:  progress on stdout; the restarted etcd logs to
#           ./tmp/etcd-<MEMBER_2>.log
# Returns:  exits the script with status 1 when the learner could not be
#           promoted within MAX_RETRIES attempts
#
# NOTE(review): LEADER_ID is the second column of `endpoint status` for the
# member on port 2379. Confirm that this really is the leader's ID; if it is
# that member's own ID, the comparison with MEMBER2_ID below can never match.
#######################################
phase_remove() {
  local -r ctl="http://127.0.0.1:2379"
  local -r peer_2="http://127.0.0.1:2382"
  local -r client_2="http://127.0.0.1:2378"

  echo "- [*] Checking if leadership change is needed..."
  LEADER_ID=$(
    etcdctl --endpoints="${ctl}" endpoint status \
      | awk -F'[ ,]+' '{print $2}'
  )
  sleep 5
  if [[ "$MEMBER2_ID" == "$LEADER_ID" ]]; then
    echo " - Forfeiting leadership for member with ID: $LEADER_ID"
    # ID left unquoted so the invocation is word-for-word what it was before.
    etcdctl --endpoints="${ctl}" move-leader $LEADER_ID
  fi

  # Remove etcd-2 (no --endpoints given: etcdctl falls back to its default)
  echo "- [*] Removing member ($MEMBER_2)..."
  etcdctl member remove ${MEMBER2_ID}
  sleep 5
  rm -rf "./tmp/etcd-${MEMBER_2}"

  # Re-add it as a learner and start it with an empty data directory
  echo "- [*] Adding member ($MEMBER_2)..."
  echo " - Adding learner ($MEMBER_2)..."
  MEMBER2_ID=$(
    etcdctl --endpoints="${ctl}" member add "${MEMBER_2}" --peer-urls="${peer_2}" --learner \
      | grep 'Member' \
      | awk '{print $2}'
  )
  nohup etcd --name "${MEMBER_2}" \
    --data-dir "./tmp/etcd-${MEMBER_2}" \
    --initial-advertise-peer-urls "${peer_2}" \
    --listen-peer-urls "${peer_2}" \
    --advertise-client-urls "${client_2}" \
    --listen-client-urls "${client_2}" \
    --initial-cluster "${MEMBER_1}=http://127.0.0.1:2380,${MEMBER_2}=${peer_2},${MEMBER_3}=http://127.0.0.1:2384" \
    --initial-cluster-state existing > "./tmp/etcd-${MEMBER_2}.log" 2>&1 &

  # Keep trying to promote until it is accepted or the retry budget is spent.
  ATTEMPT=1
  until ((ATTEMPT > MAX_RETRIES)); do
    echo " - Promoting learner ($MEMBER_2) with ID: ${MEMBER2_ID}... (attempt $ATTEMPT)"
    if ! etcdctl --endpoints="${ctl}" member promote "${MEMBER2_ID}"; then
      echo "=== Failed to promote learner ${MEMBER_2}. Retrying in ${RETRY_DELAY}s..."
      sleep $RETRY_DELAY
      ((ATTEMPT++))
      continue
    fi
    echo " - Successfully promoted learner ${MEMBER_2}."
    break
  done
  if ((ATTEMPT > MAX_RETRIES)); then
    echo "=== Failed to promote learner ${MEMBER_2} after $MAX_RETRIES attempts."
    exit 1
  fi

  # Disabled: the same remove / re-add cycle for etcd-3, kept for reference.
  # NOTE(review): the `member add` line below passes $MEMBER_2 although it is
  # adding $MEMBER_3; correct that before re-enabling this section.
  # echo "- [*] Checking if leadership change is needed..."
  # LEADER_ID=$(etcdctl --endpoints=http://127.0.0.1:2379 endpoint status | awk -F'[ ,]+' '{print $2}')
  # sleep 5
  # if [ "$MEMBER3_ID" == "$LEADER_ID" ]; then
  # echo " - Forfeiting leadership for member with ID: $LEADER_ID"
  # etcdctl --endpoints=http://127.0.0.1:2379 move-leader $LEADER_ID
  # fi
  # # Remove etcd-3
  # echo "- [*] Removing member ($MEMBER_3)..."
  # etcdctl member remove ${MEMBER3_ID}
  # sleep 5
  # rm -rf ./tmp/etcd-$MEMBER_3
  # echo "- [*] Adding member ($MEMBER_3)..."
  # echo " - Adding learner ($MEMBER_3)..."
  # MEMBER3_ID=$(etcdctl --endpoints=http://127.0.0.1:2379 member add $MEMBER_2 --peer-urls=http://127.0.0.1:2384 --learner | grep 'Member' | awk '{print $2}')
  # nohup etcd --name $MEMBER_3 \
  # --data-dir ./tmp/etcd-$MEMBER_3 \
  # --initial-advertise-peer-urls http://127.0.0.1:2384 \
  # --listen-peer-urls http://127.0.0.1:2384 \
  # --advertise-client-urls http://127.0.0.1:2377 \
  # --listen-client-urls http://127.0.0.1:2377 \
  # --initial-cluster "$MEMBER_1=http://127.0.0.1:2380,$MEMBER_2=http://127.0.0.1:2382,$MEMBER_3=http://127.0.0.1:2384" \
  # --initial-cluster-state existing > ./tmp/etcd-$MEMBER_3.log 2>&1 &
  # ATTEMPT=1
  # while [ $ATTEMPT -le $MAX_RETRIES ]; do
  # echo " - Promoting learner ($MEMBER_3) with ID: ${MEMBER3_ID}... (attempt $ATTEMPT)"
  # if etcdctl --endpoints=http://127.0.0.1:2379 member promote "${MEMBER3_ID}"; then
  # echo " - Successfully promoted learner ${MEMBER_3}."
  # break
  # else
  # echo "=== Failed to promote learner ${MEMBER_3}. Retrying in ${RETRY_DELAY}s..."
  # sleep $RETRY_DELAY
  # ((ATTEMPT++))
  # fi
  # done
  # if [ $ATTEMPT -gt $MAX_RETRIES ]; then
  # echo "=== Failed to promote learner ${MEMBER_3} after $MAX_RETRIES attempts."
  # exit 1
  # fi
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
############################################################
# Cluster reboot
############################################################
#######################################
# SIGKILL every member, delete its log file, and start all three members
# again on their existing data directories.
# Globals:  MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs:  progress on stdout; every etcd process logs to
#           ./tmp/etcd-<name>.log
# Returns:  status of the final sleep
#######################################
phase_reboot() {
  local name spec peer_port client_port state

  # Stop all members
  echo "- [*] Stop all members"
  for name in "$MEMBER_1" "$MEMBER_2" "$MEMBER_3"; do
    echo " - Stopping ${name}..."
    pgrep -f "etcd --name ${name}" | xargs kill -9
    rm -f "./tmp/etcd-${name}.log"
    sleep 2
  done
  # Uncomment to pause here when stepping through the reproduction by hand:
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1

  # Every member is started with the same three-member --initial-cluster.
  local -r cluster="${MEMBER_1}=http://127.0.0.1:2380,${MEMBER_2}=http://127.0.0.1:2382,${MEMBER_3}=http://127.0.0.1:2384"

  # Start all members again; one row per member:
  #   <name> <peer port> <client port> <--initial-cluster-state value>
  echo "- [*] start all members again"
  for spec in \
    "${MEMBER_1} 2380 2379 new" \
    "${MEMBER_2} 2382 2378 existing" \
    "${MEMBER_3} 2384 2377 existing"; do
    read -r name peer_port client_port state <<< "${spec}"
    echo " - start ${name}"
    nohup etcd --name "${name}" \
      --data-dir "./tmp/etcd-${name}" \
      --initial-advertise-peer-urls "http://127.0.0.1:${peer_port}" \
      --listen-peer-urls "http://127.0.0.1:${peer_port}" \
      --advertise-client-urls "http://127.0.0.1:${client_port}" \
      --listen-client-urls "http://127.0.0.1:${client_port}" \
      --initial-cluster "${cluster}" \
      --initial-cluster-state "${state}" > "./tmp/etcd-${name}.log" 2>&1 &
  done
  sleep 5
  # Uncomment to pause here when stepping through the reproduction by hand:
  # read -p "Continue [y/N]? " -r confirm
  # [[ "${confirm,,}" == "y" ]] || exit 1
}
############################################################
# Cleanup
############################################################
#######################################
# SIGKILL every cluster member, pausing two seconds after each one.
# Globals:  MEMBER_1, MEMBER_2, MEMBER_3 (read)
# Outputs:  progress messages on stdout
# Returns:  0 once all members were signalled
#######################################
phase_cleanup() {
  local name

  # Stop all members
  echo "- [*] Stop all members"
  for name in "$MEMBER_1" "$MEMBER_2" "$MEMBER_3"; do
    printf ' - Stopping %s...\n' "${name}"
    pgrep -f "etcd --name ${name}" | xargs kill -9
    sleep 2
  done
  echo "- [*] Done"
}
# Build the initial three-member cluster and make sure it is healthy.
phase_prep
forfeit_leadership
healthcheck

# Repeat the snapshot-restore / remove-and-re-add / reboot cycle forever.
# Each phase is followed by a leadership move and a health check; the loop
# is only left when one of those steps terminates the script.
while :; do
  echo "=== Starting new round ==="
  for phase in phase_snap_restore phase_remove phase_reboot; do
    "${phase}"
    forfeit_leadership
    healthcheck
  done
  echo "=== Round completed successfully ==="
done

# Never reached: the loop above has no break.
phase_cleanup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment