Skip to content

Instantly share code, notes, and snippets.

@aamir814
Last active December 16, 2024 19:42
Show Gist options
  • Select an option

  • Save aamir814/092ed85bd90e28d79af029e561c1da88 to your computer and use it in GitHub Desktop.

Select an option

Save aamir814/092ed85bd90e28d79af029e561c1da88 to your computer and use it in GitHub Desktop.
Test sync_replication_slot to keep the logical replication running during a failover
#!/usr/bin/env bash
#####################################################################
# Purpose:
# The following steps show how to use the new PostgreSQL parameter
# sync_replication_slot introduced in version 17 to keep logical
# replication running during a failover or major upgrade.
#
# The setup includes three RDS PostgreSQL instances: Primary (PRI),
# Read Replica (RR), and Logical Replica (LR). After successfully
# creating the logical replication, the idea is to promote the RR
# (to simulate failover) and update the LR (subscriber) to point to
# the new primary instance without re-syncing the data.
#
# Assumption:
# * VPC Security group must exists and should allow your IP to connect to RDS Instances.
# * AWS CLI must be configured.
# * Must have connectivity to the RDS instance.
#
# REFERENCES
# - https://docs.aws.amazon.com/cli/latest/reference/rds/create-db-instance-read-replica.html
# - https://repost.aws/knowledge-center/rds-postgresql-use-logical-replication
# - https://docs.aws.amazon.com/cli/latest/reference/rds/#cli-aws-rds
# - https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/Appendix.PostgreSQL.CommonDBATasks.pglogical.slot.synchronization.html
# - https://www.postgresql.org/docs/current/sql-altersubscription.html
# - https://docs.aws.amazon.com/cli/latest/reference/rds/create-blue-green-deployment.html
#####################################################################
# change as needed
export DB_INSTANCE_NAME=pgtest-17
export DB_INSTANCE_NAME_LR=pgtest-17-lr
export DB_INSTANCE_NAME_RR=pgtest-17-rr
export VPC_SG_IDS=sg-06797e8215f149043
export DB_ENGINE=postgres
export DB_VERSION=17.1
export DB_PASSWORD=ChangeM3
export PGPASSWORD=$DB_PASSWORD
# create parameter group for primary and read replica
aws rds create-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME \
--db-parameter-group-family postgres17 \
--description "logical replication test with sync_replication_slots"
aws rds create-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_RR \
--db-parameter-group-family postgres17 \
--description "logical replication test with sync_replication_slots"
# create parameter group for logical replica
aws rds create-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_LR \
--db-parameter-group-family postgres17 \
--description "logical replication test with sync_replication_slots"
# modify the parameter group to set wal_level to logical.
# add the following DB parameters as needed. For this test default is good.
#max_wal_senders
#max_replication_slots
#max_connections
aws rds modify-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME \
--parameters "ParameterName='rds.logical_replication',ParameterValue=1,ApplyMethod=pending-reboot" \
"ParameterName='sync_replication_slots',ParameterValue=1,ApplyMethod=pending-reboot"
# Update rds.logical_slot_sync_dbname to desire database
aws rds modify-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_RR \
--parameters "ParameterName='rds.logical_replication',ParameterValue=1,ApplyMethod=pending-reboot" \
"ParameterName='hot_standby_feedback',ParameterValue='1',ApplyMethod=pending-reboot" \
"ParameterName='rds.logical_slot_sync_dbname',ParameterValue='postgres',ApplyMethod=pending-reboot" \
"ParameterName='sync_replication_slots',ParameterValue=1,ApplyMethod=pending-reboot"
# create instances with public access (not recommended in Production)
aws rds create-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME \
--allocated-storage 20 \
--storage-encrypted \
--db-instance-class db.t3.medium \
--engine $DB_ENGINE \
--master-username postgres \
--master-user-password $DB_PASSWORD \
--vpc-security-group-ids $VPC_SG_IDS \
--engine-version $DB_VERSION \
--no-multi-az
aws rds create-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_LR \
--allocated-storage 20 \
--storage-encrypted \
--db-instance-class db.t3.medium \
--engine $DB_ENGINE \
--master-username postgres \
--master-user-password $DB_PASSWORD \
--vpc-security-group-ids $VPC_SG_IDS \
--engine-version $DB_VERSION \
--no-multi-az
#create Read Replica
aws rds create-db-instance-read-replica \
--db-instance-identifier $DB_INSTANCE_NAME_RR \
--source-db-instance-identifier $DB_INSTANCE_NAME \
--vpc-security-group-ids $VPC_SG_IDS
# verfiy instances role
aws rds describe-db-instances \
--query "*[].{DBInstanceIdentifier:DBInstanceIdentifier,ReadReplica:ReadReplicaDBInstanceIdentifiers[0],ReadReplicaSourceDB:ReadReplicaSourceDBInstanceIdentifier}" \
--output table
:'
------------------------------------------------------------------
| DescribeDBInstances |
+----------------------+-----------------+-----------------------+
| DBInstanceIdentifier | ReadReplica | ReadReplicaSourceDB |
+----------------------+-----------------+-----------------------+
| upgtest-17 | upgtest-17-rr | None |
| upgtest-17-lr | None | None |
| upgtest-17-rr | None | upgtest-17 |
+----------------------+-----------------+-----------------------+
'
# get endpoint name to connect to instance
DB_PRI=$(aws rds describe-db-instances \
--db-instance-identifier $DB_INSTANCE_NAME \
--query "*[].Endpoint.Address" \
--output text)
DB_RR=$(aws rds describe-db-instances \
--db-instance-identifier $DB_INSTANCE_NAME_RR \
--query "*[].Endpoint.Address" \
--output text)
DB_LR=$(aws rds describe-db-instances \
--db-instance-identifier $DB_INSTANCE_NAME_LR \
--query "*[].Endpoint.Address" \
--output text)
# echo $DB_PRI $DB_RR $DB_LR
# get the physical slot name from the primary for the synchronized_standby_slots parameter
PHYSICAL_SLOT=$(psql -h $DB_PRI -U postgres -t -c "SELECT slot_name FROM pg_replication_slots where slot_type='physical'")
# echo $PHYSICAL_SLOT
# Modify primary parameter group with Physical standby slot name
aws rds modify-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME \
--parameters "ParameterName='synchronized_standby_slots',ParameterValue=$PHYSICAL_SLOT,ApplyMethod=pending-reboot"
# assign parameter groups to db instances
aws rds modify-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME \
--db-parameter-group-name pg-$DB_INSTANCE_NAME \
--apply-immediately
aws rds modify-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_RR \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_RR \
--apply-immediately
# restart instances
aws rds reboot-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME
aws rds reboot-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_RR
# assign parameter group to the logical replication instance
aws rds modify-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_LR \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_LR \
--apply-immediately
# restart LR instance
aws rds reboot-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_LR
# verify parameters
# These parameters must be set in order for LR to work
# after failover.
psql -h $DB_PRI -U postgres -c "SELECT name,setting FROM pg_settings WHERE name IN ('wal_level','rds.logical_replication','sync_replication_slots','synchronized_standby_slots')"
:' output
name | setting
----------------------------+---------------------------------------------
rds.logical_replication | on
sync_replication_slots | on
synchronized_standby_slots | rds_us_east_1_db_dn2uyr2436rexq3u7gcdfzw4hy
wal_level | logical
(4 rows)
'
# rds.logical_slot_sync_dbname is not available in pg_settings
# make sure to update it if your database is not "postgres."
psql -h $DB_RR -U postgres -c "SELECT name,setting FROM pg_settings WHERE name IN ('wal_level','rds.logical_replication','sync_replication_slots','hot_standby_feedback','rds.logical_slot_sync_dbname')"
:' output
name | setting
-------------------------+---------
hot_standby_feedback | on
rds.logical_replication | on
sync_replication_slots | on
wal_level | logical
(4 rows)
'
# Connect to the source database and create the tables and publication:
psql -h $DB_PRI -U postgres -c "CREATE TABLE reptab1 (slno int primary key);
CREATE TABLE reptab2 (name varchar(20));
--Insert data into the source tables:
INSERT INTO reptab1 VALUES (generate_series(1,1000));
INSERT INTO reptab2 SELECT SUBSTR ('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',((random()*(36-1)+1)::integer),1) FROM generate_series(1,50);
--Create a publication for the source tables. Use a SELECT query to verify the details of the publication that was created:
CREATE PUBLICATION testpub FOR TABLE reptab1,reptab2;
SELECT * FROM pg_publication;
--Verify that the source tables are added to the publication:
SELECT * FROM pg_publication_tables;
"
:'output
pubname | schemaname | tablename | attnames | rowfilter
---------+------------+-----------+----------+-----------
testpub | public | reptab1 | {slno} |
testpub | public | reptab2 | {name} |
(2 rows)
'
# Connect to the LR and create the target tables with the
# same names as the source tables.
# Be sure that there's no data present in the target tables by
# running a SELECT query on the target tables:
psql -h $DB_LR -U postgres -c "CREATE TABLE reptab1 (slno int primary key);
CREATE TABLE reptab2 (name varchar(20));"
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab1;"
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab2;"
# Create and verify subscriptions in the target database
# Important: To avoid storing a plaintext version of your user name and password in the database logs, before creating the subscription, run the following commands:
# SET log_min_messages to 'PANIC';
# SET log_statement to NONE;
# run the create subscription command on ec2 instance inside the same VPC
# so hostname resolves to private IP or
# modify the security group to allow connections from public IP addresses.
aws ec2 authorize-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_PRI | grep Address | tail -1 | awk '{print $2"/32"}')
aws ec2 authorize-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_RR | grep Address | tail -1 | awk '{print $2"/32"}')
aws ec2 authorize-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_LR | grep Address | tail -1 | awk '{print $2"/32"}')
# can only execute "create subscription" in the command line
psql -h $DB_LR -U postgres -c "CREATE SUBSCRIPTION testsub CONNECTION 'host=$DB_PRI port=5432 dbname=postgres user=postgres password=$DB_PASSWORD'
PUBLICATION testpub WITH (failover = true);"
:'output
NOTICE: created replication slot "testsub" on publisher
CREATE SUBSCRIPTION
'
# verify subscription
psql -h $DB_LR -U postgres -c "SELECT oid,subname,subenabled,subslotname,subpublications FROM pg_subscription;"
:'output
oid | subname | subenabled | subslotname | subpublications
-------+---------+------------+-------------+-----------------
24611 | testsub | t | testsub | {testpub}
(1 row)
'
# verify that data has been copied to LR
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab1;"
:'output
count
-------
1000
(1 row)
'
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab2;"
:'output
count
-------
50
(1 row)
'
# Verify the replication slot in the PRI database
psql -h $DB_PRI -U postgres -c "SELECT slot_name, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),restart_lsn)) AS replicationSlotLag,
active, failover, synced FROM pg_replication_slots"
:'output
slot_name | replicationslotlag | active | failover | synced
---------------------------------------------+--------------------+--------+----------+--------
testsub | 56 bytes | t | t | f
rds_us_east_1_db_dn2uyr2436rexq3u7gcdfzw4hy | 0 bytes | t | f | f
(2 rows)
'
# The logical slot also exist in the RR instance
psql -h $DB_RR -U postgres -c "select slot_name, slot_type, active, failover, synced from pg_replication_slots;"
:'output
slot_name | slot_type | active | failover | synced
-----------+-----------+--------+----------+--------
testsub | logical | f | t | t
(1 row)
'
# Test the CDC by inserting data in the source tables
psql -h $DB_PRI -U postgres -c "INSERT INTO reptab1 VALUES(generate_series(1001,2000));
INSERT INTO reptab2 SELECT SUBSTR('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',((random()*(36-1)+1)::integer),1) FROM generate_series(1,50);"
psql -h $DB_PRI -U postgres -c "SELECT count(*) FROM reptab1;"
:'output
count
-------
2000
(1 row)
'
psql -h $DB_PRI -U postgres -c "SELECT count(*) FROM reptab2;"
:'output
count
-------
100
(1 row)
'
# verifying number of rows in the LR matches
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab1;"
:'output
count
-------
2000
(1 row)
'
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab2;"
:'output
count
-------
100
(1 row)
'
######################################################
####################### TEST ########################
######################################################
# Now we have everything set up. Let's promote the RR and
# point LR to RR
# Promote RR and see if logical replication still works.
aws rds promote-read-replica \
--db-instance-identifier $DB_INSTANCE_NAME_RR
# Alter the subscription in the LR instance to point to RR (which is now writable)
psql -h $DB_LR -U postgres -c "ALTER SUBSCRIPTION testsub CONNECTION 'host=$DB_RR port=5432 dbname=postgres user=postgres password=$DB_PASSWORD';"
# test by inserting rows in the promoted RR
psql -h $DB_RR -U postgres -c "INSERT INTO reptab1 VALUES(generate_series(2001,3000));
INSERT INTO reptab2 SELECT SUBSTR('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789',((random()*(36-1)+1)::integer),1) FROM generate_series(1,50);"
# verify the row count in LR
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab1;"
:'output
count
-------
3000
(1 row)
'
psql -h $DB_LR -U postgres -c "SELECT count(*) FROM reptab2;"
:'output
count
-------
150
(1 row)
'
# The replication slots in DB_PRI is not active
psql -h $DB_PRI -U postgres -c "SELECT slot_name, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),restart_lsn)) AS replicationSlotLag,
active FROM pg_replication_slots"
:'output
slot_name | replicationslotlag | active
-----------+--------------------+--------
testsub | 192 MB | f
(1 row)
'
# The replication slots in DB_RR is active
psql -h $DB_RR -U postgres -c "SELECT slot_name, pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),restart_lsn)) AS replicationSlotLag,
active FROM pg_replication_slots"
:'output
slot_name | replicationslotlag | active
-----------+--------------------+--------
testsub | 56 bytes | t
(1 row)
'
######################################################
##################### CLEAN-UP #######################
######################################################
# remove security group inbound rules
aws ec2 revoke-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_PRI | grep Address | tail -1 | awk '{print $2"/32"}')
aws ec2 revoke-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_RR | grep Address | tail -1 | awk '{print $2"/32"}')
aws ec2 revoke-security-group-ingress \
--group-id $VPC_SG_IDS \
--protocol tcp \
--port 5432 \
--cidr $(nslookup $DB_LR | grep Address | tail -1 | awk '{print $2"/32"}')
# Delete instances
aws rds delete-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME \
--skip-final-snapshot
aws rds delete-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_RR \
--skip-final-snapshot
aws rds delete-db-instance \
--db-instance-identifier $DB_INSTANCE_NAME_LR \
--skip-final-snapshot
# Delete parameter groups
aws rds delete-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME
aws rds delete-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_RR
aws rds delete-db-parameter-group \
--db-parameter-group-name pg-$DB_INSTANCE_NAME_LR
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment