summaryrefslogtreecommitdiff
path: root/src/test
diff options
context:
space:
mode:
authorAmit Kapila2024-04-03 08:34:59 +0000
committerAmit Kapila2024-04-03 08:34:59 +0000
commit2ec005b4e29740f0d36e6646d149af192328b2ff (patch)
tree666945f7acefb7bf88adb1a84ef22ce368581ae6 /src/test
parente37662f22158c29bc55eda4eda1757f444cf701a (diff)
Ensure that the sync slots reach a consistent state after promotion without losing data.
We were directly copying the LSN locations while syncing the slots on the standby. Now, it is possible that at some particular restart_lsn there are some running xacts, which means if we start reading the WAL from that location after promotion, we won't reach a consistent snapshot state at that point. However, on the primary, we would have already been in a consistent snapshot state at that restart_lsn so we would have just serialized the existing snapshot. To avoid this problem we will use the advance_slot functionality unless the snapshot already exists at the synced restart_lsn location. This will help us to ensure that snapbuilder/slot statuses are updated properly without generating any changes. Note that the synced slot will remain as RS_TEMPORARY till the decoding from corresponding restart_lsn can reach a consistent snapshot state after which they will be marked as RS_PERSISTENT. Per buildfarm Author: Hou Zhijie Reviewed-by: Bertrand Drouvot, Shveta Malik, Bharath Rupireddy, Amit Kapila Discussion: https://postgr.es/m/OS0PR01MB5716B3942AE49F3F725ACA92943B2@OS0PR01MB5716.jpnprd01.prod.outlook.com
Diffstat (limited to 'src/test')
-rw-r--r--src/test/recovery/t/040_standby_failover_slots_sync.pl97
1 files changed, 82 insertions, 15 deletions
diff --git a/src/test/recovery/t/040_standby_failover_slots_sync.pl b/src/test/recovery/t/040_standby_failover_slots_sync.pl
index f47bfd78eb6..869e3d2e914 100644
--- a/src/test/recovery/t/040_standby_failover_slots_sync.pl
+++ b/src/test/recovery/t/040_standby_failover_slots_sync.pl
@@ -366,6 +366,68 @@ ok( $stderr =~
$cascading_standby->stop;
##################################################
+# Create a failover slot and advance the restart_lsn to a position where a
+# running transaction exists. This setup is for testing that the synced slots
+# can achieve the consistent snapshot state starting from the restart_lsn
+# after promotion without losing any data that otherwise would have been
+# received from the primary.
+##################################################
+
+$primary->safe_psql('postgres',
+ "SELECT pg_create_logical_replication_slot('snap_test_slot', 'test_decoding', false, false, true);"
+);
+
+$standby1->safe_psql('postgres', "SELECT pg_sync_replication_slots();");
+
+# Two xl_running_xacts logs are generated here. When decoding the first log, it
+# only serializes the snapshot, without advancing the restart_lsn to the latest
+# position. This is because if a transaction is running, the restart_lsn can
+# only move to a position before that transaction. Hence, the second
+# xl_running_xacts log is needed, the decoding for which allows the restart_lsn
+# to advance to the last serialized snapshot's position (the first log).
+$primary->safe_psql(
+ 'postgres', qq(
+ BEGIN;
+ SELECT txid_current();
+ SELECT pg_log_standby_snapshot();
+ COMMIT;
+ BEGIN;
+ SELECT txid_current();
+ SELECT pg_log_standby_snapshot();
+ COMMIT;
+));
+
+$primary->wait_for_replay_catchup($standby1);
+
+# Advance the restart_lsn to the position of the first xl_running_xacts log
+# generated above. Note that there might be concurrent xl_running_xacts logs
+# written by the bgwriter, which could cause the position to be advanced to an
+# unexpected point, but that would be a rare scenario and doesn't affect the
+# test results.
+$primary->safe_psql('postgres',
+ "SELECT pg_replication_slot_advance('snap_test_slot', pg_current_wal_lsn());"
+);
+
+# Log a message that will be consumed on the standby after promotion using the
+# synced slot. See the test where we promote standby (Promote the standby1 to
+# primary.)
+$primary->safe_psql('postgres',
+ "SELECT pg_logical_emit_message(false, 'test', 'test');"
+);
+
+# Get the confirmed_flush_lsn for the logical slot snap_test_slot on the primary
+my $confirmed_flush_lsn = $primary->safe_psql('postgres',
+ "SELECT confirmed_flush_lsn from pg_replication_slots WHERE slot_name = 'snap_test_slot';");
+
+$standby1->safe_psql('postgres', "SELECT pg_sync_replication_slots();");
+
+# Verify that confirmed_flush_lsn of snap_test_slot slot is synced to the standby
+ok( $standby1->poll_query_until(
+ 'postgres',
+ "SELECT '$confirmed_flush_lsn' = confirmed_flush_lsn from pg_replication_slots WHERE slot_name = 'snap_test_slot' AND synced AND NOT temporary;"),
+ 'confirmed_flush_lsn of slot snap_test_slot synced to standby');
+
+##################################################
# Test to confirm that the slot synchronization is protected from malicious
# users.
##################################################
@@ -458,8 +520,8 @@ $standby1->wait_for_log(qr/slot sync worker started/,
$log_offset);
##################################################
-# Test to confirm that restart_lsn and confirmed_flush_lsn of the logical slot
-# on the primary is synced to the standby via the slot sync worker.
+# Test to confirm that confirmed_flush_lsn of the logical slot on the primary
+# is synced to the standby via the slot sync worker.
##################################################
# Insert data on the primary
@@ -479,8 +541,8 @@ $subscriber1->safe_psql(
$subscriber1->wait_for_subscription_sync;
-# Do not allow any further advancement of the restart_lsn and
-# confirmed_flush_lsn for the lsub1_slot.
+# Do not allow any further advancement of the confirmed_flush_lsn for the
+# lsub1_slot.
$subscriber1->safe_psql('postgres', "ALTER SUBSCRIPTION regress_mysub1 DISABLE");
# Wait for the replication slot to become inactive on the publisher
@@ -489,20 +551,15 @@ $primary->poll_query_until(
"SELECT COUNT(*) FROM pg_catalog.pg_replication_slots WHERE slot_name = 'lsub1_slot' AND active='f'",
1);
-# Get the restart_lsn for the logical slot lsub1_slot on the primary
-my $primary_restart_lsn = $primary->safe_psql('postgres',
- "SELECT restart_lsn from pg_replication_slots WHERE slot_name = 'lsub1_slot';");
-
# Get the confirmed_flush_lsn for the logical slot lsub1_slot on the primary
my $primary_flush_lsn = $primary->safe_psql('postgres',
"SELECT confirmed_flush_lsn from pg_replication_slots WHERE slot_name = 'lsub1_slot';");
-# Confirm that restart_lsn and confirmed_flush_lsn of lsub1_slot slot are synced
-# to the standby
+# Confirm that confirmed_flush_lsn of lsub1_slot slot is synced to the standby
ok( $standby1->poll_query_until(
'postgres',
- "SELECT '$primary_restart_lsn' = restart_lsn AND '$primary_flush_lsn' = confirmed_flush_lsn from pg_replication_slots WHERE slot_name = 'lsub1_slot' AND synced AND NOT temporary;"),
- 'restart_lsn and confirmed_flush_lsn of slot lsub1_slot synced to standby');
+ "SELECT '$primary_flush_lsn' = confirmed_flush_lsn from pg_replication_slots WHERE slot_name = 'lsub1_slot' AND synced AND NOT temporary;"),
+ 'confirmed_flush_lsn of slot lsub1_slot synced to standby');
##################################################
# Test that logical failover replication slots wait for the specified
@@ -744,8 +801,9 @@ $primary->reload;
##################################################
# Promote the standby1 to primary. Confirm that:
-# a) the slot 'lsub1_slot' is retained on the new primary
+# a) the slot 'lsub1_slot' and 'snap_test_slot' are retained on the new primary
# b) logical replication for regress_mysub1 is resumed successfully after failover
+# c) changes can be consumed from the synced slot 'snap_test_slot'
##################################################
$standby1->start;
$primary->wait_for_replay_catchup($standby1);
@@ -759,8 +817,8 @@ $subscriber1->safe_psql('postgres',
# Confirm the synced slot 'lsub1_slot' is retained on the new primary
is($standby1->safe_psql('postgres',
- q{SELECT slot_name FROM pg_replication_slots WHERE slot_name = 'lsub1_slot' AND synced AND NOT temporary;}),
- 'lsub1_slot',
+ q{SELECT count(*) = 2 FROM pg_replication_slots WHERE slot_name IN ('lsub1_slot', 'snap_test_slot') AND synced AND NOT temporary;}),
+ 't',
'synced slot retained on the new primary');
# Insert data on the new primary
@@ -773,4 +831,13 @@ is( $subscriber1->safe_psql('postgres', q{SELECT count(*) FROM tab_int;}),
"20",
'data replicated from the new primary');
+# Consume the data from the snap_test_slot. The synced slot should reach a
+# consistent point by restoring the snapshot at the restart_lsn serialized
+# during slot synchronization.
+$result = $standby1->safe_psql('postgres',
+ "SELECT count(*) FROM pg_logical_slot_get_changes('snap_test_slot', NULL, NULL) WHERE data ~ 'message*';"
+);
+
+is($result, '1', "data can be consumed using snap_test_slot");
+
done_testing();