author     Noah Misch    2025-04-20 15:28:48 +0000
committer  Noah Misch    2025-04-20 15:28:48 +0000
commit     714bd9e3a733a98da47531a750182ee5981cc3ed
tree       6d1c062a2943654b2511156a084408c19d1f1c30
parent     2d5350cfbdc6e61124214d68713295bb060bc541
Test restartpoints in archive recovery.
v14 commit 1f95181b44c843729caaa688f74babe9403b5850 and its v13 equivalent
caused timing-dependent failures in archive recovery, at restartpoints.  The
symptom was "invalid magic number 0000 in log segment X, offset 0",
"unexpected pageaddr X in log segment Y, offset 0" [X < Y], or an assertion
failure.  Commit 3635a0a35aafd3bfa80b7a809bc6e91ccd36606a and predecessors
back-patched v15 changes to fix that.  This test reproduces the problem
probabilistically, typically in less than 1000 iterations of the test.
Hence, buildfarm and CI runs would have surfaced enough failures to get
attention within a day.

Reported-by: Arun Thirupathi <arunth@google.com>
Discussion: https://postgr.es/m/20250306193013.36.nmisch@google.com
Backpatch-through: 13
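[Editor's note, not part of the commit message: since the failure is timing-dependent, one way to observe it locally is to rerun the new test in a loop from a built source tree. The sketch below is an assumption-laden illustration, in Perl for consistency with the test itself; the iteration count, the relative path, and the use of the PROVE_TESTS make variable to select a single TAP script are not taken from the commit.]

use strict;
use warnings;

# Hypothetical driver: rerun the new test until the timing-dependent
# failure appears; the commit message suggests it typically reproduces
# in fewer than 1000 iterations.
for my $i (1 .. 1000)
{
	my $rc = system('make', '-C', 'src/test/recovery', 'check',
		'PROVE_TESTS=t/045_archive_restartpoint.pl');
	die "test failed on iteration $i\n" if $rc != 0;
}
print "no failure in 1000 iterations\n";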
-rw-r--r--  src/test/recovery/meson.build                      1
-rw-r--r--  src/test/recovery/t/045_archive_restartpoint.pl   57
2 files changed, 58 insertions(+), 0 deletions(-)
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 057bcde1434..cb983766c67 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -53,6 +53,7 @@ tests += {
't/042_low_level_backup.pl',
't/043_no_contrecord_switch.pl',
't/044_invalidate_inactive_slots.pl',
+ 't/045_archive_restartpoint.pl',
],
},
}
diff --git a/src/test/recovery/t/045_archive_restartpoint.pl b/src/test/recovery/t/045_archive_restartpoint.pl
new file mode 100644
index 00000000000..b143bc4e1d4
--- /dev/null
+++ b/src/test/recovery/t/045_archive_restartpoint.pl
@@ -0,0 +1,57 @@
+
+# Copyright (c) 2024-2025, PostgreSQL Global Development Group
+
+# Test restartpoints during archive recovery.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $archive_max_mb = 320;
+my $wal_segsize = 1;
+
+# Initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(
+ has_archiving => 1,
+ allows_streaming => 1,
+ extra => [ '--wal-segsize' => $wal_segsize ]);
+$node_primary->start;
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+$node_primary->safe_psql('postgres',
+ ('DO $$BEGIN FOR i IN 1..' . $archive_max_mb / $wal_segsize)
+ . ' LOOP CHECKPOINT; PERFORM pg_switch_wal(); END LOOP; END$$;');
+
+# Force archiving of WAL file containing recovery target
+my $until_lsn = $node_primary->lsn('write');
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()");
+$node_primary->stop;
+
+# Archive recovery
+my $node_restore = PostgreSQL::Test::Cluster->new('restore');
+$node_restore->init_from_backup($node_primary, $backup_name,
+ has_restoring => 1);
+$node_restore->append_conf('postgresql.conf',
+ "recovery_target_lsn = '$until_lsn'");
+$node_restore->append_conf('postgresql.conf',
+ 'recovery_target_action = pause');
+$node_restore->append_conf('postgresql.conf',
+ 'max_wal_size = ' . 2 * $wal_segsize);
+$node_restore->append_conf('postgresql.conf', 'log_checkpoints = on');
+
+$node_restore->start;
+
+# Wait until restore has replayed enough data
+my $caughtup_query =
+ "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
+$node_restore->poll_query_until('postgres', $caughtup_query)
+ or die "Timed out while waiting for restore to catch up";
+
+$node_restore->stop;
+ok(1, 'restore caught up');
+
+done_testing();
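
[Editor's note, not part of the patch: when the bug is present, the symptoms quoted in the commit message show up in the restore node's server log. A hypothetical extra check along the following lines, placed before done_testing() in the test above, would make that explicit; it reuses $node_restore from the test, the logfile method of PostgreSQL::Test::Cluster, and slurp_file from PostgreSQL::Test::Utils.]

# Hypothetical addition: scan the restore node's log for the stale-page
# symptoms described in the commit message.
my $restore_log = slurp_file($node_restore->logfile);
unlike(
	$restore_log,
	qr/invalid magic number|unexpected pageaddr/,
	'no stale-WAL-page symptoms during archive recovery');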