Fix checkpoint signalling
authorAlvaro Herrera <alvherre@alvh.no-ip.org>
Wed, 29 Apr 2020 22:46:42 +0000 (18:46 -0400)
committerAlvaro Herrera <alvherre@alvh.no-ip.org>
Wed, 29 Apr 2020 22:46:42 +0000 (18:46 -0400)
Checkpointer uses its MyLatch to wake up when a checkpoint request is
received.  But before commit c6550776394e the latch was not used for
anything else, so the code could just go to sleep after each loop
without rechecking the sleeping condition.  That commit added a separate
ResetLatch in its code path[1], which can cause a checkpoint to go
unnoticed for potentially a long time.

Fix by skipping sleep if any checkpoint flags are set.  Also add a test
to verify this; authored by Kyotaro Horiguchi.

[1] CreateCheckPoint -> InvalidateObsoleteReplicationSlots ->
ConditionVariableTimeSleep

Report and diagnosis by Kyotaro Horiguchi.
Co-authored-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Co-authored-by: Álvaro Herrera <alvherre@alvh.no-ip.org>
Discussion: https://postgr.es/m/20200408.141956.891237856186513376.horikyota.ntt@gmail.com

src/backend/postmaster/checkpointer.c
src/test/recovery/t/019_replslot_limit.pl

index e354a78725eab03a364df836b0189560f6620905..2b552a7ff90c6e7c3f45123d8312c1b07ceb31e3 100644 (file)
@@ -494,6 +494,13 @@ CheckpointerMain(void)
                 */
                pgstat_send_bgwriter();
 
+               /*
+                * If any checkpoint flags have been set, redo the loop to handle the
+                * checkpoint without sleeping.
+                */
+               if (((volatile CheckpointerShmemStruct *) CheckpointerShmem)->ckpt_flags)
+                       continue;
+
                /*
                 * Sleep until we are signaled or it's time for another checkpoint or
                 * xlog file switch.
index 32dce545229fbf90b219e1a39e2e6e00ab19bae2..634f2bec8bb9e26e45c6ee49e59ade528fb41141 100644 (file)
@@ -8,7 +8,7 @@ use TestLib;
 use PostgresNode;
 
 use File::Path qw(rmtree);
-use Test::More tests => 13;
+use Test::More tests => 14;
 use Time::HiRes qw(usleep);
 
 $ENV{PGDATABASE} = 'postgres';
@@ -179,7 +179,47 @@ for (my $i = 0; $i < 10000; $i++)
 }
 ok($failed, 'check that replication has been broken');
 
-$node_standby->stop;
+$node_master->stop('immediate');
+$node_standby->stop('immediate');
+
+my $node_master2 = get_new_node('master2');
+$node_master2->init(allows_streaming => 1);
+$node_master2->append_conf(
+       'postgresql.conf', qq(
+min_wal_size = 32MB
+max_wal_size = 32MB
+log_checkpoints = yes
+));
+$node_master2->start;
+$node_master2->safe_psql('postgres',
+       "SELECT pg_create_physical_replication_slot('rep1')");
+$backup_name = 'my_backup2';
+$node_master2->backup($backup_name);
+
+$node_master2->stop;
+$node_master2->append_conf(
+       'postgresql.conf', qq(
+max_slot_wal_keep_size = 0
+));
+$node_master2->start;
+
+$node_standby = get_new_node('standby_2');
+$node_standby->init_from_backup($node_master2, $backup_name,
+       has_streaming => 1);
+$node_standby->append_conf('postgresql.conf', "primary_slot_name = 'rep1'");
+$node_standby->start;
+my @result =
+  split(
+       '\n',
+       $node_master2->safe_psql(
+               'postgres',
+               "CREATE TABLE tt();
+                DROP TABLE tt;
+                SELECT pg_switch_wal();
+                CHECKPOINT;
+                SELECT 'finished';",
+               timeout => '60'));
+is($result[1], 'finished', 'check if checkpoint command is not blocked');
 
 #####################################
 # Advance WAL of $node by $n segments