Add temporary debug info to help debug 019_replslot_limit.pl failures.
authorAndres Freund <andres@anarazel.de>
Wed, 23 Feb 2022 02:02:34 +0000 (18:02 -0800)
committerAndres Freund <andres@anarazel.de>
Wed, 23 Feb 2022 02:02:34 +0000 (18:02 -0800)
I have not been able to reproduce the occasional failures of
019_replslot_limit.pl we are seeing in the buildfarm and not for lack of
trying. The additional logging and increased log level will hopefully help.

Will be reverted once the cause is identified.

Discussion: https://postgr.es/m/20220218231415.c4plkp4i3reqcwip@alap3.anarazel.de

src/backend/replication/slot.c
src/bin/pg_basebackup/pg_basebackup.c
src/test/recovery/t/019_replslot_limit.pl

index 5da5fa825a2dec335b3627f9e478f4db0bef87c8..3d39fddaaef2b420b53ad8f4ddbf5d03f1c773ea 100644 (file)
@@ -177,6 +177,10 @@ ReplicationSlotInitialize(void)
 static void
 ReplicationSlotShmemExit(int code, Datum arg)
 {
+       /* temp debugging aid to analyze 019_replslot_limit failures */
+       elog(DEBUG3, "replication slot exit hook, %s active slot",
+                MyReplicationSlot != NULL ? "with" : "without");
+
        /* Make sure active replication slots are released */
        if (MyReplicationSlot != NULL)
                ReplicationSlotRelease();
@@ -554,6 +558,9 @@ ReplicationSlotCleanup(void)
        Assert(MyReplicationSlot == NULL);
 
 restart:
+       /* temp debugging aid to analyze 019_replslot_limit failures */
+       elog(DEBUG3, "temporary replication slot cleanup: begin");
+
        LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
        for (i = 0; i < max_replication_slots; i++)
        {
@@ -579,6 +586,8 @@ restart:
        }
 
        LWLockRelease(ReplicationSlotControlLock);
+
+       elog(DEBUG3, "temporary replication slot cleanup: done");
 }
 
 /*
@@ -1284,6 +1293,12 @@ InvalidatePossiblyObsoleteSlot(ReplicationSlot *s, XLogRecPtr oldestLSN,
                                (void) kill(active_pid, SIGTERM);
                                last_signaled_pid = active_pid;
                        }
+                       else
+                       {
+                               /* temp debugging aid to analyze 019_replslot_limit failures */
+                               elog(DEBUG3, "not signalling process %d during invalidation of slot \"%s\"",
+                                        active_pid, NameStr(slotname));
+                       }
 
                        /* Wait until the slot is released. */
                        ConditionVariableSleep(&s->active_cv,
@@ -1347,6 +1362,10 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno)
        XLogSegNoOffsetToRecPtr(oldestSegno, 0, wal_segment_size, oldestLSN);
 
 restart:
+       /* temp debugging aid to analyze 019_replslot_limit failures */
+       elog(DEBUG3, "begin invalidating obsolete replication slots older than %X/%X",
+                LSN_FORMAT_ARGS(oldestLSN));
+
        LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
        for (int i = 0; i < max_replication_slots; i++)
        {
@@ -1372,6 +1391,8 @@ restart:
                ReplicationSlotsComputeRequiredLSN();
        }
 
+       elog(DEBUG3, "done invalidating obsolete replication slots");
+
        return invalidated;
 }
 
index 08b07d5a06e90a79672dc165da8fb51a2531311e..8c77c533e64c42cd86a8b839e78c9db76619ebde 100644 (file)
@@ -700,8 +700,16 @@ StartLogStreamer(char *startpos, uint32 timeline, char *sysidentifier)
        bgchild = fork();
        if (bgchild == 0)
        {
+               int                     ret;
+
                /* in child process */
-               exit(LogStreamerMain(param));
+               ret = LogStreamerMain(param);
+
+               /* temp debugging aid to analyze 019_replslot_limit failures */
+               if (verbose)
+                       pg_log_info("log streamer with pid %d exiting", getpid());
+
+               exit(ret);
        }
        else if (bgchild < 0)
        {
index 4257bd4d35a821f02508f83d04af709f2d6615b0..0c9da9bf27218746cd7957bd564fef7eda90df09 100644 (file)
@@ -316,13 +316,16 @@ $node_primary3->append_conf(
        max_wal_size = 2MB
        log_checkpoints = yes
        max_slot_wal_keep_size = 1MB
+
+       # temp debugging aid to analyze 019_replslot_limit failures
+       log_min_messages=debug3
        ));
 $node_primary3->start;
 $node_primary3->safe_psql('postgres',
        "SELECT pg_create_physical_replication_slot('rep3')");
 # Take backup
 $backup_name = 'my_backup';
-$node_primary3->backup($backup_name);
+$node_primary3->backup($backup_name, backup_options => ['--verbose']);
 # Create standby
 my $node_standby3 = PostgreSQL::Test::Cluster->new('standby_3');
 $node_standby3->init_from_backup($node_primary3, $backup_name,