Add logging for excessive ProcSignalBarrier waits.
author: Thomas Munro <tmunro@postgresql.org>
Wed, 11 May 2022 06:03:03 +0000 (18:03 +1200)
committer: Thomas Munro <tmunro@postgresql.org>
Wed, 11 May 2022 06:03:03 +0000 (18:03 +1200)
To enable diagnosis of systems that are not processing ProcSignalBarrier
requests promptly, add a LOG message every 5 seconds if we seem to be
wedged.  Although you could already see this state as a wait event in
pg_stat_activity, the log message also shows the PID of the process that
is preventing progress.

Also add DEBUG1 logging around the whole wait loop.

Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://postgr.es/m/CA%2BTgmoYJ03r5359gQutRGP9BtigYCg3_UskcmnVjBf-QO3-0pQ%40mail.gmail.com

src/backend/storage/ipc/procsignal.c

index 00d66902d8bfd8aa1ece8e456c2136aa1b6ffe5a..21a9fc0fdd2edc18d7160d1500b4e86994742383 100644 (file)
@@ -393,6 +393,11 @@ WaitForProcSignalBarrier(uint64 generation)
 {
        Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));
 
+       elog(DEBUG1,
+                "waiting for all backends to process ProcSignalBarrier generation "
+                UINT64_FORMAT,
+                generation);
+
        for (int i = NumProcSignalSlots - 1; i >= 0; i--)
        {
                ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
@@ -407,13 +412,22 @@ WaitForProcSignalBarrier(uint64 generation)
                oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
                while (oldval < generation)
                {
-                       ConditionVariableSleep(&slot->pss_barrierCV,
-                                                                  WAIT_EVENT_PROC_SIGNAL_BARRIER);
+                       if (ConditionVariableTimedSleep(&slot->pss_barrierCV,
+                                                                                       5000,
+                                                                                       WAIT_EVENT_PROC_SIGNAL_BARRIER))
+                               ereport(LOG,
+                                               (errmsg("still waiting for backend with PID %lu to accept ProcSignalBarrier",
+                                                               (unsigned long) slot->pss_pid)));
                        oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
                }
                ConditionVariableCancelSleep();
        }
 
+       elog(DEBUG1,
+                "finished waiting for all backends to process ProcSignalBarrier generation "
+                UINT64_FORMAT,
+                generation);
+
        /*
         * The caller is probably calling this function because it wants to read
         * the shared state or perform further writes to shared state once all