Use condition variables to wait for checkpoints.
author Thomas Munro <tmunro@postgresql.org>
Wed, 13 Mar 2019 21:25:27 +0000 (10:25 +1300)
committer Thomas Munro <tmunro@postgresql.org>
Wed, 13 Mar 2019 21:59:33 +0000 (10:59 +1300)
Previously we used a polling/sleeping loop to wait for checkpoints
to begin and end, which leads to up to a couple hundred milliseconds
of needless thumb-twiddling.  Use condition variables instead.

Author: Thomas Munro
Reviewed-by: Andres Freund
Discussion: https://postgr.es/m/CA%2BhUKGLY7sDe%2Bbg1K%3DbnEzOofGoo4bJHYh9%2BcDCXJepb6DQmLw%40mail.gmail.com

doc/src/sgml/monitoring.sgml
src/backend/postmaster/checkpointer.c
src/backend/postmaster/pgstat.c
src/include/pgstat.h

index e2630fd3682ab8bb99f0ced32b05d17e65b34976..60b89356f709981827d86cce8dc7379aebc3d821 100644 (file)
@@ -1281,7 +1281,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry>Waiting in an extension.</entry>
         </row>
         <row>
-         <entry morerows="34"><literal>IPC</literal></entry>
+         <entry morerows="36"><literal>IPC</literal></entry>
          <entry><literal>BgWorkerShutdown</literal></entry>
          <entry>Waiting for background worker to shut down.</entry>
         </row>
@@ -1293,6 +1293,14 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry><literal>BtreePage</literal></entry>
          <entry>Waiting for the page number needed to continue a parallel B-tree scan to become available.</entry>
         </row>
+        <row>
+         <entry><literal>CheckpointDone</literal></entry>
+         <entry>Waiting for a checkpoint to complete.</entry>
+        </row>
+        <row>
+         <entry><literal>CheckpointStart</literal></entry>
+         <entry>Waiting for a checkpoint to start.</entry>
+        </row>
         <row>
          <entry><literal>ClogGroupUpdate</literal></entry>
          <entry>Waiting for group leader to update transaction status at transaction end.</entry>
index fe96c41359b70b6f2a82196c41eea7e33b5478fb..3d5b382d048d778a3d4549669a9a1be851493981 100644 (file)
@@ -126,6 +126,9 @@ typedef struct
 
        int                     ckpt_flags;             /* checkpoint flags, as defined in xlog.h */
 
+       ConditionVariable start_cv; /* signaled when ckpt_started advances */
+       ConditionVariable done_cv;      /* signaled when ckpt_done advances */
+
        uint32          num_backend_writes; /* counts user backend buffer writes */
        uint32          num_backend_fsync;      /* counts user backend fsync calls */
 
@@ -428,6 +431,8 @@ CheckpointerMain(void)
                        CheckpointerShmem->ckpt_started++;
                        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
 
+                       ConditionVariableBroadcast(&CheckpointerShmem->start_cv);
+
                        /*
                         * The end-of-recovery checkpoint is a real checkpoint that's
                         * performed while we're still in recovery.
@@ -488,6 +493,8 @@ CheckpointerMain(void)
                        CheckpointerShmem->ckpt_done = CheckpointerShmem->ckpt_started;
                        SpinLockRelease(&CheckpointerShmem->ckpt_lck);
 
+                       ConditionVariableBroadcast(&CheckpointerShmem->done_cv);
+
                        if (ckpt_performed)
                        {
                                /*
@@ -915,6 +922,8 @@ CheckpointerShmemInit(void)
                MemSet(CheckpointerShmem, 0, size);
                SpinLockInit(&CheckpointerShmem->ckpt_lck);
                CheckpointerShmem->max_requests = NBuffers;
+               ConditionVariableInit(&CheckpointerShmem->start_cv);
+               ConditionVariableInit(&CheckpointerShmem->done_cv);
        }
 }
 
@@ -1023,6 +1032,7 @@ RequestCheckpoint(int flags)
                                        new_failed;
 
                /* Wait for a new checkpoint to start. */
+               ConditionVariablePrepareToSleep(&CheckpointerShmem->start_cv);
                for (;;)
                {
                        SpinLockAcquire(&CheckpointerShmem->ckpt_lck);
@@ -1032,13 +1042,15 @@ RequestCheckpoint(int flags)
                        if (new_started != old_started)
                                break;
 
-                       CHECK_FOR_INTERRUPTS();
-                       pg_usleep(100000L);
+                       ConditionVariableSleep(&CheckpointerShmem->start_cv,
+                                                                  WAIT_EVENT_CHECKPOINT_START);
                }
+               ConditionVariableCancelSleep();
 
                /*
                 * We are waiting for ckpt_done >= new_started, in a modulo sense.
                 */
+               ConditionVariablePrepareToSleep(&CheckpointerShmem->done_cv);
                for (;;)
                {
                        int                     new_done;
@@ -1051,9 +1063,10 @@ RequestCheckpoint(int flags)
                        if (new_done - new_started >= 0)
                                break;
 
-                       CHECK_FOR_INTERRUPTS();
-                       pg_usleep(100000L);
+                       ConditionVariableSleep(&CheckpointerShmem->done_cv,
+                                                                  WAIT_EVENT_CHECKPOINT_DONE);
                }
+               ConditionVariableCancelSleep();
 
                if (new_failed != old_failed)
                        ereport(ERROR,
index ba31f532ea4e00e3f1e20a4901c873b061a83cd9..2fbfadd9f0c755cb623e04010dcf2145c6048457 100644 (file)
@@ -3623,6 +3623,12 @@ pgstat_get_wait_ipc(WaitEventIPC w)
                case WAIT_EVENT_BTREE_PAGE:
                        event_name = "BtreePage";
                        break;
+               case WAIT_EVENT_CHECKPOINT_DONE:
+                       event_name = "CheckpointDone";
+                       break;
+               case WAIT_EVENT_CHECKPOINT_START:
+                       event_name = "CheckpointStart";
+                       break;
                case WAIT_EVENT_CLOG_GROUP_UPDATE:
                        event_name = "ClogGroupUpdate";
                        break;
index 725c8b0d64a9515342f653071530fea54b2fb0fa..ea6cc8b560f221b8e3314dfb4bc5bf8441006bc7 100644 (file)
@@ -817,6 +817,8 @@ typedef enum
        WAIT_EVENT_BGWORKER_STARTUP,
        WAIT_EVENT_BTREE_PAGE,
        WAIT_EVENT_CLOG_GROUP_UPDATE,
+       WAIT_EVENT_CHECKPOINT_DONE,
+       WAIT_EVENT_CHECKPOINT_START,
        WAIT_EVENT_EXECUTE_GATHER,
        WAIT_EVENT_HASH_BATCH_ALLOCATING,
        WAIT_EVENT_HASH_BATCH_ELECTING,