Optimize WAL insertion lock acquisition and release with some atomics
author    Michael Paquier <michael@paquier.xyz>
          Tue, 25 Jul 2023 04:38:58 +0000 (13:38 +0900)
committer Michael Paquier <michael@paquier.xyz>
          Tue, 25 Jul 2023 04:38:58 +0000 (13:38 +0900)
The WAL insertion lock variable insertingAt is currently read and
written under the LWLock wait list lock to avoid torn reads.  This
wait list lock can become a point of contention on highly concurrent
write workloads.
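For reference, the pre-patch read path in LWLockConflictsWithVar()
looked roughly like this (a condensed extract matching the removed
lines in the diff below); every reader of insertingAt had to take the
wait list lock:

	/* Pre-patch pattern: a plain uint64 guarded by the wait list lock */
	LWLockWaitListLock(lock);
	value = *valptr;
	LWLockWaitListUnlock(lock);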

This commit switches insertingAt to a 64-bit atomic variable that
provides torn-free reads and writes.  On platforms without 64-bit
atomic support, the fallback implementation uses spinlocks to provide
the same guarantees for the values read.  LWLockWaitForVar(), through
LWLockConflictsWithVar(), now reads the value with a u64 atomic
operation to check whether it still needs to wait.  LWLockUpdateVar()
updates the variable with an exchange_u64 (a full memory barrier)
before waking up the waiters.  LWLockReleaseClearVar() now also uses
an exchange_u64 to reset the variable.  Before this commit, all these
steps relied on LWLockWaitListLock() and LWLockWaitListUnlock().
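Schematically, the new access patterns look like this (a condensed
sketch of the code paths changed in the diff below, not a verbatim
extract):

	/* LWLockConflictsWithVar(): torn-free read, no wait list lock */
	value = pg_atomic_read_u64(valptr);

	/*
	 * LWLockUpdateVar(): publish the new value before scanning the
	 * wait queue; the exchange acts as a full memory barrier.
	 */
	pg_atomic_exchange_u64(valptr, val);

	/* LWLockReleaseClearVar(): reset the value, then release the lock */
	pg_atomic_exchange_u64(valptr, val);
	LWLockRelease(lock);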

This reduces contention on the LWLock wait list lock and improves
performance of highly concurrent write workloads.  Here are some
numbers using pg_logical_emit_message() (HEAD at d6677b93) with various
arbitrary record lengths and clients up to 4k on a rather large machine
(64 vCPUs, 512GB of RAM, 16 cores per socket, 2 sockets), in terms of
TPS numbers coming from pgbench:
 message_size_b     |     16 |     64 |    256 |   1024
--------------------+--------+--------+--------+-------
 patch_4_clients    |  83830 |  82929 |  80478 |  73131
 patch_16_clients   | 267655 | 264973 | 250566 | 213985
 patch_64_clients   | 380423 | 378318 | 356907 | 294248
 patch_256_clients  | 360915 | 354436 | 326209 | 263664
 patch_512_clients  | 332654 | 321199 | 287521 | 240128
 patch_1024_clients | 288263 | 276614 | 258220 | 217063
 patch_2048_clients | 252280 | 243558 | 230062 | 192429
 patch_4096_clients | 212566 | 213654 | 205951 | 166955
 head_4_clients     |  83686 |  83766 |  81233 |  73749
 head_16_clients    | 266503 | 265546 | 249261 | 213645
 head_64_clients    | 366122 | 363462 | 341078 | 261707
 head_256_clients   | 132600 | 132573 | 134392 | 165799
 head_512_clients   | 118937 | 114332 | 116860 | 150672
 head_1024_clients  | 133546 | 115256 | 125236 | 151390
 head_2048_clients  | 137877 | 117802 | 120909 | 138165
 head_4096_clients  | 113440 | 115611 | 120635 | 114361

Bharath has measured similar improvements, where the WAL insertion
lock starts becoming a bottleneck once more than 256 concurrent
clients are involved in this specific workload.

An extra patch has been discussed to introduce a fast-exit path in
LWLockUpdateVar() when there are no waiters, but this does not
influence the write-heavy workloads discussed above, as there are
always waiters in these cases.  This will be considered separately.
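As an illustration only, such a fast-exit path could look like the
following sketch, assuming the existing LW_FLAG_HAS_WAITERS bit of the
lock state is enough to detect that nobody is waiting (this is not
part of this commit):

	/* Hypothetical fast exit in LWLockUpdateVar(), for illustration */
	pg_atomic_exchange_u64(valptr, val);
	if ((pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) == 0)
		return;		/* no waiters, skip the wait list lock entirely */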

Author: Bharath Rupireddy
Reviewed-by: Nathan Bossart, Andres Freund, Michael Paquier
Discussion: https://postgr.es/m/CALj2ACVF+6jLvqKe6xhDzCCkr=rfd6upaGc3477Pji1Ke9G7Bg@mail.gmail.com

src/backend/access/transam/xlog.c
src/backend/storage/lmgr/lwlock.c
src/include/storage/lwlock.h

index 8b0710abe6037cec974b2f44f0fc21f9d6dc83b8..f7d4750fc0bd63982eee67c31e5b9ee7a1e5beb2 100644
@@ -376,7 +376,7 @@ typedef struct XLogwrtResult
 typedef struct
 {
        LWLock          lock;
-       XLogRecPtr      insertingAt;
+       pg_atomic_uint64 insertingAt;
        XLogRecPtr      lastImportantAt;
 } WALInsertLock;
 
@@ -4611,7 +4611,7 @@ XLOGShmemInit(void)
        for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
        {
                LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
-               WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
+               pg_atomic_init_u64(&WALInsertLocks[i].l.insertingAt, InvalidXLogRecPtr);
                WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
        }
 
index 01d738f306bb8d62c0aa58ba279392426d3f1e79..ffa865eb28ad25bb6e57298664fbf3751856159e 100644
@@ -1547,9 +1547,8 @@ LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
  * *result is set to true if the lock was free, and false otherwise.
  */
 static bool
-LWLockConflictsWithVar(LWLock *lock,
-                                          uint64 *valptr, uint64 oldval, uint64 *newval,
-                                          bool *result)
+LWLockConflictsWithVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 oldval,
+                                          uint64 *newval, bool *result)
 {
        bool            mustwait;
        uint64          value;
@@ -1572,13 +1571,10 @@ LWLockConflictsWithVar(LWLock *lock,
        *result = false;
 
        /*
-        * Read value using the lwlock's wait list lock, as we can't generally
-        * rely on atomic 64 bit reads/stores.  TODO: On platforms with a way to
-        * do atomic 64 bit reads/writes the spinlock should be optimized away.
+        * Reading this value atomically is safe even on platforms where uint64
+        * cannot be read without observing a torn value.
         */
-       LWLockWaitListLock(lock);
-       value = *valptr;
-       LWLockWaitListUnlock(lock);
+       value = pg_atomic_read_u64(valptr);
 
        if (value != oldval)
        {
@@ -1607,7 +1603,8 @@ LWLockConflictsWithVar(LWLock *lock,
  * in shared mode, returns 'true'.
  */
 bool
-LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+LWLockWaitForVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 oldval,
+                                uint64 *newval)
 {
        PGPROC     *proc = MyProc;
        int                     extraWaits = 0;
@@ -1735,29 +1732,32 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
  * LWLockUpdateVar - Update a variable and wake up waiters atomically
  *
  * Sets *valptr to 'val', and wakes up all processes waiting for us with
- * LWLockWaitForVar().  Setting the value and waking up the processes happen
- * atomically so that any process calling LWLockWaitForVar() on the same lock
- * is guaranteed to see the new value, and act accordingly.
+ * LWLockWaitForVar().  It first sets the value atomically and then wakes up
+ * waiting processes so that any process calling LWLockWaitForVar() on the same
+ * lock is guaranteed to see the new value, and act accordingly.
  *
  * The caller must be holding the lock in exclusive mode.
  */
 void
-LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+LWLockUpdateVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 val)
 {
        proclist_head wakeup;
        proclist_mutable_iter iter;
 
        PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
 
+       /*
+        * Note that pg_atomic_exchange_u64 is a full barrier, so we're guaranteed
+        * that the variable is updated before waking up waiters.
+        */
+       pg_atomic_exchange_u64(valptr, val);
+
        proclist_init(&wakeup);
 
        LWLockWaitListLock(lock);
 
        Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
 
-       /* Update the lock's value */
-       *valptr = val;
-
        /*
         * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
         * up. They are always in the front of the queue.
@@ -1873,17 +1873,13 @@ LWLockRelease(LWLock *lock)
  * LWLockReleaseClearVar - release a previously acquired lock, reset variable
  */
 void
-LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+LWLockReleaseClearVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 val)
 {
-       LWLockWaitListLock(lock);
-
        /*
-        * Set the variable's value before releasing the lock, that prevents race
-        * a race condition wherein a new locker acquires the lock, but hasn't yet
-        * set the variables value.
+        * Note that pg_atomic_exchange_u64 is a full barrier, so we're guaranteed
+        * that the variable is updated before releasing the lock.
         */
-       *valptr = val;
-       LWLockWaitListUnlock(lock);
+       pg_atomic_exchange_u64(valptr, val);
 
        LWLockRelease(lock);
 }
index 34169e5889e4f6d369b1e066e9cac1987c8489f4..d77410bdea70976519e7f6d1f10828d658f7b324 100644
@@ -129,14 +129,14 @@ extern bool LWLockAcquire(LWLock *lock, LWLockMode mode);
 extern bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode);
 extern bool LWLockAcquireOrWait(LWLock *lock, LWLockMode mode);
 extern void LWLockRelease(LWLock *lock);
-extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val);
+extern void LWLockReleaseClearVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 val);
 extern void LWLockReleaseAll(void);
 extern bool LWLockHeldByMe(LWLock *lock);
 extern bool LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride);
 extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode);
 
-extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval);
-extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val);
+extern bool LWLockWaitForVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 oldval, uint64 *newval);
+extern void LWLockUpdateVar(LWLock *lock, pg_atomic_uint64 *valptr, uint64 val);
 
 extern Size LWLockShmemSize(void);
 extern void CreateLWLocks(void);