Fix race condition between hot standby and restoring a full-page image.

author Heikki Linnakangas <heikki.linnakangas@iki.fi>

Thu, 13 Nov 2014 17:47:44 +0000 (19:47 +0200)

committer Heikki Linnakangas <heikki.linnakangas@iki.fi>

Thu, 13 Nov 2014 18:01:18 +0000 (20:01 +0200)
author Heikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 13 Nov 2014 17:47:44 +0000 (19:47 +0200)
committer Heikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 13 Nov 2014 18:01:18 +0000 (20:01 +0200)
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c

index 7e828d2875c581ee1807407e8ab24613c7742075..8b349cfaf87d2655b78e3f32f98a5c2f634a065d 100644 (file)
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
     if (blkno == P_NEW)
         elog(ERROR, "hash AM does not use P_NEW");
  
-   buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
-
-   LockBuffer(buf, HASH_WRITE);
+   buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
+                            NULL);
  
     /* ref count and lock type are correct */
  
@@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
         if (BufferGetBlockNumber(buf) != blkno)
             elog(ERROR, "unexpected hash relation size: %u, should be %u",
                  BufferGetBlockNumber(buf), blkno);
+       LockBuffer(buf, HASH_WRITE);
     }
     else
-       buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
-
-   LockBuffer(buf, HASH_WRITE);
+   {
+       buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
+                                NULL);
+   }
  
     /* ref count and lock type are correct */
  
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c

index 8f167180efe03b51d6784de7339c2c42d56a9a6a..2a494f59f4cf44a1c4a6751645d7920733d05ce3 100644 (file)
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -4884,9 +4884,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
      * not do anything that assumes we are touching a heap.
      */
     buffer = XLogReadBufferExtended(xlrec->node, xlrec->forknum, xlrec->blkno,
-                                   RBM_ZERO);
+                                   RBM_ZERO_AND_LOCK);
     Assert(BufferIsValid(buffer));
-   LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
     page = (Page) BufferGetPage(buffer);
  
     Assert(record->xl_len == SizeOfHeapNewpage + BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index 5a6b2a389e0d30d320d3aa337e80d6bbaa01fd62..1b87df6784a174c0dd234e9069529519f70dc8da 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3813,12 +3813,8 @@ RestoreBackupBlock(XLogRecPtr lsn, XLogRecord *record, int block_index,
         {
             /* Found it, apply the update */
             buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
-                                           RBM_ZERO);
+                                           get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
             Assert(BufferIsValid(buffer));
-           if (get_cleanup_lock)
-               LockBufferForCleanup(buffer);
-           else
-               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
  
             page = (Page) BufferGetPage(buffer);
  
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c

index e9134be3d3509c70af4406652f1c0289f06f6243..a9d27f6d14fa12b2cc3c9a4bbd654c293aefd744 100644 (file)
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -257,7 +257,8 @@ XLogCheckInvalidPages(void)
   * The returned buffer is exclusively-locked.
   *
   * For historical reasons, instead of a ReadBufferMode argument, this only
- * supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
+ * supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
+ * modes.
   */
  Buffer
  XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
@@ -265,8 +266,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
     Buffer      buf;
  
     buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
-                                init ? RBM_ZERO : RBM_NORMAL);
-   if (BufferIsValid(buf))
+                                init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
+   if (BufferIsValid(buf) && !init)
         LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  
     return buf;
@@ -285,8 +286,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
   * dropped or truncated. If we don't see evidence of that later in the WAL
   * sequence, we'll complain at the end of WAL replay.)
   *
- * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
- * relation is extended with all-zeroes pages up to the given block number.
+ * In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
+ * with all-zeroes pages up to the given block number.
   *
   * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
   * exist, and we don't check for all-zeroes.  Thus, no log entry is made
@@ -340,7 +341,11 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
         do
         {
             if (buffer != InvalidBuffer)
+           {
+               if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+                   LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
                 ReleaseBuffer(buffer);
+           }
             buffer = ReadBufferWithoutRelcache(rnode, forknum,
                                                P_NEW, mode, NULL);
         }
@@ -348,6 +353,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
         /* Handle the corner case that P_NEW returns non-consecutive pages */
         if (BufferGetBlockNumber(buffer) != blkno)
         {
+           if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+               LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
             ReleaseBuffer(buffer);
             buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
                                                mode, NULL);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c

index 34f36b4e8dbb8efda804a3bf1a5a9acce38c43d8..d46c36c7f26b185f81b9d8831b5968077aa3b80c 100644 (file)
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -210,14 +210,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
   * valid, the page is zeroed instead of throwing an error. This is intended
   * for non-critical data, where the caller is prepared to repair errors.
   *
- * In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
- * with zeros instead of reading it from disk.  Useful when the caller is
- * going to fill the page from scratch, since this saves I/O and avoids
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk.  Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
   * unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
   * Caution: do not use this mode to read a page that is beyond the relation's
   * current physical EOF; that is likely to cause problems in md.c when
   * the page is modified and written out. P_NEW is OK, though.
   *
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
   * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
   *
   * If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -359,6 +364,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
                                               isExtend,
                                               found);
  
+           /*
+            * In RBM_ZERO_AND_LOCK mode, the caller expects the buffer to
+            * be already locked on return.
+            */
+           if (!isLocalBuf)
+           {
+               if (mode == RBM_ZERO_AND_LOCK)
+                   LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+               else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+                   LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+           }
+
             return BufferDescriptorGetBuffer(bufHdr);
         }
  
@@ -439,8 +456,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
          * Read in the page, unless the caller intends to overwrite it and
          * just wants us to allocate a buffer.
          */
-       if (mode == RBM_ZERO)
+       if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK ||
+           mode == RBM_DO_NOT_USE)
+       {
             MemSet((char *) bufBlock, 0, BLCKSZ);
+       }
         else
         {
             instr_time  io_start,
@@ -481,6 +501,19 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
         }
     }
  
+   /*
+    * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
+    * the page as valid, to make sure that no other backend sees the zeroed
+    * page before the caller has had a chance to initialize it.
+    *
+    * Since no-one else can be looking at the page contents yet, there is no
+    * difference between an exclusive lock and a cleanup-strength lock.
+    * (Note that we cannot use LockBuffer() of LockBufferForCleanup() here,
+    * because they assert that the buffer is already valid.)
+    */
+   if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+       LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
+
     if (isLocalBuf)
     {
         /* Only need to adjust flags */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h

index 50c324957c3a61add916093cadb17a7c9b5ba523..4e2bd8866a9c538701b1e4551e44b662829286aa 100644 (file)
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -36,11 +36,16 @@ typedef enum BufferAccessStrategyType
  typedef enum
  {
     RBM_NORMAL,                 /* Normal read */
-   RBM_ZERO,                   /* Don't read from disk, caller will
-                                * initialize */
+   RBM_DO_NOT_USE,             /* This used to be RBM_ZERO. Only kept for
+                                * binary compatibility with 3rd party
+                                * extensions. */
     RBM_ZERO_ON_ERROR,          /* Read, but return an all-zeros page on error */
-   RBM_NORMAL_NO_LOG           /* Don't log page as invalid during WAL
+   RBM_NORMAL_NO_LOG,          /* Don't log page as invalid during WAL
                                  * replay; otherwise same as RBM_NORMAL */
+   RBM_ZERO_AND_LOCK,          /* Don't read from disk, caller will
+                                * initialize. Also locks the page. */
+   RBM_ZERO_AND_CLEANUP_LOCK   /* Like RBM_ZERO_AND_LOCK, but locks the page
+                                * in "cleanup" mode */
  } ReadBufferMode;
  
  /* in globals.c ... this duplicates miscadmin.h */
author	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Thu, 13 Nov 2014 17:47:44 +0000 (19:47 +0200)
committer	Heikki Linnakangas <heikki.linnakangas@iki.fi>
	Thu, 13 Nov 2014 18:01:18 +0000 (20:01 +0200)
src/backend/access/hash/hashpage.c		patch \| blob \| blame \| history
src/backend/access/heap/heapam.c		patch \| blob \| blame \| history
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/backend/access/transam/xlogutils.c		patch \| blob \| blame \| history
src/backend/storage/buffer/bufmgr.c		patch \| blob \| blame \| history
src/include/storage/bufmgr.h		patch \| blob \| blame \| history