#define RELS_BSEARCH_THRESHOLD 20
+/*
+ * This is the size (in number of blocks) above which we scan the entire
+ * buffer pool to remove the buffers for all the pages of a relation being
+ * dropped. For relations smaller than this threshold, we find the buffers
+ * by doing lookups in the BufMapping table.
+ */
+#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
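+
+/*
+ * For example, with shared_buffers = 128MB (NBuffers = 16384 8kB buffers),
+ * the threshold works out to 512 blocks: a drop that invalidates fewer
+ * than 512 blocks (4MB) is handled via BufMapping lookups instead of a
+ * full buffer pool scan.
+ */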
+
typedef struct PrivateRefCountEntry
{
Buffer buffer;
BufferAccessStrategy strategy,
bool *foundPtr);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
+static void FindAndDropRelFileNodeBuffers(RelFileNode rnode,
+ ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
static int rnode_comparator(const void *p1, const void *p2);
* later. It is also the responsibility of higher-level code to ensure
* that no other process could be trying to load more pages of the
* relation into buffers.
- *
- * XXX currently it sequentially searches the buffer pool, should be
- * changed to more clever ways of searching. However, this routine
- * is used only in code paths that aren't very performance-critical,
- * and we shouldn't slow down the hot paths to make it faster ...
* --------------------------------------------------------------------
*/
void
-DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+DropRelFileNodeBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
int nforks, BlockNumber *firstDelBlock)
{
int i;
int j;
+ RelFileNodeBackend rnode;
+ BlockNumber nForkBlock[MAX_FORKNUM];
+ BlockNumber nBlocksToInvalidate = 0;
+
+ rnode = smgr_reln->smgr_rnode;
/* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnode))
{
if (rnode.backend == MyBackendId)
{
for (j = 0; j < nforks; j++)
DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
firstDelBlock[j]);
}
return;
}
+ /*
+ * To remove all the pages of the specified relation forks from the buffer
+ * pool, we need to scan the entire buffer pool, but we can optimize it by
+ * finding the buffers from the BufMapping table, provided we know the
+ * exact size of each fork of the relation. The exact size is required to
+ * ensure that we don't leave any buffer for the relation being dropped,
+ * as otherwise the background writer or checkpointer could PANIC while
+ * flushing buffers corresponding to files that don't exist.
+ *
+ * To know the exact size, we rely on the size cached for each fork during
+ * recovery, which limits the optimization to recovery and to standbys,
+ * but we can easily extend it once we have a shared cache for relation
+ * sizes.
+ *
+ * In recovery, we cache the value returned by the first lseek(SEEK_END),
+ * and future writes keep the cached value up-to-date. See smgrextend. It
+ * is possible that the value of the first lseek is smaller than the
+ * actual number of existing blocks in the file due to buggy Linux kernels
+ * that might not have accounted for the recent write, but that should be
+ * fine because there must not be any buffers beyond that file size.
+ */
+ for (i = 0; i < nforks; i++)
+ {
+ /* Get the number of blocks for a relation's fork */
+ nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
+
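+ /*
+ * If the fork size is not cached, lookups cannot be guaranteed to find
+ * every buffer for it, so use the full buffer pool scan below.
+ */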
+ if (nForkBlock[i] == InvalidBlockNumber)
+ {
+ nBlocksToInvalidate = InvalidBlockNumber;
+ break;
+ }
+
+ /* calculate the number of blocks to be invalidated */
+ nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (BlockNumberIsValid(nBlocksToInvalidate) &&
+ nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (j = 0; j < nforks; j++)
+ FindAndDropRelFileNodeBuffers(rnode.node, forkNum[j],
+ nForkBlock[j], firstDelBlock[j]);
+ return;
+ }
+
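+ /*
+ * Otherwise, fall back to scanning the entire buffer pool, invalidating
+ * any buffer that belongs to one of the target forks.
+ */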
for (i = 0; i < NBuffers; i++)
{
BufferDesc *bufHdr = GetBufferDescriptor(i);
pfree(nodes);
}
+/* ---------------------------------------------------------------------
+ * FindAndDropRelFileNodeBuffers
+ *
+ * This function performs a lookup in the BufMapping table and removes
+ * from the buffer pool all the pages of the specified relation fork that
+ * have a block number >= firstDelBlock. (In particular, with
+ * firstDelBlock = 0, all pages are removed.) As in DropRelFileNodeBuffers,
+ * the caller must ensure that no other process can be loading pages of
+ * this relation concurrently.
+ * --------------------------------------------------------------------
+ */
+static void
+FindAndDropRelFileNodeBuffers(RelFileNode rnode, ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock)
+{
+ BlockNumber curBlock;
+
+ for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
+ {
+ uint32 bufHash; /* hash value for tag */
+ BufferTag bufTag; /* identity of requested block */
+ LWLock *bufPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(bufTag, rnode, forkNum, curBlock);
+
+ /* determine its hash code and partition lock ID */
+ bufHash = BufTableHashCode(&bufTag);
+ bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+ /* Check that it is in the buffer pool. If not, do nothing. */
+ LWLockAcquire(bufPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&bufTag, bufHash);
+ LWLockRelease(bufPartitionLock);
+
+ if (buf_id < 0)
+ continue;
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * We need to lock the buffer header and recheck if the buffer is
+ * still associated with the same block because the buffer could be
+ * evicted by some other backend loading blocks for a different
+ * relation after we release lock on the BufMapping table.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+ bufHdr->tag.forkNum == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
/* ---------------------------------------------------------------------
* DropDatabaseBuffers
*
* XXX currently it sequentially searches the buffer pool, should be
* changed to more clever ways of searching. This routine is not
* used in any performance-critical code paths, so it's not worth
- * adding additional overhead to normal paths to make it go faster;
- * but see also DropRelFileNodeBuffers.
+ * adding additional overhead to normal paths to make it go faster.
* --------------------------------------------------------------------
*/
void
DropDatabaseBuffers(Oid dbid)

BlockNumber
smgrnblocks(SMgrRelation reln, ForkNumber forknum)
{
BlockNumber result;
+ /* Check for a cached value and return it if we get one. */
+ result = smgrnblocks_cached(reln, forknum);
+ if (result != InvalidBlockNumber)
+ return result;
+
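+ /*
+ * Cache miss: ask the storage manager for the current size, and cache
+ * the result so that later calls during recovery can be answered from
+ * the cache (smgrextend keeps it up-to-date as new blocks are written).
+ */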
+ result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+
+ reln->smgr_cached_nblocks[forknum] = result;
+
+ return result;
+}
+
+/*
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ * relation.
+ *
+ * Returns InvalidBlockNumber when not in recovery, or when the relation
+ * fork size is not cached.
+ */
+BlockNumber
+smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
+{
/*
* For now, we only use cached values in recovery due to lack of a shared
* invalidation mechanism for changes in file size.
*/
if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
return reln->smgr_cached_nblocks[forknum];
- result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
-
- reln->smgr_cached_nblocks[forknum] = result;
-
- return result;
+ return InvalidBlockNumber;
}
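+
+/*
+ * Hypothetical usage sketch (not part of this patch): a caller that needs
+ * a block count, preferring the cache but tolerating a cache miss, could
+ * combine the two functions like this:
+ *
+ *		nblocks = smgrnblocks_cached(reln, forknum);
+ *		if (nblocks == InvalidBlockNumber)
+ *			nblocks = smgrnblocks(reln, forknum);
+ */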
/*
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
* just drop them without bothering to write the contents.
*/
- DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
+ DropRelFileNodeBuffers(reln, forknum, nforks, nblocks);
/*
* Send a shared-inval message to force other backends to close any smgr