Support PrefetchBuffer() in recovery.
author Thomas Munro <tmunro@postgresql.org>
Wed, 8 Apr 2020 01:36:45 +0000 (13:36 +1200)
committer Thomas Munro <tmunro@postgresql.org>
Wed, 8 Apr 2020 02:56:57 +0000 (14:56 +1200)
Provide PrefetchSharedBuffer(), a variant that takes SMgrRelation, for
use in recovery.  Rename LocalPrefetchBuffer() to PrefetchLocalBuffer()
for consistency.

Add a return value to all of these.  In recovery, tolerate and report
missing files, so we can handle relations unlinked before crash recovery
began.  Also report cache hits and misses, so that callers can do faster
buffer lookups and better I/O accounting.

Reviewed-by: Alvaro Herrera <alvherre@2ndquadrant.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGJ4VJN8ttxScUFM8dOKX0BrBiboo5uz1cq%3DAovOddfHpA%40mail.gmail.com

src/backend/storage/buffer/bufmgr.c
src/backend/storage/buffer/localbuf.c
src/backend/storage/smgr/md.c
src/backend/storage/smgr/smgr.c
src/include/storage/buf_internals.h
src/include/storage/bufmgr.h
src/include/storage/md.h
src/include/storage/smgr.h

index a7a39dd2a1ef77f00ae5db94a2f40c41e1be7ed4..f9980cf80ce804428717d7284b123a100f6eb62a 100644 (file)
@@ -480,18 +480,99 @@ static int        ckpt_buforder_comparator(const void *pa, const void *pb);
 static int     ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
 
 
+/*
+ * Implementation of PrefetchBuffer() for shared buffers.
+ */
+PrefetchBufferResult
+PrefetchSharedBuffer(SMgrRelation smgr_reln,
+                                        ForkNumber forkNum,
+                                        BlockNumber blockNum)
+{
+       PrefetchBufferResult result = {InvalidBuffer, false};
+       BufferTag       newTag;                 /* identity of requested block */
+       uint32          newHash;                /* hash value for newTag */
+       LWLock     *newPartitionLock;   /* buffer partition lock for it */
+       int                     buf_id;
+
+       Assert(BlockNumberIsValid(blockNum));
+
+       /* create a tag so we can lookup the buffer */
+       INIT_BUFFERTAG(newTag, smgr_reln->smgr_rnode.node,
+                                  forkNum, blockNum);
+
+       /* determine its hash code and partition lock ID */
+       newHash = BufTableHashCode(&newTag);
+       newPartitionLock = BufMappingPartitionLock(newHash);
+
+       /* see if the block is in the buffer pool already */
+       LWLockAcquire(newPartitionLock, LW_SHARED);
+       buf_id = BufTableLookup(&newTag, newHash);
+       LWLockRelease(newPartitionLock);
+
+       /* If not in buffers, initiate prefetch */
+       if (buf_id < 0)
+       {
+#ifdef USE_PREFETCH
+               /*
+                * Try to initiate an asynchronous read.  This returns false in
+                * recovery if the relation file doesn't exist.
+                */
+               if (smgrprefetch(smgr_reln, forkNum, blockNum))
+                       result.initiated_io = true;
+#endif                                                 /* USE_PREFETCH */
+       }
+       else
+       {
+               /*
+                * Report the buffer it was in at that time.  The caller may be able
+                * to avoid a buffer table lookup, but it's not pinned and it must be
+                * rechecked!
+                */
+               result.recent_buffer = buf_id + 1;
+       }
+
+       /*
+        * If the block *is* in buffers, we do nothing.  This is not really ideal:
+        * the block might be just about to be evicted, which would be stupid
+        * since we know we are going to need it soon.  But the only easy answer
+        * is to bump the usage_count, which does not seem like a great solution:
+        * when the caller does ultimately touch the block, usage_count would get
+        * bumped again, resulting in too much favoritism for blocks that are
+        * involved in a prefetch sequence. A real fix would involve some
+        * additional per-buffer state, and it's not clear that there's enough of
+        * a problem to justify that.
+        */
+
+       return result;
+}
+
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
  *
  * This is named by analogy to ReadBuffer but doesn't actually allocate a
  * buffer.  Instead it tries to ensure that a future ReadBuffer for the given
  * block will not be delayed by the I/O.  Prefetching is optional.
- * No-op if prefetching isn't compiled in.
+ *
+ * There are three possible outcomes:
+ *
+ * 1.  If the block is already cached, the result includes a valid buffer that
+ * could be used by the caller to avoid the need for a later buffer lookup, but
+ * it's not pinned, so the caller must recheck it.
+ *
+ * 2.  If the kernel has been asked to initiate I/O, the initiated_io member is
+ * true.  Currently there is no way to know if the data was already cached by
+ * the kernel and therefore didn't really initiate I/O, and no way to know when
+ * the I/O completes other than using synchronous ReadBuffer().
+ *
+ * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and either
+ * USE_PREFETCH is not defined (this build doesn't support prefetching due to
+ * lack of a kernel facility), or the underlying relation file wasn't found and
+ * we are in recovery.  (If the relation file wasn't found and we are not in
+ * recovery, an error is raised).
  */
-void
+PrefetchBufferResult
 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 {
-#ifdef USE_PREFETCH
        Assert(RelationIsValid(reln));
        Assert(BlockNumberIsValid(blockNum));
 
@@ -507,45 +588,13 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
                                         errmsg("cannot access temporary tables of other sessions")));
 
                /* pass it off to localbuf.c */
-               LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum);
+               return PrefetchLocalBuffer(reln->rd_smgr, forkNum, blockNum);
        }
        else
        {
-               BufferTag       newTag;         /* identity of requested block */
-               uint32          newHash;        /* hash value for newTag */
-               LWLock     *newPartitionLock;   /* buffer partition lock for it */
-               int                     buf_id;
-
-               /* create a tag so we can lookup the buffer */
-               INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node,
-                                          forkNum, blockNum);
-
-               /* determine its hash code and partition lock ID */
-               newHash = BufTableHashCode(&newTag);
-               newPartitionLock = BufMappingPartitionLock(newHash);
-
-               /* see if the block is in the buffer pool already */
-               LWLockAcquire(newPartitionLock, LW_SHARED);
-               buf_id = BufTableLookup(&newTag, newHash);
-               LWLockRelease(newPartitionLock);
-
-               /* If not in buffers, initiate prefetch */
-               if (buf_id < 0)
-                       smgrprefetch(reln->rd_smgr, forkNum, blockNum);
-
-               /*
-                * If the block *is* in buffers, we do nothing.  This is not really
-                * ideal: the block might be just about to be evicted, which would be
-                * stupid since we know we are going to need it soon.  But the only
-                * easy answer is to bump the usage_count, which does not seem like a
-                * great solution: when the caller does ultimately touch the block,
-                * usage_count would get bumped again, resulting in too much
-                * favoritism for blocks that are involved in a prefetch sequence. A
-                * real fix would involve some additional per-buffer state, and it's
-                * not clear that there's enough of a problem to justify that.
-                */
+               /* pass it to the shared buffer version */
+               return PrefetchSharedBuffer(reln->rd_smgr, forkNum, blockNum);
        }
-#endif                                                 /* USE_PREFETCH */
 }
 
 
index cac08e1b1acdf1a7e938b875dbd791fd6c50bb3b..6ffd7b330621cce9ecba2e33f6731022bb49f82d 100644 (file)
@@ -54,17 +54,17 @@ static Block GetLocalBufferStorage(void);
 
 
 /*
- * LocalPrefetchBuffer -
+ * PrefetchLocalBuffer -
  *       initiate asynchronous read of a block of a relation
  *
  * Do PrefetchBuffer's work for temporary relations.
  * No-op if prefetching isn't compiled in.
  */
-void
-LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
+PrefetchBufferResult
+PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
                                        BlockNumber blockNum)
 {
-#ifdef USE_PREFETCH
+       PrefetchBufferResult result = {InvalidBuffer, false};
        BufferTag       newTag;                 /* identity of requested block */
        LocalBufferLookupEnt *hresult;
 
@@ -81,12 +81,18 @@ LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
        if (hresult)
        {
                /* Yes, so nothing to do */
-               return;
+               result.recent_buffer = -hresult->id - 1;
        }
-
-       /* Not in buffers, so initiate prefetch */
-       smgrprefetch(smgr, forkNum, blockNum);
+       else
+       {
+#ifdef USE_PREFETCH
+               /* Not in buffers, so initiate prefetch */
+               smgrprefetch(smgr, forkNum, blockNum);
+               result.initiated_io = true;
 #endif                                                 /* USE_PREFETCH */
+       }
+
+       return result;
 }
 
 
index ee9822c6e1056d35cc59a37f416628619a836713..e0b020da11126a4b865cecd33204e7fcaeb3ea27 100644 (file)
@@ -524,14 +524,17 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
 /*
  *     mdprefetch() -- Initiate asynchronous read of the specified block of a relation
  */
-void
+bool
 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
 #ifdef USE_PREFETCH
        off_t           seekpos;
        MdfdVec    *v;
 
-       v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
+       v = _mdfd_getseg(reln, forknum, blocknum, false,
+                                        InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
+       if (v == NULL)
+               return false;
 
        seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 
@@ -539,6 +542,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
        (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
 #endif                                                 /* USE_PREFETCH */
+
+       return true;
 }
 
 /*
index 72c9696ad19d97afce7fc42b4cd144a82914084a..b053a4dc761e61ee99a91035e560d0ea83aa3920 100644 (file)
@@ -49,7 +49,7 @@ typedef struct f_smgr
                                                                bool isRedo);
        void            (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
                                                                BlockNumber blocknum, char *buffer, bool skipFsync);
-       void            (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+       bool            (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
                                                                  BlockNumber blocknum);
        void            (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
                                                          BlockNumber blocknum, char *buffer);
@@ -524,11 +524,15 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 /*
  *     smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
+ *
+ *             In recovery only, this can return false to indicate that a file
+ *             doesn't exist (presumably it has been dropped by a later WAL
+ *             record).
  */
-void
+bool
 smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-       smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
+       return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
 }
 
 /*
index bf3b8ad340ec0fb6e18d8cafa74508afc7021d2b..e57f84ee9c8ce0ae731558c323dd12eea7c7ac11 100644 (file)
@@ -327,8 +327,9 @@ extern int  BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
 extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
 
 /* localbuf.c */
-extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
-                                                               BlockNumber blockNum);
+extern PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr,
+                                                                                               ForkNumber forkNum,
+                                                                                               BlockNumber blockNum);
 extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
                                                                        BlockNumber blockNum, bool *foundPtr);
 extern void MarkLocalBufferDirty(Buffer buffer);
index bf3b12a2de3283f0620ab31dd1ad255084092e46..ee91b8fa26c178a35a7d220de8f32393f6753ce5 100644 (file)
@@ -46,6 +46,15 @@ typedef enum
                                                                 * replay; otherwise same as RBM_NORMAL */
 } ReadBufferMode;
 
+/*
+ * Type returned by PrefetchBuffer().
+ */
+typedef struct PrefetchBufferResult
+{
+       Buffer          recent_buffer;  /* If valid, a hit (recheck needed!) */
+       bool            initiated_io;   /* If true, a miss resulting in async I/O */
+} PrefetchBufferResult;
+
 /* forward declared, to avoid having to expose buf_internals.h here */
 struct WritebackContext;
 
@@ -162,8 +171,11 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * prototypes for functions in bufmgr.c
  */
-extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
-                                                  BlockNumber blockNum);
+extern PrefetchBufferResult PrefetchSharedBuffer(struct SMgrRelationData *smgr_reln,
+                                                                                                ForkNumber forkNum,
+                                                                                                BlockNumber blockNum);
+extern PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum,
+                                                                                  BlockNumber blockNum);
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
 extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum,
                                                                 BlockNumber blockNum, ReadBufferMode mode,
index ec7630ce3b77fb33e73609a2e6d7b2cd8e836eb3..07fd1bb7d06c8045962907aeeef6ba44567cd6de 100644 (file)
@@ -28,7 +28,7 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
 extern void mdextend(SMgrRelation reln, ForkNumber forknum,
                                         BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
+extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
                                           BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
                                   char *buffer);
index 79dfe0e373af2c849ed9518e173d895657f22574..bb8428f27f609073c8da0b2ed6aeca750236ca94 100644 (file)
@@ -93,7 +93,7 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
                                           BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
+extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
                                                 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
                                         BlockNumber blocknum, char *buffer);