#include "replication/walsender.h"
#include "storage/fd.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/fd.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+#include "storage/md.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr.h"
#include "storage/spin.h"
+#include "storage/sync.h"
#include "utils/builtins.h"
#include "utils/guc.h"
#include "utils/memutils.h"
if (ArchiveRecoveryRequested && IsUnderPostmaster)
{
PublishStartupProcessInformation();
- SetForwardFsyncRequests();
+ EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
bgwriterLaunched = true;
}
- * the REDO pointer. Note that smgr must not do anything that'd have to
- * be undone if we decide no checkpoint is needed.
+ * the REDO pointer. Note that the sync machinery must not do anything
+ * that'd have to be undone if we decide no checkpoint is needed.
*/
- smgrpreckpt();
+ SyncPreCheckpoint();
/* Begin filling in the checkpoint WAL record */
MemSet(&checkPoint, 0, sizeof(checkPoint));
/*
- * Let smgr do post-checkpoint cleanup (eg, deleting old files).
+ * Let the sync machinery do post-checkpoint cleanup (eg, deleting old
+ * files).
*/
- smgrpostckpt();
+ SyncPostCheckpoint();
/*
* Update the average distance between checkpoints if the prior checkpoint
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/procarray.h"
#include "storage/smgr.h"
#include "utils/acl.h"
* worse, it will delete files that belong to a newly created database
* with the same OID.
*/
- ForgetDatabaseFsyncRequests(db_id);
+ ForgetDatabaseSyncRequests(db_id);
/*
* Force a checkpoint to make sure the checkpointer has received the
- * message sent by ForgetDatabaseFsyncRequests. On Windows, this also
+ * message sent by ForgetDatabaseSyncRequests. On Windows, this also
* ensures that background procs don't hold any open files, which would
* cause rmdir() to fail.
*/
DropDatabaseBuffers(xlrec->db_id);
- /* Also, clean out any fsync requests that might be pending in md.c */
+ /* Also, clean out any sync requests that might be pending */
- ForgetDatabaseFsyncRequests(xlrec->db_id);
+ ForgetDatabaseSyncRequests(xlrec->db_id);
/* Clean out the xlog relcache too */
XLogDropDatabase(xlrec->db_id);
*/
typedef struct
{
- RelFileNode rnode;
- ForkNumber forknum;
- BlockNumber segno; /* see md.c for special values */
- /* might add a real request-type field later; not needed yet */
+ SyncRequestType type; /* request type */
+ FileTag ftag; /* file identifier */
} CheckpointerRequest;
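For reference, the FileTag and SyncRequestType types used in the new
CheckpointerRequest come from the new header src/include/storage/sync.h,
whose hunk is not shown in this excerpt. A minimal sketch of the
declarations it presumably provides, inferred from how they are used
elsewhere in this patch:

```c
/* Sketch of src/include/storage/sync.h (not shown in this excerpt);
 * inferred from usage in the patch, so details may differ. */
typedef enum SyncRequestType
{
	SYNC_REQUEST,				/* schedule a call of the sync function */
	SYNC_UNLINK_REQUEST,		/* schedule a call of the unlink function */
	SYNC_FORGET_REQUEST,		/* forget all requests for a given tag */
	SYNC_FILTER_REQUEST			/* forget all requests matching a filter */
} SyncRequestType;

typedef enum SyncRequestHandler
{
	SYNC_HANDLER_MD = 0			/* md.c, the only handler so far */
} SyncRequestHandler;

/*
 * FileTag identifies a file; it is used as a bytewise hash key, which is
 * why INIT_MD_FILETAG later in this patch zeroes the whole struct before
 * filling it in.
 */
typedef struct FileTag
{
	int16		handler;		/* SyncRequestHandler value */
	int16		forknum;		/* ForkNumber, in a narrower type */
	RelFileNode rnode;
	uint32		segno;
} FileTag;
```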
typedef struct
/*
* Process any requests or signals received recently.
*/
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
if (got_SIGHUP)
{
UpdateSharedMemoryConfig();
}
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
CheckArchiveTimeout();
* operations even when we don't sleep, to prevent overflow of the
* fsync request queue.
*/
- AbsorbFsyncRequests();
+ AbsorbSyncRequests();
absorb_counter = WRITES_PER_ABSORB;
}
}
}
/*
- * ForwardFsyncRequest
+ * ForwardSyncRequest
* Forward a file-fsync request from a backend to the checkpointer
*
* Whenever a backend is compelled to write directly to a relation
* is dirty and must be fsync'd before next checkpoint. We also use this
* opportunity to count such writes for statistical purposes.
*
- * This functionality is only supported for regular (not backend-local)
- * relations, so the rnode argument is intentionally RelFileNode not
- * RelFileNodeBackend.
- *
- * segno specifies which segment (not block!) of the relation needs to be
- * fsync'd. (Since the valid range is much less than BlockNumber, we can
- * use high values for special flags; that's all internal to md.c, which
- * see for details.)
- *
* To avoid holding the lock for longer than necessary, we normally write
* to the requests[] queue without checking for duplicates. The checkpointer
* will have to eliminate dups internally anyway. However, if we discover
* let the backend know by returning false.
*/
bool
-ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
{
CheckpointerRequest *request;
bool too_full;
return false; /* probably shouldn't even get here */
if (AmCheckpointerProcess())
- elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
+ elog(ERROR, "ForwardSyncRequest must not be called in checkpointer");
LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
/* OK, insert request */
request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
- request->rnode = rnode;
- request->forknum = forknum;
- request->segno = segno;
+ request->ftag = *ftag;
+ request->type = type;
/* If queue is more than half full, nudge the checkpointer to empty it */
too_full = (CheckpointerShmem->num_requests >=
}
/*
- * AbsorbFsyncRequests
- * Retrieve queued fsync requests and pass them to local smgr.
+ * AbsorbSyncRequests
+ * Retrieve queued sync requests and pass them to the sync mechanism.
*
* This is exported because it must be called during CreateCheckPoint;
* we have to be sure we have accepted all pending requests just before
* non-checkpointer processes, do nothing if not checkpointer.
*/
void
-AbsorbFsyncRequests(void)
+AbsorbSyncRequests(void)
{
CheckpointerRequest *requests = NULL;
CheckpointerRequest *request;
LWLockRelease(CheckpointerCommLock);
for (request = requests; n > 0; request++, n--)
- RememberFsyncRequest(request->rnode, request->forknum, request->segno);
+ RememberSyncRequest(&request->ftag, request->type);
END_CRIT_SECTION();
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-SUBDIRS = buffer file freespace ipc large_object lmgr page smgr
+SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
include $(top_srcdir)/src/backend/common.mk
BufferSync(flags);
CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
- smgrsync();
+ ProcessSyncRequests();
CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
}
#include "access/xlogutils.h"
#include "access/xlog.h"
#include "pgstat.h"
-#include "portability/instr_time.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
#include "storage/bufmgr.h"
+#include "storage/md.h"
#include "storage/relfilenode.h"
#include "storage/smgr.h"
+#include "storage/sync.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "pg_trace.h"
-
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB 10
-#define UNLINKS_PER_ABSORB 10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC (InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC (InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
-
-/*
- * On Windows, we have to interpret EACCES as possibly meaning the same as
- * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
- * that's what you get. Ugh. This code is designed so that we don't
- * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
- */
-#ifndef WIN32
-#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
-#else
-#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
-#endif
-
/*
* The magnetic disk storage manager keeps track of open file
* descriptors in its own descriptor pool. This is done to make it
static MemoryContext MdCxt; /* context for all MdfdVec objects */
-/*
- * In some contexts (currently, standalone backends and the checkpointer)
- * we keep track of pending fsync operations: we need to remember all relation
- * segments that have been written since the last checkpoint, so that we can
- * fsync them down to disk before completing the next checkpoint. This hash
- * table remembers the pending operations. We use a hash table mostly as
- * a convenient way of merging duplicate requests.
- *
- * We use a similar mechanism to remember no-longer-needed files that can
- * be deleted after the next checkpoint, but we use a linked list instead of
- * a hash table, because we don't expect there to be any duplicate requests.
- *
- * These mechanisms are only used for non-temp relations; we never fsync
- * temp rels, nor do we need to postpone their deletion (see comments in
- * mdunlink).
- *
- * (Regular backends do not track pending operations locally, but forward
- * them to the checkpointer.)
- */
-typedef uint16 CycleCtr; /* can be any convenient integer size */
-
-typedef struct
-{
- RelFileNode rnode; /* hash table key (must be first!) */
- CycleCtr cycle_ctr; /* mdsync_cycle_ctr of oldest request */
- /* requests[f] has bit n set if we need to fsync segment n of fork f */
- Bitmapset *requests[MAX_FORKNUM + 1];
- /* canceled[f] is true if we canceled fsyncs for fork "recently" */
- bool canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
- RelFileNode rnode; /* the dead relation to delete */
- CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
-
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above */
-
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
+/* Populate a file tag describing an md.c segment file. */
+#define INIT_MD_FILETAG(a,xx_rnode,xx_forknum,xx_segno) \
+( \
+ memset(&(a), 0, sizeof(FileTag)), \
+ (a).handler = SYNC_HANDLER_MD, \
+ (a).rnode = (xx_rnode), \
+ (a).forknum = (xx_forknum), \
+ (a).segno = (xx_segno) \
+)
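A note on the macro above: FileTag is used as a hash-table key that is
hashed and compared bytewise (see the HASH_BLOBS table in sync.c below),
so the leading memset zeroes any padding bytes. A typical call, mirroring
register_dirty_segment later in this patch, shown for illustration only:

```c
FileTag		tag;

/* Tag one segment of this relation fork; the macro zeroes the struct
 * first so padding bytes don't perturb bytewise hashing/comparison. */
INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
```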
/*** behavior for mdopen & _mdfd_getseg ***/
static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
+static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
+static void register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno);
static void _fdvec_resize(SMgrRelation reln,
ForkNumber forknum,
int nseg);
MdCxt = AllocSetContextCreate(TopMemoryContext,
"MdSmgr",
ALLOCSET_DEFAULT_SIZES);
-
- /*
- * Create pending-operations hashtable if we need it. Currently, we need
- * it if we are standalone (not under a postmaster) or if we are a startup
- * or checkpointer auxiliary process.
- */
- if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
- {
- HASHCTL hash_ctl;
-
- /*
- * XXX: The checkpointer needs to add entries to the pending ops table
- * when absorbing fsync requests. That is done within a critical
- * section, which isn't usually allowed, but we make an exception. It
- * means that there's a theoretical possibility that you run out of
- * memory while absorbing fsync requests, which leads to a PANIC.
- * Fortunately the hash table is small so that's unlikely to happen in
- * practice.
- */
- pendingOpsCxt = AllocSetContextCreate(MdCxt,
- "Pending ops context",
- ALLOCSET_DEFAULT_SIZES);
- MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
- MemSet(&hash_ctl, 0, sizeof(hash_ctl));
- hash_ctl.keysize = sizeof(RelFileNode);
- hash_ctl.entrysize = sizeof(PendingOperationEntry);
- hash_ctl.hcxt = pendingOpsCxt;
- pendingOpsTable = hash_create("Pending Ops Table",
- 100L,
- &hash_ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
- pendingUnlinks = NIL;
- }
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process. Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
- /* Perform any pending fsyncs we may have queued up, then drop table */
- if (pendingOpsTable)
- {
- mdsync();
- hash_destroy(pendingOpsTable);
- }
- pendingOpsTable = NULL;
-
- /*
- * We should not have any pending unlink requests, since mdunlink doesn't
- * queue unlink requests when isRedo.
- */
- Assert(pendingUnlinks == NIL);
}
/*
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
- /*
- * We have to clean out any pending fsync requests for the doomed
- * relation, else the next mdsync() will fail. There can't be any such
- * requests for a temp relation, though. We can send just one request
- * even when deleting multiple forks, since the fsync queuing code accepts
- * the "InvalidForkNumber = all forks" convention.
- */
- if (!RelFileNodeBackendIsTemp(rnode))
- ForgetRelationFsyncRequests(rnode.node, forkNum);
-
/* Now do the per-fork work */
if (forkNum == InvalidForkNumber)
{
*/
if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
{
+ /* First, forget any pending sync requests for the first segment */
+ if (!RelFileNodeBackendIsTemp(rnode))
+ register_forget_request(rnode, forkNum, 0 /* first seg */ );
+
+ /* Next unlink the file */
ret = unlink(path);
if (ret < 0 && errno != ENOENT)
ereport(WARNING,
errmsg("could not truncate file \"%s\": %m", path)));
/* Register request to unlink first segment later */
- register_unlink(rnode);
+ register_unlink_segment(rnode, forkNum, 0 /* first seg */ );
}
/*
*/
for (segno = 1;; segno++)
{
+ /*
+ * Forget any pending sync requests for this segment before we try
+ * to unlink.
+ */
+ if (!RelFileNodeBackendIsTemp(rnode))
+ register_forget_request(rnode, forkNum, segno);
+
sprintf(segpath, "%s.%u", path, segno);
if (unlink(segpath) < 0)
{
}
}
-/*
- * mdsync() -- Sync previous writes to stable storage.
- */
-void
-mdsync(void)
-{
- static bool mdsync_in_progress = false;
-
- HASH_SEQ_STATUS hstat;
- PendingOperationEntry *entry;
- int absorb_counter;
-
- /* Statistics on sync times */
- int processed = 0;
- instr_time sync_start,
- sync_end,
- sync_diff;
- uint64 elapsed;
- uint64 longest = 0;
- uint64 total_elapsed = 0;
-
- /*
- * This is only called during checkpoints, and checkpoints should only
- * occur in processes that have created a pendingOpsTable.
- */
- if (!pendingOpsTable)
- elog(ERROR, "cannot sync without a pendingOpsTable");
-
- /*
- * If we are in the checkpointer, the sync had better include all fsync
- * requests that were queued by backends up to this point. The tightest
- * race condition that could occur is that a buffer that must be written
- * and fsync'd for the checkpoint could have been dumped by a backend just
- * before it was visited by BufferSync(). We know the backend will have
- * queued an fsync request before clearing the buffer's dirtybit, so we
- * are safe as long as we do an Absorb after completing BufferSync().
- */
- AbsorbFsyncRequests();
-
- /*
- * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
- * checkpoint), we want to ignore fsync requests that are entered into the
- * hashtable after this point --- they should be processed next time,
- * instead. We use mdsync_cycle_ctr to tell old entries apart from new
- * ones: new ones will have cycle_ctr equal to the incremented value of
- * mdsync_cycle_ctr.
- *
- * In normal circumstances, all entries present in the table at this point
- * will have cycle_ctr exactly equal to the current (about to be old)
- * value of mdsync_cycle_ctr. However, if we fail partway through the
- * fsync'ing loop, then older values of cycle_ctr might remain when we
- * come back here to try again. Repeated checkpoint failures would
- * eventually wrap the counter around to the point where an old entry
- * might appear new, causing us to skip it, possibly allowing a checkpoint
- * to succeed that should not have. To forestall wraparound, any time the
- * previous mdsync() failed to complete, run through the table and
- * forcibly set cycle_ctr = mdsync_cycle_ctr.
- *
- * Think not to merge this loop with the main loop, as the problem is
- * exactly that that loop may fail before having visited all the entries.
- * From a performance point of view it doesn't matter anyway, as this path
- * will never be taken in a system that's functioning normally.
- */
- if (mdsync_in_progress)
- {
- /* prior try failed, so update any stale cycle_ctr values */
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- entry->cycle_ctr = mdsync_cycle_ctr;
- }
- }
-
- /* Advance counter so that new hashtable entries are distinguishable */
- mdsync_cycle_ctr++;
-
- /* Set flag to detect failure if we don't reach the end of the loop */
- mdsync_in_progress = true;
-
- /* Now scan the hashtable for fsync requests to process */
- absorb_counter = FSYNCS_PER_ABSORB;
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- ForkNumber forknum;
-
- /*
- * If the entry is new then don't process it this time; it might
- * contain multiple fsync-request bits, but they are all new. Note
- * "continue" bypasses the hash-remove call at the bottom of the loop.
- */
- if (entry->cycle_ctr == mdsync_cycle_ctr)
- continue;
-
- /* Else assert we haven't missed it */
- Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
- /*
- * Scan over the forks and segments represented by the entry.
- *
- * The bitmap manipulations are slightly tricky, because we can call
- * AbsorbFsyncRequests() inside the loop and that could result in
- * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
- * So we detach it, but if we fail we'll merge it with any new
- * requests that have arrived in the meantime.
- */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- Bitmapset *requests = entry->requests[forknum];
- int segno;
-
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = false;
-
- segno = -1;
- while ((segno = bms_next_member(requests, segno)) >= 0)
- {
- int failures;
-
- /*
- * If fsync is off then we don't have to bother opening the
- * file at all. (We delay checking until this point so that
- * changing fsync on the fly behaves sensibly.)
- */
- if (!enableFsync)
- continue;
-
- /*
- * If in checkpointer, we want to absorb pending requests
- * every so often to prevent overflow of the fsync request
- * queue. It is unspecified whether newly-added entries will
- * be visited by hash_seq_search, but we don't care since we
- * don't need to process them anyway.
- */
- if (--absorb_counter <= 0)
- {
- AbsorbFsyncRequests();
- absorb_counter = FSYNCS_PER_ABSORB;
- }
-
- /*
- * The fsync table could contain requests to fsync segments
- * that have been deleted (unlinked) by the time we get to
- * them. Rather than just hoping an ENOENT (or EACCES on
- * Windows) error can be ignored, what we do on error is
- * absorb pending requests and then retry. Since mdunlink()
- * queues a "cancel" message before actually unlinking, the
- * fsync request is guaranteed to be marked canceled after the
- * absorb if it really was this case. DROP DATABASE likewise
- * has to tell us to forget fsync requests before it starts
- * deletions.
- */
- for (failures = 0;; failures++) /* loop exits at "break" */
- {
- SMgrRelation reln;
- MdfdVec *seg;
- char *path;
- int save_errno;
-
- /*
- * Find or create an smgr hash entry for this relation.
- * This may seem a bit unclean -- md calling smgr? But
- * it's really the best solution. It ensures that the
- * open file reference isn't permanently leaked if we get
- * an error here. (You may say "but an unreferenced
- * SMgrRelation is still a leak!" Not really, because the
- * only case in which a checkpoint is done by a process
- * that isn't about to shut down is in the checkpointer,
- * and it will periodically do smgrcloseall(). This fact
- * justifies our not closing the reln in the success path
- * either, which is a good thing since in non-checkpointer
- * cases we couldn't safely do that.)
- */
- reln = smgropen(entry->rnode, InvalidBackendId);
-
- /* Attempt to open and fsync the target segment */
- seg = _mdfd_getseg(reln, forknum,
- (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
- false,
- EXTENSION_RETURN_NULL
- | EXTENSION_DONT_CHECK_SIZE);
-
- INSTR_TIME_SET_CURRENT(sync_start);
-
- if (seg != NULL &&
- FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
- {
- /* Success; update statistics about sync timing */
- INSTR_TIME_SET_CURRENT(sync_end);
- sync_diff = sync_end;
- INSTR_TIME_SUBTRACT(sync_diff, sync_start);
- elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
- if (elapsed > longest)
- longest = elapsed;
- total_elapsed += elapsed;
- processed++;
- requests = bms_del_member(requests, segno);
- if (log_checkpoints)
- elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
- processed,
- FilePathName(seg->mdfd_vfd),
- (double) elapsed / 1000);
-
- break; /* out of retry loop */
- }
-
- /* Compute file name for use in message */
- save_errno = errno;
- path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
- errno = save_errno;
-
- /*
- * It is possible that the relation has been dropped or
- * truncated since the fsync request was entered.
- * Therefore, allow ENOENT, but only if we didn't fail
- * already on this file. This applies both for
- * _mdfd_getseg() and for FileSync, since fd.c might have
- * closed the file behind our back.
- *
- * XXX is there any point in allowing more than one retry?
- * Don't see one at the moment, but easy to change the
- * test here if so.
- */
- if (!FILE_POSSIBLY_DELETED(errno) ||
- failures > 0)
- {
- Bitmapset *new_requests;
-
- /*
- * We need to merge these unsatisfied requests with
- * any others that have arrived since we started.
- */
- new_requests = entry->requests[forknum];
- entry->requests[forknum] =
- bms_join(new_requests, requests);
-
- errno = save_errno;
- ereport(data_sync_elevel(ERROR),
- (errcode_for_file_access(),
- errmsg("could not fsync file \"%s\": %m",
- path)));
- }
- else
- ereport(DEBUG1,
- (errcode_for_file_access(),
- errmsg("could not fsync file \"%s\" but retrying: %m",
- path)));
- pfree(path);
-
- /*
- * Absorb incoming requests and check to see if a cancel
- * arrived for this relation fork.
- */
- AbsorbFsyncRequests();
- absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
- if (entry->canceled[forknum])
- break;
- } /* end retry loop */
- }
- bms_free(requests);
- }
-
- /*
- * We've finished everything that was requested before we started to
- * scan the entry. If no new requests have been inserted meanwhile,
- * remove the entry. Otherwise, update its cycle counter, as all the
- * requests now in it must have arrived during this cycle.
- */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- if (entry->requests[forknum] != NULL)
- break;
- }
- if (forknum <= MAX_FORKNUM)
- entry->cycle_ctr = mdsync_cycle_ctr;
- else
- {
- /* Okay to remove it */
- if (hash_search(pendingOpsTable, &entry->rnode,
- HASH_REMOVE, NULL) == NULL)
- elog(ERROR, "pendingOpsTable corrupted");
- }
- } /* end loop over hashtable entries */
-
- /* Return sync performance metrics for report at checkpoint end */
- CheckpointStats.ckpt_sync_rels = processed;
- CheckpointStats.ckpt_longest_sync = longest;
- CheckpointStats.ckpt_agg_sync_time = total_elapsed;
-
- /* Flag successful completion of mdsync */
- mdsync_in_progress = false;
-}
-
-/*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
- *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
- *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
- */
-void
-mdpreckpt(void)
-{
- /*
- * Any unlink requests arriving after this point will be assigned the next
- * cycle counter, and won't be unlinked until next checkpoint.
- */
- mdckpt_cycle_ctr++;
-}
-
-/*
- * mdpostckpt() -- Do post-checkpoint work
- *
- * Remove any lingering files that can now be safely removed.
- */
-void
-mdpostckpt(void)
-{
- int absorb_counter;
-
- absorb_counter = UNLINKS_PER_ABSORB;
- while (pendingUnlinks != NIL)
- {
- PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
- char *path;
-
- /*
- * New entries are appended to the end, so if the entry is new we've
- * reached the end of old entries.
- *
- * Note: if just the right number of consecutive checkpoints fail, we
- * could be fooled here by cycle_ctr wraparound. However, the only
- * consequence is that we'd delay unlinking for one more checkpoint,
- * which is perfectly tolerable.
- */
- if (entry->cycle_ctr == mdckpt_cycle_ctr)
- break;
-
- /* Unlink the file */
- path = relpathperm(entry->rnode, MAIN_FORKNUM);
- if (unlink(path) < 0)
- {
- /*
- * There's a race condition, when the database is dropped at the
- * same time that we process the pending unlink requests. If the
- * DROP DATABASE deletes the file before we do, we will get ENOENT
- * here. rmtree() also has to ignore ENOENT errors, to deal with
- * the possibility that we delete the file first.
- */
- if (errno != ENOENT)
- ereport(WARNING,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m", path)));
- }
- pfree(path);
-
- /* And remove the list entry */
- pendingUnlinks = list_delete_first(pendingUnlinks);
- pfree(entry);
-
- /*
- * As in mdsync, we don't want to stop absorbing fsync requests for a
- * long time when there are many deletions to be done. We can safely
- * call AbsorbFsyncRequests() at this point in the loop (note it might
- * try to delete list entries).
- */
- if (--absorb_counter <= 0)
- {
- AbsorbFsyncRequests();
- absorb_counter = UNLINKS_PER_ABSORB;
- }
- }
-}
-
/*
* register_dirty_segment() -- Mark a relation segment as needing fsync
*
* If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later. Otherwise, try to pass off the fsync request
- * to the checkpointer process. If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
+ * ProcessSyncRequests to process later. Otherwise, try to pass off the
+ * fsync request to the checkpointer process. If that fails, just do the
+ * fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
*/
static void
register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+
/* Temp relations should never be fsync'd */
Assert(!SmgrIsTemp(reln));
- if (pendingOpsTable)
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
{
- /* push it into local pending-ops table */
- RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
- }
- else
- {
- if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
- return; /* passed it off successfully */
-
ereport(DEBUG1,
(errmsg("could not forward fsync request because request queue is full")));
/*
- * register_unlink() -- Schedule a file to be deleted after next checkpoint
+ * register_unlink_segment() -- Schedule a file to be deleted after next
+ * checkpoint.
- *
- * We don't bother passing in the fork number, because this is only used
- * with main forks.
- *
- * As with register_dirty_segment, this could involve either a local or
- * a remote pending-ops table.
*/
static void
-register_unlink(RelFileNodeBackend rnode)
+register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
+
/* Should never be used with temp relations */
Assert(!RelFileNodeBackendIsTemp(rnode));
- if (pendingOpsTable)
- {
- /* push it into local pending-ops table */
- RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
- UNLINK_RELATION_REQUEST);
- }
- else
- {
- /*
- * Notify the checkpointer about it. If we fail to queue the request
- * message, we have to sleep and try again, because we can't simply
- * delete the file now. Ugly, but hopefully won't happen often.
- *
- * XXX should we just leave the file orphaned instead?
- */
- Assert(IsUnderPostmaster);
- while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
- UNLINK_RELATION_REQUEST))
- pg_usleep(10000L); /* 10 msec seems a good number */
- }
+ RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
}
/*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint. UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- * either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- * checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment
*/
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+static void
+register_forget_request(RelFileNodeBackend rnode, ForkNumber forknum,
+ BlockNumber segno)
{
- Assert(pendingOpsTable);
-
- if (segno == FORGET_RELATION_FSYNC)
- {
- /* Remove any pending requests for the relation (one or all forks) */
- PendingOperationEntry *entry;
-
- entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
- &rnode,
- HASH_FIND,
- NULL);
- if (entry)
- {
- /*
- * We can't just delete the entry since mdsync could have an
- * active hashtable scan. Instead we delete the bitmapsets; this
- * is safe because of the way mdsync is coded. We also set the
- * "canceled" flags so that mdsync can tell that a cancel arrived
- * for the fork(s).
- */
- if (forknum == InvalidForkNumber)
- {
- /* remove requests for all forks */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- else
- {
- /* remove requests for single fork */
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- }
- else if (segno == FORGET_DATABASE_FSYNC)
- {
- /* Remove any pending requests for the entire database */
- HASH_SEQ_STATUS hstat;
- PendingOperationEntry *entry;
- ListCell *cell,
- *prev,
- *next;
-
- /* Remove fsync requests */
- hash_seq_init(&hstat, pendingOpsTable);
- while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
- {
- if (entry->rnode.dbNode == rnode.dbNode)
- {
- /* remove requests for all forks */
- for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
- {
- bms_free(entry->requests[forknum]);
- entry->requests[forknum] = NULL;
- entry->canceled[forknum] = true;
- }
- }
- }
-
- /* Remove unlink requests */
- prev = NULL;
- for (cell = list_head(pendingUnlinks); cell; cell = next)
- {
- PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
- next = lnext(cell);
- if (entry->rnode.dbNode == rnode.dbNode)
- {
- pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
- pfree(entry);
- }
- else
- prev = cell;
- }
- }
- else if (segno == UNLINK_RELATION_REQUEST)
- {
- /* Unlink request: put it in the linked list */
- MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
- PendingUnlinkEntry *entry;
-
- /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
- Assert(forknum == MAIN_FORKNUM);
-
- entry = palloc(sizeof(PendingUnlinkEntry));
- entry->rnode = rnode;
- entry->cycle_ctr = mdckpt_cycle_ctr;
-
- pendingUnlinks = lappend(pendingUnlinks, entry);
-
- MemoryContextSwitchTo(oldcxt);
- }
- else
- {
- /* Normal case: enter a request to fsync this segment */
- MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
- PendingOperationEntry *entry;
- bool found;
-
- entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
- &rnode,
- HASH_ENTER,
- &found);
- /* if new entry, initialize it */
- if (!found)
- {
- entry->cycle_ctr = mdsync_cycle_ctr;
- MemSet(entry->requests, 0, sizeof(entry->requests));
- MemSet(entry->canceled, 0, sizeof(entry->canceled));
- }
-
- /*
- * NB: it's intentional that we don't change cycle_ctr if the entry
- * already exists. The cycle_ctr must represent the oldest fsync
- * request that could be in the entry.
- */
-
- entry->requests[forknum] = bms_add_member(entry->requests[forknum],
- (int) segno);
-
- MemoryContextSwitchTo(oldcxt);
- }
-}
+ FileTag tag;
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
- if (pendingOpsTable)
- {
- /* standalone backend or startup process: fsync state is local */
- RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
- }
- else if (IsUnderPostmaster)
- {
- /*
- * Notify the checkpointer about it. If we fail to queue the cancel
- * message, we have to sleep and try again ... ugly, but hopefully
- * won't happen often.
- *
- * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
- * error would leave the no-longer-used file still present on disk,
- * which would be bad, so I'm inclined to assume that the checkpointer
- * will always empty the queue soon.
- */
- while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
- pg_usleep(10000L); /* 10 msec seems a good number */
+ INIT_MD_FILETAG(tag, rnode.node, forknum, segno);
- /*
- * Note we don't wait for the checkpointer to actually absorb the
- * cancel message; see mdsync() for the implications.
- */
- }
+ RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
}
/*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
*/
void
-ForgetDatabaseFsyncRequests(Oid dbid)
+ForgetDatabaseSyncRequests(Oid dbid)
{
+ FileTag tag;
RelFileNode rnode;
rnode.dbNode = dbid;
rnode.spcNode = 0;
rnode.relNode = 0;
- if (pendingOpsTable)
- {
- /* standalone backend or startup process: fsync state is local */
- RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
- }
- else if (IsUnderPostmaster)
- {
- /* see notes in ForgetRelationFsyncRequests */
- while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
- FORGET_DATABASE_FSYNC))
- pg_usleep(10000L); /* 10 msec seems a good number */
- }
+ INIT_MD_FILETAG(tag, rnode, InvalidForkNumber, InvalidBlockNumber);
+
+ RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
}
/*
/* note that this calculation will ignore any partial block at EOF */
return (BlockNumber) (len / BLCKSZ);
}
+
+/*
+ * Sync a file to disk, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdsyncfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+ MdfdVec *v;
+ char *p;
+
+ /* Provide the path for informational messages. */
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+	/* Try to open the requested segment. */
+	v = _mdfd_getseg(reln, ftag->forknum,
+					 ftag->segno * (BlockNumber) RELSEG_SIZE, false,
+					 EXTENSION_RETURN_NULL);
+ if (v == NULL)
+ {
+ errno = ENOENT;
+ return -1;
+ }
+
+ /* Try to fsync the file. */
+ return FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC);
+}
+
+/*
+ * Unlink a file, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdunlinkfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+ char *p;
+
+ /* Compute the path. */
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ /* Try to unlink the file. */
+ return unlink(path);
+}
+
+/*
+ * Check if a given candidate request matches a given tag, when processing
+ * a SYNC_FILTER_REQUEST request. This will be called for all pending
+ * requests to find out whether to forget them.
+ */
+bool
+mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+ /*
+ * For now we only use filter requests as a way to drop all scheduled
+ * callbacks relating to a given database, when dropping the database.
+ * We'll return true for all candidates that have the same database OID as
+ * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
+ */
+ return ftag->rnode.dbNode == candidate->rnode.dbNode;
+}
#include "lib/ilist.h"
#include "storage/bufmgr.h"
#include "storage/ipc.h"
+#include "storage/md.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/inval.h"
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
- void (*smgr_pre_ckpt) (void); /* may be NULL */
- void (*smgr_sync) (void); /* may be NULL */
- void (*smgr_post_ckpt) (void); /* may be NULL */
} f_smgr;
-
static const f_smgr smgrsw[] = {
/* magnetic disk */
{
.smgr_nblocks = mdnblocks,
.smgr_truncate = mdtruncate,
.smgr_immedsync = mdimmedsync,
- .smgr_pre_ckpt = mdpreckpt,
- .smgr_sync = mdsync,
- .smgr_post_ckpt = mdpostckpt
}
};
static const int NSmgr = lengthof(smgrsw);
-
/*
* Each backend has a hashtable that stores all extant SMgrRelation objects.
* In addition, "unowned" SMgrRelation objects are chained together in a list.
smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
}
-
-/*
- * smgrpreckpt() -- Prepare for checkpoint.
- */
-void
-smgrpreckpt(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_pre_ckpt)
- smgrsw[i].smgr_pre_ckpt();
- }
-}
-
-/*
- * smgrsync() -- Sync files to disk during checkpoint.
- */
-void
-smgrsync(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_sync)
- smgrsw[i].smgr_sync();
- }
-}
-
-/*
- * smgrpostckpt() -- Post-checkpoint cleanup.
- */
-void
-smgrpostckpt(void)
-{
- int i;
-
- for (i = 0; i < NSmgr; i++)
- {
- if (smgrsw[i].smgr_post_ckpt)
- smgrsw[i].smgr_post_ckpt();
- }
-}
-
/*
* AtEOXact_SMgr
*
--- /dev/null
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/sync
+#
+# IDENTIFICATION
+# src/backend/storage/sync/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/sync
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = sync.o
+
+include $(top_srcdir)/src/backend/common.mk
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sync.c
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/sync/sync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlogutils.h"
+#include "access/xlog.h"
+#include "commands/tablespace.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/inval.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr; /* can be any convenient integer size */
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
+ bool canceled; /* canceled is true if we canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
+
+/* Intervals for calling AbsorbSyncRequests */
+#define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
+
+/*
+ * Function pointers for handling sync and unlink requests.
+ */
+typedef struct SyncOps
+{
+ int (*sync_syncfiletag) (const FileTag *ftag, char *path);
+ int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+ bool (*sync_filetagmatches) (const FileTag *ftag,
+ const FileTag *candidate);
+} SyncOps;
+
+static const SyncOps syncsw[] = {
+ /* magnetic disk */
+ {
+ .sync_syncfiletag = mdsyncfiletag,
+ .sync_unlinkfiletag = mdunlinkfiletag,
+ .sync_filetagmatches = mdfiletagmatches
+ }
+};
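This handler table is what opens the sync machinery to components other
than md.c. A hypothetical second handler -- the SYNC_HANDLER_FOO and foo*
names are invented for illustration and are not part of this patch --
would plug in like this:

```c
/* Hypothetical sketch only: a new component would add SYNC_HANDLER_FOO
 * to SyncRequestHandler in sync.h, stamp tag.handler with that value
 * when registering requests, and supply its callbacks here so that
 * sync.c can route each request back to the right module. */
static const SyncOps syncsw[] = {
	[SYNC_HANDLER_MD] = {
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	},
	[SYNC_HANDLER_FOO] = {
		.sync_syncfiletag = foosyncfiletag,
		.sync_unlinkfiletag = foounlinkfiletag,
		.sync_filetagmatches = foofiletagmatches
	}
};
```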
+
+/*
+ * Initialize data structures for the file sync tracking.
+ */
+void
+InitSync(void)
+{
+ /*
+ * Create pending-operations hashtable if we need it. Currently, we need
+ * it if we are standalone (not under a postmaster) or if we are a startup
+ * or checkpointer auxiliary process.
+ */
+ if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+ {
+ HASHCTL hash_ctl;
+
+ /*
+ * XXX: The checkpointer needs to add entries to the pending ops table
+ * when absorbing fsync requests. That is done within a critical
+ * section, which isn't usually allowed, but we make an exception. It
+ * means that there's a theoretical possibility that you run out of
+ * memory while absorbing fsync requests, which leads to a PANIC.
+ * Fortunately the hash table is small so that's unlikely to happen in
+ * practice.
+ */
+ pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+ "Pending ops context",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+ MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+ hash_ctl.keysize = sizeof(FileTag);
+ hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+ hash_ctl.hcxt = pendingOpsCxt;
+ pendingOps = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pendingUnlinks = NIL;
+ }
+}
+
+/*
+ * SyncPreCheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+SyncPreCheckpoint(void)
+{
+ /*
+ * Any unlink requests arriving after this point will be assigned the next
+ * cycle counter, and won't be unlinked until next checkpoint.
+ */
+ checkpoint_cycle_ctr++;
+}
+
+/*
+ * SyncPostCheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+SyncPostCheckpoint(void)
+{
+ int absorb_counter;
+
+ absorb_counter = UNLINKS_PER_ABSORB;
+ while (pendingUnlinks != NIL)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+ char path[MAXPGPATH];
+
+ /*
+ * New entries are appended to the end, so if the entry is new we've
+ * reached the end of old entries.
+ *
+ * Note: if just the right number of consecutive checkpoints fail, we
+ * could be fooled here by cycle_ctr wraparound. However, the only
+ * consequence is that we'd delay unlinking for one more checkpoint,
+ * which is perfectly tolerable.
+ */
+ if (entry->cycle_ctr == checkpoint_cycle_ctr)
+ break;
+
+ /* Unlink the file */
+ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+ path) < 0)
+ {
+ /*
+ * There's a race condition, when the database is dropped at the
+ * same time that we process the pending unlink requests. If the
+ * DROP DATABASE deletes the file before we do, we will get ENOENT
+ * here. rmtree() also has to ignore ENOENT errors, to deal with
+ * the possibility that we delete the file first.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+
+ /* And remove the list entry */
+ pendingUnlinks = list_delete_first(pendingUnlinks);
+ pfree(entry);
+
+ /*
+	 * As in ProcessSyncRequests, we don't want to stop absorbing fsync
+	 * requests for a long time when there are many deletions to be done.
+	 * We can safely call AbsorbSyncRequests() at this point in the loop
+ * (note it might try to delete list entries).
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = UNLINKS_PER_ABSORB;
+ }
+ }
+}
+
+/*
+ * ProcessSyncRequests() -- Process queued fsync requests.
+ */
+void
+ProcessSyncRequests(void)
+{
+ static bool sync_in_progress = false;
+
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ int absorb_counter;
+
+ /* Statistics on sync times */
+ int processed = 0;
+ instr_time sync_start,
+ sync_end,
+ sync_diff;
+ uint64 elapsed;
+ uint64 longest = 0;
+ uint64 total_elapsed = 0;
+
+ /*
+ * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingOps table.
+ */
+ if (!pendingOps)
+ elog(ERROR, "cannot sync without a pendingOps table");
+
+ /*
+ * If we are in the checkpointer, the sync had better include all fsync
+ * requests that were queued by backends up to this point. The tightest
+ * race condition that could occur is that a buffer that must be written
+ * and fsync'd for the checkpoint could have been dumped by a backend just
+ * before it was visited by BufferSync(). We know the backend will have
+ * queued an fsync request before clearing the buffer's dirtybit, so we
+ * are safe as long as we do an Absorb after completing BufferSync().
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+ * checkpoint), we want to ignore fsync requests that are entered into the
+ * hashtable after this point --- they should be processed next time,
+ * instead. We use sync_cycle_ctr to tell old entries apart from new
+ * ones: new ones will have cycle_ctr equal to the incremented value of
+ * sync_cycle_ctr.
+ *
+ * In normal circumstances, all entries present in the table at this point
+ * will have cycle_ctr exactly equal to the current (about to be old)
+ * value of sync_cycle_ctr. However, if we fail partway through the
+ * fsync'ing loop, then older values of cycle_ctr might remain when we
+ * come back here to try again. Repeated checkpoint failures would
+ * eventually wrap the counter around to the point where an old entry
+ * might appear new, causing us to skip it, possibly allowing a checkpoint
+ * to succeed that should not have. To forestall wraparound, any time the
+	 * previous ProcessSyncRequests() failed to complete, run through the
+ * table and forcibly set cycle_ctr = sync_cycle_ctr.
+ *
+ * Think not to merge this loop with the main loop, as the problem is
+ * exactly that that loop may fail before having visited all the entries.
+ * From a performance point of view it doesn't matter anyway, as this path
+ * will never be taken in a system that's functioning normally.
+ */
+ if (sync_in_progress)
+ {
+ /* prior try failed, so update any stale cycle_ctr values */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ }
+ }
+
+ /* Advance counter so that new hashtable entries are distinguishable */
+ sync_cycle_ctr++;
+
+ /* Set flag to detect failure if we don't reach the end of the loop */
+ sync_in_progress = true;
+
+ /* Now scan the hashtable for fsync requests to process */
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ int failures;
+
+ /*
+ * If fsync is off then we don't have to bother opening the file at
+ * all. (We delay checking until this point so that changing fsync on
+ * the fly behaves sensibly.)
+ */
+ if (!enableFsync)
+ continue;
+
+ /*
+		 * If the entry is new then don't process it this time; it was
+		 * entered after this sync cycle started. Note "continue" bypasses
+		 * the hash-remove call at the bottom of the loop.
+ */
+ if (entry->cycle_ctr == sync_cycle_ctr)
+ continue;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+ /*
+ * If in checkpointer, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. It is
+ * unspecified whether newly-added entries will be visited by
+ * hash_seq_search, but we don't care since we don't need to process
+ * them anyway.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ }
+
+ /*
+ * The fsync table could contain requests to fsync segments that have
+ * been deleted (unlinked) by the time we get to them. Rather than
+ * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
+ * what we do on error is absorb pending requests and then retry.
+ * Since mdunlink() queues a "cancel" message before actually
+ * unlinking, the fsync request is guaranteed to be marked canceled
+ * after the absorb if it really was this case. DROP DATABASE likewise
+ * has to tell us to forget fsync requests before it starts deletions.
+ */
+ for (failures = 0; !entry->canceled; failures++)
+ {
+ char path[MAXPGPATH];
+
+ INSTR_TIME_SET_CURRENT(sync_start);
+ if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
+ path) == 0)
+ {
+ /* Success; update statistics about sync timing */
+ INSTR_TIME_SET_CURRENT(sync_end);
+ sync_diff = sync_end;
+ INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+ if (elapsed > longest)
+ longest = elapsed;
+ total_elapsed += elapsed;
+ processed++;
+
+ if (log_checkpoints)
+ elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
+ processed,
+ path,
+ (double) elapsed / 1000);
+
+ break; /* out of retry loop */
+ }
+
+ /*
+ * It is possible that the relation has been dropped or truncated
+ * since the fsync request was entered. Therefore, allow ENOENT,
+ * but only if we didn't fail already on this file.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ path)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\" but retrying: %m",
+ path)));
+
+ /*
+ * Absorb incoming requests and check to see if a cancel arrived
+ * for this relation fork.
+ */
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+ } /* end retry loop */
+
+ /* We are done with this entry, remove it */
+ if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOps corrupted");
+ } /* end loop over hashtable entries */
+
+ /* Return sync performance metrics for report at checkpoint end */
+ CheckpointStats.ckpt_sync_rels = processed;
+ CheckpointStats.ckpt_longest_sync = longest;
+ CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+ /* Flag successful completion of ProcessSyncRequests */
+ sync_in_progress = false;
+}
+
+/*
+ * RememberSyncRequest() -- callback from checkpointer side of sync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * See sync.h for more information on the types of sync requests supported.
+ */
+void
+RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
+{
+ Assert(pendingOps);
+
+ if (type == SYNC_FORGET_REQUEST)
+ {
+ PendingFsyncEntry *entry;
+
+ /* Cancel previously entered request */
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_FIND,
+ NULL);
+ if (entry != NULL)
+ entry->canceled = true;
+ }
+ else if (type == SYNC_FILTER_REQUEST)
+ {
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ ListCell *cell,
+ *prev,
+ *next;
+
+ /* Cancel matching fsync requests */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ entry->canceled = true;
+ }
+
+ /* Remove matching unlink requests */
+ prev = NULL;
+ for (cell = list_head(pendingUnlinks); cell; cell = next)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+ next = lnext(cell);
+ if (entry->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &entry->tag))
+ {
+ pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
+ pfree(entry);
+ }
+ else
+ prev = cell;
+ }
+ }
+ else if (type == SYNC_UNLINK_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->tag = *ftag;
+ entry->cycle_ctr = checkpoint_cycle_ctr;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ /* Normal case: enter a request to fsync this segment */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingFsyncEntry *entry;
+ bool found;
+
+ Assert(type == SYNC_REQUEST);
+
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ (void *) ftag,
+ HASH_ENTER,
+ &found);
+ /* if new entry, initialize it */
+ if (!found)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ entry->canceled = false;
+ }
+
+ /*
+ * NB: it's intentional that we don't change cycle_ctr if the entry
+ * already exists. The cycle_ctr must represent the oldest fsync
+ * request that could be in the entry.
+ */
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * Register the sync request locally, or forward it to the checkpointer.
+ *
+ * If retryOnError is true, we'll keep trying if there is no space in the
+ * queue. Return true if we succeeded, or false if there wasn't space.
+ */
+bool
+RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError)
+{
+ bool ret;
+
+ if (pendingOps != NULL)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberSyncRequest(ftag, type);
+ return true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Notify the checkpointer about it. If we fail to queue a message
+ * in retryOnError mode, we have to sleep and try again ... ugly, but
+ * hopefully won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
+ * error in the case of SYNC_UNLINK_REQUEST would leave the
+ * no-longer-used file still present on disk, which would be bad, so
+ * I'm inclined to assume that the checkpointer will always empty the
+ * queue soon.
+ */
+ ret = ForwardSyncRequest(ftag, type);
+
+ /*
+ * Break if we succeeded in queueing the request, or if we failed
+ * and were told not to retry.
+ */
+ if (ret || !retryOnError)
+ break;
+
+ pg_usleep(10000L);
+ }
+
+ return ret;
+}
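
To make the intended division of labor concrete, here is a hypothetical md.c caller (a sketch, not part of the patch; the helper name is invented, and it assumes md.c's internal MdfdVec type and the usual smgr/fd includes): it tries to hand the fsync off to the checkpointer without blocking, and falls back to syncing the file itself if the request queue is full.

/*
 * Sketch: register a dirty segment with the checkpointer, or fsync it
 * ourselves if the request queue has no room.
 */
static void
register_dirty_segment_sketch(SMgrRelation reln, ForkNumber forknum,
							  MdfdVec *seg)
{
	FileTag		tag;

	memset(&tag, 0, sizeof(tag));	/* tags are hashed; zero the padding */
	tag.handler = SYNC_HANDLER_MD;
	tag.forknum = forknum;
	tag.rnode = reln->smgr_rnode.node;
	tag.segno = seg->mdfd_segno;

	if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
	{
		/* Queue full: do the fsync here rather than sleeping in a hot path */
		if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not fsync file \"%s\": %m",
							FilePathName(seg->mdfd_vfd))));
	}
}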
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingOps during initialization of the startup
+ * process. Calling this function drops the local pendingOps so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+EnableSyncRequestForwarding(void)
+{
+ /* Perform any pending fsyncs we may have queued up, then drop the table */
+ if (pendingOps)
+ {
+ ProcessSyncRequests();
+ hash_destroy(pendingOps);
+ }
+ pendingOps = NULL;
+
+ /*
+ * We should not have any pending unlink requests, since mdunlink doesn't
+ * queue unlink requests when isRedo.
+ */
+ Assert(pendingUnlinks == NIL);
+}
#include "storage/proc.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
+#include "storage/sync.h"
#include "tcop/tcopprot.h"
#include "utils/acl.h"
#include "utils/fmgroids.h"
/* Do local initialization of file, storage and buffer managers */
InitFileAccess();
+ InitSync();
smgrinit();
InitBufferPoolAccess();
}
#include "storage/block.h"
#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
/* GUC options */
extern void RequestCheckpoint(int flags);
extern void CheckpointWriteDelay(int flags, double progress);
-extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
- BlockNumber segno);
-extern void AbsorbFsyncRequests(void);
+extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
+
+extern void AbsorbSyncRequests(void);
extern Size CheckpointerShmemSize(void);
extern void CheckpointerShmemInit(void);
*/
extern int max_safe_fds;
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get. Ugh. This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting canceled ... see ProcessSyncRequests).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT || (err) == EACCES)
+#endif
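
As a sketch of the intended discipline (illustrative, not part of the patch; the helper name is invented, and pg_fsync is assumed from fd.c): a possibly-deleted file is forgiven only on the first attempt, so the checkpointer gets one chance to absorb a pending cancel request before the failure is treated as a real error.

/*
 * Sketch: sync one file descriptor, tolerating a possibly-deleted file on
 * the first attempt only.  Returns true on success; false means "retry
 * after absorbing more requests"; otherwise errors out.
 */
static bool
sync_file_once_sketch(const char *path, int fd, int prior_failures)
{
	if (pg_fsync(fd) == 0)
		return true;			/* synced successfully */

	if (FILE_POSSIBLY_DELETED(errno) && prior_failures == 0)
		return false;			/* may have been unlinked; check for cancel */

	ereport(ERROR,
			(errcode_for_file_access(),
			 errmsg("could not fsync file \"%s\": %m", path)));
	return false;				/* keep compiler quiet */
}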
/*
* prototypes for functions in fd.c
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * md.h
+ * magnetic disk storage manager public interface declarations.
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/md.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MD_H
+#define MD_H
+
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+
+/* md storage manager functionality */
+extern void mdinit(void);
+extern void mdclose(SMgrRelation reln, ForkNumber forknum);
+extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
+extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
+extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ char *buffer);
+extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
+
+extern void ForgetDatabaseSyncRequests(Oid dbid);
+extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
+
+/* md sync callbacks */
+extern int mdsyncfiletag(const FileTag *ftag, char *path);
+extern int mdunlinkfiletag(const FileTag *ftag, char *path);
+extern bool mdfiletagmatches(const FileTag *ftag, const FileTag *candidate);
+
+#endif /* MD_H */
#include "storage/block.h"
#include "storage/relfilenode.h"
-
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
* cached file handles. An SMgrRelation is created (if not already present)
extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber nblocks);
extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void smgrpreckpt(void);
-extern void smgrsync(void);
-extern void smgrpostckpt(void);
extern void AtEOXact_SMgr(void);
-
-/* internals: move me elsewhere -- ay 7/94 */
-
-/* in md.c */
-extern void mdinit(void);
-extern void mdclose(SMgrRelation reln, ForkNumber forknum);
-extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
-extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
-extern void mdunlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum);
-extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- char *buffer);
-extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, char *buffer, bool skipFsync);
-extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
-extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks);
-extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void mdpreckpt(void);
-extern void mdsync(void);
-extern void mdpostckpt(void);
-
-extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
- BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
-extern void ForgetDatabaseFsyncRequests(Oid dbid);
-extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
-
#endif /* SMGR_H */
--- /dev/null
+/*-------------------------------------------------------------------------
+ *
+ * sync.h
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/sync.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SYNC_H
+#define SYNC_H
+
+#include "storage/relfilenode.h"
+
+/*
+ * Type of sync request. These are used to manage the set of pending
+ * requests to call a sync handler's sync or unlink functions at the next
+ * checkpoint.
+ */
+typedef enum SyncRequestType
+{
+ SYNC_REQUEST, /* schedule a call of sync function */
+ SYNC_UNLINK_REQUEST, /* schedule a call of unlink function */
+ SYNC_FORGET_REQUEST, /* forget all calls for a tag */
+ SYNC_FILTER_REQUEST /* forget all calls satisfying match fn */
+} SyncRequestType;
+
+/*
+ * Which set of functions to use to handle a given request. See the function
+ * table in sync.c.
+ */
+typedef enum SyncRequestHandler
+{
+ SYNC_HANDLER_MD = 0 /* md smgr */
+} SyncRequestHandler;
+
+/*
+ * A tag identifying a file. Currently it has the members required for md.c's
+ * usage, but sync.c has no knowledge of the internal structure, and it is
+ * liable to change as required by future handlers.
+ */
+typedef struct FileTag
+{
+ int16 handler; /* SyncRequestHandler value, saving space */
+ int16 forknum; /* ForkNumber, saving space */
+ RelFileNode rnode;
+ uint32 segno;
+} FileTag;
+
+extern void InitSync(void);
+extern void SyncPreCheckpoint(void);
+extern void SyncPostCheckpoint(void);
+extern void ProcessSyncRequests(void);
+extern void RememberSyncRequest(const FileTag *ftag, SyncRequestType type);
+extern void EnableSyncRequestForwarding(void);
+extern bool RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError);
+
+#endif /* SYNC_H */
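
The handler value indexes a function table in sync.c, so adding a handler means extending SyncRequestHandler and appending a row of callbacks. A sketch of the table's shape (illustrative; the sync_filetagmatches member name is taken from the filter code above, the other two member names are assumptions):

/*
 * Sketch of sync.c's handler table: one set of callbacks per
 * SyncRequestHandler value.
 */
typedef struct SyncOps
{
	int			(*sync_syncfiletag) (const FileTag *ftag, char *path);
	int			(*sync_unlinkfiletag) (const FileTag *ftag, char *path);
	bool		(*sync_filetagmatches) (const FileTag *ftag,
										const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
	/* SYNC_HANDLER_MD: the md storage manager's callbacks from md.h */
	{
		.sync_syncfiletag = mdsyncfiletag,
		.sync_unlinkfiletag = mdunlinkfiletag,
		.sync_filetagmatches = mdfiletagmatches
	}
};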
FileFdwExecutionState
FileFdwPlanState
FileNameMap
+FileTag
FindSplitData
FixedParallelExecutorState
FixedParallelState
PathTarget
Pattern_Prefix_Status
Pattern_Type
-PendingOperationEntry
+PendingFsyncEntry
PendingRelDelete
PendingUnlinkEntry
PendingWriteback
SubscriptionInfo
SubscriptionRelState
Syn
+SyncOps
SyncRepConfigData
+SyncRequestHandler
+SyncRequestType
SysScanDesc
SyscacheCallbackFunction
SystemRowsSamplerData