Defer flushing of SLRU files.
authorThomas Munro <tmunro@postgresql.org>
Fri, 25 Sep 2020 06:49:43 +0000 (18:49 +1200)
committerThomas Munro <tmunro@postgresql.org>
Fri, 25 Sep 2020 07:00:15 +0000 (19:00 +1200)
Previously, we called fsync() after writing out individual pg_xact,
pg_multixact and pg_commit_ts pages due to cache pressure, leading to
regular I/O stalls in user backends and recovery.  Collapse requests for
the same file into a single system call as part of the next checkpoint,
as we already did for relation files, using the infrastructure developed
by commit 3eb77eba.  This can cause a significant improvement to
recovery performance, especially when it's otherwise CPU-bound.

Hoist ProcessSyncRequests() up into CheckPointGuts() to make it clearer
that it applies to all the SLRU mini-buffer-pools as well as the main
buffer pool.  Rearrange things so that data collected in CheckpointStats
includes SLRU activity.

Also remove the Shutdown{CLOG,CommitTS,SUBTRANS,MultiXact}() functions,
because they were redundant after the shutdown checkpoint that
immediately precedes them.  (I'm not sure if they were ever needed, but
they aren't now.)

Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> (parts)
Tested-by: Jakub Wartak <Jakub.Wartak@tomtom.com>
Discussion: https://postgr.es/m/CA+hUKGLJ=84YT+NvhkEEDAuUtVHMfQ9i-N7k_o50JmQ6Rpj_OQ@mail.gmail.com

16 files changed:
src/backend/access/transam/clog.c
src/backend/access/transam/commit_ts.c
src/backend/access/transam/multixact.c
src/backend/access/transam/slru.c
src/backend/access/transam/subtrans.c
src/backend/access/transam/xlog.c
src/backend/commands/async.c
src/backend/storage/buffer/bufmgr.c
src/backend/storage/lmgr/predicate.c
src/backend/storage/sync/sync.c
src/include/access/clog.h
src/include/access/commit_ts.h
src/include/access/multixact.h
src/include/access/slru.h
src/include/storage/sync.h
src/tools/pgindent/typedefs.list

index 9e352d26583541647facb81e76d6d5711d8a0bd0..034349aa7b986fdc63d1fde66a24896e2fe8e463 100644 (file)
@@ -42,6 +42,7 @@
 #include "pg_trace.h"
 #include "pgstat.h"
 #include "storage/proc.h"
+#include "storage/sync.h"
 
 /*
  * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
@@ -691,7 +692,8 @@ CLOGShmemInit(void)
 {
        XactCtl->PagePrecedes = CLOGPagePrecedes;
        SimpleLruInit(XactCtl, "Xact", CLOGShmemBuffers(), CLOG_LSNS_PER_PAGE,
-                                 XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER);
+                                 XactSLRULock, "pg_xact", LWTRANCHE_XACT_BUFFER,
+                                 SYNC_HANDLER_CLOG);
 }
 
 /*
@@ -808,34 +810,19 @@ TrimCLOG(void)
        LWLockRelease(XactSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownCLOG(void)
-{
-       /* Flush dirty CLOG pages to disk */
-       TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(false);
-       SimpleLruFlush(XactCtl, false);
-
-       /*
-        * fsync pg_xact to ensure that any files flushed previously are durably
-        * on disk.
-        */
-       fsync_fname("pg_xact", true);
-
-       TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(false);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
 void
 CheckPointCLOG(void)
 {
-       /* Flush dirty CLOG pages to disk */
+       /*
+        * Write dirty CLOG pages to disk.  This may result in sync requests
+        * queued for later handling by ProcessSyncRequests(), as part of the
+        * checkpoint.
+        */
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_START(true);
-       SimpleLruFlush(XactCtl, true);
+       SimpleLruWriteAll(XactCtl, true);
        TRACE_POSTGRESQL_CLOG_CHECKPOINT_DONE(true);
 }
 
@@ -1026,3 +1013,12 @@ clog_redo(XLogReaderState *record)
        else
                elog(PANIC, "clog_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync clog files.
+ */
+int
+clogsyncfiletag(const FileTag *ftag, char *path)
+{
+       return SlruSyncFileTag(XactCtl, ftag, path);
+}
index f6a7329ba3a5dc75c29b9f5aff3a0d7366210ce0..cb8a968801873da22259162c7750d1851b9e4ed8 100644 (file)
@@ -555,7 +555,8 @@ CommitTsShmemInit(void)
        CommitTsCtl->PagePrecedes = CommitTsPagePrecedes;
        SimpleLruInit(CommitTsCtl, "CommitTs", CommitTsShmemBuffers(), 0,
                                  CommitTsSLRULock, "pg_commit_ts",
-                                 LWTRANCHE_COMMITTS_BUFFER);
+                                 LWTRANCHE_COMMITTS_BUFFER,
+                                 SYNC_HANDLER_COMMIT_TS);
 
        commitTsShared = ShmemInitStruct("CommitTs shared",
                                                                         sizeof(CommitTimestampShared),
@@ -798,30 +799,18 @@ DeactivateCommitTs(void)
        LWLockRelease(CommitTsSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownCommitTs(void)
-{
-       /* Flush dirty CommitTs pages to disk */
-       SimpleLruFlush(CommitTsCtl, false);
-
-       /*
-        * fsync pg_commit_ts to ensure that any files flushed previously are
-        * durably on disk.
-        */
-       fsync_fname("pg_commit_ts", true);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
 void
 CheckPointCommitTs(void)
 {
-       /* Flush dirty CommitTs pages to disk */
-       SimpleLruFlush(CommitTsCtl, true);
+       /*
+        * Write dirty CommitTs pages to disk.  This may result in sync requests
+        * queued for later handling by ProcessSyncRequests(), as part of the
+        * checkpoint.
+        */
+       SimpleLruWriteAll(CommitTsCtl, true);
 }
 
 /*
@@ -1077,3 +1066,12 @@ commit_ts_redo(XLogReaderState *record)
        else
                elog(PANIC, "commit_ts_redo: unknown op code %u", info);
 }
+
+/*
+ * Entrypoint for sync.c to sync commit_ts files.
+ */
+int
+committssyncfiletag(const FileTag *ftag, char *path)
+{
+       return SlruSyncFileTag(CommitTsCtl, ftag, path);
+}
index b8bedca04a4dae61e2e263cc2e20470cd669314c..a2ce617c8ce2b0094aba00d973bcbb54b9463cbf 100644 (file)
@@ -1831,11 +1831,13 @@ MultiXactShmemInit(void)
        SimpleLruInit(MultiXactOffsetCtl,
                                  "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
                                  MultiXactOffsetSLRULock, "pg_multixact/offsets",
-                                 LWTRANCHE_MULTIXACTOFFSET_BUFFER);
+                                 LWTRANCHE_MULTIXACTOFFSET_BUFFER,
+                                 SYNC_HANDLER_MULTIXACT_OFFSET);
        SimpleLruInit(MultiXactMemberCtl,
                                  "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
                                  MultiXactMemberSLRULock, "pg_multixact/members",
-                                 LWTRANCHE_MULTIXACTMEMBER_BUFFER);
+                                 LWTRANCHE_MULTIXACTMEMBER_BUFFER,
+                                 SYNC_HANDLER_MULTIXACT_MEMBER);
 
        /* Initialize our shared state struct */
        MultiXactState = ShmemInitStruct("Shared MultiXact State",
@@ -2100,19 +2102,6 @@ TrimMultiXact(void)
        SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownMultiXact(void)
-{
-       /* Flush dirty MultiXact pages to disk */
-       TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(false);
-       SimpleLruFlush(MultiXactOffsetCtl, false);
-       SimpleLruFlush(MultiXactMemberCtl, false);
-       TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(false);
-}
-
 /*
  * Get the MultiXact data to save in a checkpoint record
  */
@@ -2143,9 +2132,13 @@ CheckPointMultiXact(void)
 {
        TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_START(true);
 
-       /* Flush dirty MultiXact pages to disk */
-       SimpleLruFlush(MultiXactOffsetCtl, true);
-       SimpleLruFlush(MultiXactMemberCtl, true);
+       /*
+        * Write dirty MultiXact pages to disk.  This may result in sync requests
+        * queued for later handling by ProcessSyncRequests(), as part of the
+        * checkpoint.
+        */
+       SimpleLruWriteAll(MultiXactOffsetCtl, true);
+       SimpleLruWriteAll(MultiXactMemberCtl, true);
 
        TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
 }
@@ -2728,14 +2721,10 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
        entryno = MultiXactIdToOffsetEntry(multi);
 
        /*
-        * Flush out dirty data, so PhysicalPageExists can work correctly.
-        * SimpleLruFlush() is a pretty big hammer for that.  Alternatively we
-        * could add an in-memory version of page exists, but find_multixact_start
-        * is called infrequently, and it doesn't seem bad to flush buffers to
-        * disk before truncation.
+        * Write out dirty data, so PhysicalPageExists can work correctly.
         */
-       SimpleLruFlush(MultiXactOffsetCtl, true);
-       SimpleLruFlush(MultiXactMemberCtl, true);
+       SimpleLruWriteAll(MultiXactOffsetCtl, true);
+       SimpleLruWriteAll(MultiXactMemberCtl, true);
 
        if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
                return false;
@@ -3386,3 +3375,21 @@ pg_get_multixact_members(PG_FUNCTION_ARGS)
 
        SRF_RETURN_DONE(funccxt);
 }
+
+/*
+ * Entrypoint for sync.c to sync offsets files.
+ */
+int
+multixactoffsetssyncfiletag(const FileTag *ftag, char *path)
+{
+       return SlruSyncFileTag(MultiXactOffsetCtl, ftag, path);
+}
+
+/*
+ * Entrypoint for sync.c to sync members files.
+ */
+int
+multixactmemberssyncfiletag(const FileTag *ftag, char *path)
+{
+       return SlruSyncFileTag(MultiXactMemberCtl, ftag, path);
+}
index fe7d759a8c1cceaf8116a2f51888a819218b5393..16a78986971ff51085c7ea94a5cc7517e834e33a 100644 (file)
        snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
 
 /*
- * During SimpleLruFlush(), we will usually not need to write/fsync more
- * than one or two physical files, but we may need to write several pages
- * per file.  We can consolidate the I/O requests by leaving files open
- * until control returns to SimpleLruFlush().  This data structure remembers
- * which files are open.
+ * During SimpleLruWriteAll(), we will usually not need to write more than one
+ * or two physical files, but we may need to write several pages per file.  We
+ * can consolidate the I/O requests by leaving files open until control returns
+ * to SimpleLruWriteAll().  This data structure remembers which files are open.
  */
-#define MAX_FLUSH_BUFFERS      16
+#define MAX_WRITEALL_BUFFERS   16
 
-typedef struct SlruFlushData
+typedef struct SlruWriteAllData
 {
        int                     num_files;              /* # files actually open */
-       int                     fd[MAX_FLUSH_BUFFERS];  /* their FD's */
-       int                     segno[MAX_FLUSH_BUFFERS];       /* their log seg#s */
-} SlruFlushData;
+       int                     fd[MAX_WRITEALL_BUFFERS];       /* their FD's */
+       int                     segno[MAX_WRITEALL_BUFFERS];    /* their log seg#s */
+} SlruWriteAllData;
 
-typedef struct SlruFlushData *SlruFlush;
+typedef struct SlruWriteAllData *SlruWriteAll;
+
+/*
+ * Populate a file tag describing a segment file.  We only use the segment
+ * number, since we can derive everything else we need by having separate
+ * sync handler functions for clog, multixact etc.
+ */
+#define INIT_SLRUFILETAG(a,xx_handler,xx_segno) \
+( \
+       memset(&(a), 0, sizeof(FileTag)), \
+       (a).handler = (xx_handler), \
+       (a).segno = (xx_segno) \
+)
 
 /*
  * Macro to mark a buffer slot "most recently used".  Note multiple evaluation
@@ -125,10 +136,10 @@ static int        slru_errno;
 
 static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
 static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
-static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
+static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
 static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
 static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
-                                                                 SlruFlush fdata);
+                                                                 SlruWriteAll fdata);
 static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
 static int     SlruSelectLRUPage(SlruCtl ctl, int pageno);
 
@@ -173,7 +184,8 @@ SimpleLruShmemSize(int nslots, int nlsns)
  */
 void
 SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-                         LWLock *ctllock, const char *subdir, int tranche_id)
+                         LWLock *ctllock, const char *subdir, int tranche_id,
+                         SyncRequestHandler sync_handler)
 {
        SlruShared      shared;
        bool            found;
@@ -251,7 +263,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
         * assume caller set PagePrecedes.
         */
        ctl->shared = shared;
-       ctl->do_fsync = true;           /* default behavior */
+       ctl->sync_handler = sync_handler;
        strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
 }
 
@@ -523,7 +535,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
  * Control lock must be held at entry, and will be held at exit.
  */
 static void
-SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
+SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
 {
        SlruShared      shared = ctl->shared;
        int                     pageno = shared->page_number[slotno];
@@ -587,6 +599,10 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
        /* Now it's okay to ereport if we failed */
        if (!ok)
                SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+       /* If part of a checkpoint, count this as a buffer written. */
+       if (fdata)
+               CheckpointStats.ckpt_bufs_written++;
 }
 
 /*
@@ -730,13 +746,13 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  *
  * For now, assume it's not worth keeping a file pointer open across
  * independent read/write operations.  We do batch operations during
- * SimpleLruFlush, though.
+ * SimpleLruWriteAll, though.
  *
  * fdata is NULL for a standalone write, pointer to open-file info during
- * SimpleLruFlush.
+ * SimpleLruWriteAll.
  */
 static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruWriteAll fdata)
 {
        SlruShared      shared = ctl->shared;
        int                     segno = pageno / SLRU_PAGES_PER_SEGMENT;
@@ -791,7 +807,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
        }
 
        /*
-        * During a Flush, we may already have the desired file open.
+        * During a WriteAll, we may already have the desired file open.
         */
        if (fdata)
        {
@@ -837,7 +853,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 
                if (fdata)
                {
-                       if (fdata->num_files < MAX_FLUSH_BUFFERS)
+                       if (fdata->num_files < MAX_WRITEALL_BUFFERS)
                        {
                                fdata->fd[fdata->num_files] = fd;
                                fdata->segno[fdata->num_files] = segno;
@@ -870,23 +886,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
        }
        pgstat_report_wait_end();
 
-       /*
-        * If not part of Flush, need to fsync now.  We assume this happens
-        * infrequently enough that it's not a performance issue.
-        */
-       if (!fdata)
+       /* Queue up a sync request for the checkpointer. */
+       if (ctl->sync_handler != SYNC_HANDLER_NONE)
        {
-               pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
-               if (ctl->do_fsync && pg_fsync(fd) != 0)
+               FileTag         tag;
+
+               INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+               if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
                {
+                       /* No space to enqueue sync request.  Do it synchronously. */
+                       pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC);
+                       if (pg_fsync(fd) != 0)
+                       {
+                               pgstat_report_wait_end();
+                               slru_errcause = SLRU_FSYNC_FAILED;
+                               slru_errno = errno;
+                               CloseTransientFile(fd);
+                               return false;
+                       }
                        pgstat_report_wait_end();
-                       slru_errcause = SLRU_FSYNC_FAILED;
-                       slru_errno = errno;
-                       CloseTransientFile(fd);
-                       return false;
                }
-               pgstat_report_wait_end();
+       }
 
+       /* Close file, unless part of flush request. */
+       if (!fdata)
+       {
                if (CloseTransientFile(fd) != 0)
                {
                        slru_errcause = SLRU_CLOSE_FAILED;
@@ -1122,13 +1146,16 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 }
 
 /*
- * Flush dirty pages to disk during checkpoint or database shutdown
+ * Write dirty pages to disk during checkpoint or database shutdown.  Flushing
+ * is deferred until the next call to ProcessSyncRequests(), though we do fsync
+ * the containing directory here to make sure that newly created directory
+ * entries are on disk.
  */
 void
-SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
+SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
 {
        SlruShared      shared = ctl->shared;
-       SlruFlushData fdata;
+       SlruWriteAllData fdata;
        int                     slotno;
        int                     pageno = 0;
        int                     i;
@@ -1162,21 +1189,11 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
        LWLockRelease(shared->ControlLock);
 
        /*
-        * Now fsync and close any files that were open
+        * Now close any files that were open
         */
        ok = true;
        for (i = 0; i < fdata.num_files; i++)
        {
-               pgstat_report_wait_start(WAIT_EVENT_SLRU_FLUSH_SYNC);
-               if (ctl->do_fsync && pg_fsync(fdata.fd[i]) != 0)
-               {
-                       slru_errcause = SLRU_FSYNC_FAILED;
-                       slru_errno = errno;
-                       pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
-                       ok = false;
-               }
-               pgstat_report_wait_end();
-
                if (CloseTransientFile(fdata.fd[i]) != 0)
                {
                        slru_errcause = SLRU_CLOSE_FAILED;
@@ -1189,7 +1206,7 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied)
                SlruReportIOError(ctl, pageno, InvalidTransactionId);
 
        /* Ensure that directory entries for new files are on disk. */
-       if (ctl->do_fsync)
+       if (ctl->sync_handler != SYNC_HANDLER_NONE)
                fsync_fname(ctl->Dir, true);
 }
 
@@ -1350,6 +1367,19 @@ restart:
        snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno);
        ereport(DEBUG2,
                        (errmsg("removing file \"%s\"", path)));
+
+       /*
+        * Tell the checkpointer to forget any sync requests, before we unlink the
+        * file.
+        */
+       if (ctl->sync_handler != SYNC_HANDLER_NONE)
+       {
+               FileTag         tag;
+
+               INIT_SLRUFILETAG(tag, ctl->sync_handler, segno);
+               RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true);
+       }
+
        unlink(path);
 
        LWLockRelease(shared->ControlLock);
@@ -1448,3 +1478,31 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 
        return retval;
 }
+
+/*
+ * Individual SLRUs (clog, ...) have to provide a sync.c handler function so
+ * that they can provide the correct "SlruCtl" (otherwise we don't know how to
+ * build the path), but they just forward to this common implementation that
+ * performs the fsync.
+ */
+int
+SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
+{
+       int                     fd;
+       int                     save_errno;
+       int                     result;
+
+       SlruFileName(ctl, path, ftag->segno);
+
+       fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+       if (fd < 0)
+               return -1;
+
+       result = pg_fsync(fd);
+       save_errno = errno;
+
+       CloseTransientFile(fd);
+
+       errno = save_errno;
+       return result;
+}
index a50f60b99af280e17813501142d4321656593b59..0111e867c79a27cbbae61450cbcfdd8655282f2b 100644 (file)
@@ -193,9 +193,7 @@ SUBTRANSShmemInit(void)
        SubTransCtl->PagePrecedes = SubTransPagePrecedes;
        SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
                                  SubtransSLRULock, "pg_subtrans",
-                                 LWTRANCHE_SUBTRANS_BUFFER);
-       /* Override default assumption that writes should be fsync'd */
-       SubTransCtl->do_fsync = false;
+                                 LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
 }
 
 /*
@@ -278,23 +276,6 @@ StartupSUBTRANS(TransactionId oldestActiveXID)
        LWLockRelease(SubtransSLRULock);
 }
 
-/*
- * This must be called ONCE during postmaster or standalone-backend shutdown
- */
-void
-ShutdownSUBTRANS(void)
-{
-       /*
-        * Flush dirty SUBTRANS pages to disk
-        *
-        * This is not actually necessary from a correctness point of view. We do
-        * it merely as a debugging aid.
-        */
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false);
-       SimpleLruFlush(SubTransCtl, false);
-       TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false);
-}
-
 /*
  * Perform a checkpoint --- either during shutdown, or on-the-fly
  */
@@ -302,14 +283,14 @@ void
 CheckPointSUBTRANS(void)
 {
        /*
-        * Flush dirty SUBTRANS pages to disk
+        * Write dirty SUBTRANS pages to disk
         *
         * This is not actually necessary from a correctness point of view. We do
         * it merely to improve the odds that writing of dirty pages is done by
         * the checkpoint process and not by backends.
         */
        TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true);
-       SimpleLruFlush(SubTransCtl, true);
+       SimpleLruWriteAll(SubTransCtl, true);
        TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true);
 }
 
index 61754312e269088c84138cc04f9ffc9cac55fc30..79a77ebbfe24dba01cbf347a3d9d33665f128fd8 100644 (file)
@@ -8528,10 +8528,6 @@ ShutdownXLOG(int code, Datum arg)
 
                CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
        }
-       ShutdownCLOG();
-       ShutdownCommitTs();
-       ShutdownSUBTRANS();
-       ShutdownMultiXact();
 }
 
 /*
@@ -9176,17 +9172,29 @@ CreateEndOfRecoveryRecord(void)
 static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
-       CheckPointCLOG();
-       CheckPointCommitTs();
-       CheckPointSUBTRANS();
-       CheckPointMultiXact();
-       CheckPointPredicate();
        CheckPointRelationMap();
        CheckPointReplicationSlots();
        CheckPointSnapBuild();
        CheckPointLogicalRewriteHeap();
-       CheckPointBuffers(flags);       /* performs all required fsyncs */
        CheckPointReplicationOrigin();
+
+       /* Write out all dirty data in SLRUs and the main buffer pool */
+       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
+       CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
+       CheckPointCLOG();
+       CheckPointCommitTs();
+       CheckPointSUBTRANS();
+       CheckPointMultiXact();
+       CheckPointPredicate();
+       CheckPointBuffers(flags);
+
+       /* Perform all queued up fsyncs */
+       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
+       CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
+       ProcessSyncRequests();
+       CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
+       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
+
        /* We deliberately delay 2PC checkpointing as long as possible */
        CheckPointTwoPhase(checkPointRedo);
 }
index cb341365df4652d0d7e00037deb1452280ffa9da..8dbcace3f931577e327e2492791a1fa4d1e47ea5 100644 (file)
@@ -554,9 +554,8 @@ AsyncShmemInit(void)
         */
        NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
        SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
-                                 NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER);
-       /* Override default assumption that writes should be fsync'd */
-       NotifyCtl->do_fsync = false;
+                                 NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
+                                 SYNC_HANDLER_NONE);
 
        if (!found)
        {
index a2a963bd5b41f7e6d9ea3633fe9e1ee1a78e4e01..e549fa1d309fe5021e58877bbd21f80f9c0070d5 100644 (file)
@@ -2636,14 +2636,7 @@ PrintBufferLeakWarning(Buffer buffer)
 void
 CheckPointBuffers(int flags)
 {
-       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
-       CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
        BufferSync(flags);
-       CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
-       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-       ProcessSyncRequests();
-       CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
-       TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 }
 
 
index a2f8e7524b499b26407895e95f96cd66fcc6099d..8a365b400c6b6a5120582889b50b2c5b6a34c60d 100644 (file)
@@ -821,9 +821,7 @@ SerialInit(void)
        SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
        SimpleLruInit(SerialSlruCtl, "Serial",
                                  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
-                                 LWTRANCHE_SERIAL_BUFFER);
-       /* Override default assumption that writes should be fsync'd */
-       SerialSlruCtl->do_fsync = false;
+                                 LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
 
        /*
         * Create or attach to the SerialControl structure.
@@ -1052,7 +1050,7 @@ CheckPointPredicate(void)
        SimpleLruTruncate(SerialSlruCtl, tailPage);
 
        /*
-        * Flush dirty SLRU pages to disk
+        * Write dirty SLRU pages to disk
         *
         * This is not actually necessary from a correctness point of view. We do
         * it merely as a debugging aid.
@@ -1061,7 +1059,7 @@ CheckPointPredicate(void)
         * before deleting the file in which they sit, which would be completely
         * pointless.
         */
-       SimpleLruFlush(SerialSlruCtl, true);
+       SimpleLruWriteAll(SerialSlruCtl, true);
 }
 
 /*------------------------------------------------------------------------*/
index 3ded2cdd716bc2cdc49604c9f5f61e682e2b34ec..1d635d596cad50ce013de0e648b3f292063810fa 100644 (file)
@@ -18,6 +18,9 @@
 #include <fcntl.h>
 #include <sys/file.h>
 
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
@@ -90,12 +93,31 @@ typedef struct SyncOps
                                                                                const FileTag *candidate);
 } SyncOps;
 
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
 static const SyncOps syncsw[] = {
        /* magnetic disk */
-       {
+       [SYNC_HANDLER_MD] = {
                .sync_syncfiletag = mdsyncfiletag,
                .sync_unlinkfiletag = mdunlinkfiletag,
                .sync_filetagmatches = mdfiletagmatches
+       },
+       /* pg_xact */
+       [SYNC_HANDLER_CLOG] = {
+               .sync_syncfiletag = clogsyncfiletag
+       },
+       /* pg_commit_ts */
+       [SYNC_HANDLER_COMMIT_TS] = {
+               .sync_syncfiletag = committssyncfiletag
+       },
+       /* pg_multixact/offsets */
+       [SYNC_HANDLER_MULTIXACT_OFFSET] = {
+               .sync_syncfiletag = multixactoffsetssyncfiletag
+       },
+       /* pg_multixact/members */
+       [SYNC_HANDLER_MULTIXACT_MEMBER] = {
+               .sync_syncfiletag = multixactmemberssyncfiletag
        }
 };
 
@@ -505,8 +527,8 @@ RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
                                                                                                  (void *) ftag,
                                                                                                  HASH_ENTER,
                                                                                                  &found);
-               /* if new entry, initialize it */
-               if (!found)
+               /* if new entry, or was previously canceled, initialize it */
+               if (!found || entry->canceled)
                {
                        entry->cycle_ctr = sync_cycle_ctr;
                        entry->canceled = false;
index 2db8acb189f5ce79afc1646a965483647fb10762..6c840cbf299b383a1bda10879be2c0bea2bdf2db 100644 (file)
@@ -12,6 +12,7 @@
 #define CLOG_H
 
 #include "access/xlogreader.h"
+#include "storage/sync.h"
 #include "lib/stringinfo.h"
 
 /*
@@ -50,6 +51,8 @@ extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact, Oid oldestxid_datoid);
 
+extern int     clogsyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define CLOG_ZEROPAGE          0x00
 #define CLOG_TRUNCATE          0x10
index 2740c02a84fa86098ccc50f64fed5d83e9a3da6f..2d1724952257a15c8048287f7728ef76c48fdca7 100644 (file)
@@ -14,6 +14,7 @@
 #include "access/xlog.h"
 #include "datatype/timestamp.h"
 #include "replication/origin.h"
+#include "storage/sync.h"
 #include "utils/guc.h"
 
 
@@ -45,6 +46,8 @@ extern void SetCommitTsLimit(TransactionId oldestXact,
                                                         TransactionId newestXact);
 extern void AdvanceOldestCommitTsXid(TransactionId oldestXact);
 
+extern int     committssyncfiletag(const FileTag *ftag, char *path);
+
 /* XLOG stuff */
 #define COMMIT_TS_ZEROPAGE             0x00
 #define COMMIT_TS_TRUNCATE             0x10
index 6d729008c600604ca09e4929ac8c5b40179267cf..58c42ffe1fe66faf3b27e304b68904cc66536c6f 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
+#include "storage/sync.h"
 
 
 /*
@@ -116,6 +117,9 @@ extern bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
 extern bool MultiXactIdPrecedesOrEquals(MultiXactId multi1,
                                                                                MultiXactId multi2);
 
+extern int     multixactoffsetssyncfiletag(const FileTag *ftag, char *path);
+extern int     multixactmemberssyncfiletag(const FileTag *ftag, char *path);
+
 extern void AtEOXact_MultiXact(void);
 extern void AtPrepare_MultiXact(void);
 extern void PostPrepare_MultiXact(TransactionId xid);
index 61fbc80ef0d6ae9f83863c1a04361309ed8031da..b39b43504d80c3e6cde86c50f8cb273750ae098d 100644 (file)
@@ -15,6 +15,7 @@
 
 #include "access/xlogdefs.h"
 #include "storage/lwlock.h"
+#include "storage/sync.h"
 
 
 /*
@@ -111,10 +112,10 @@ typedef struct SlruCtlData
        SlruShared      shared;
 
        /*
-        * This flag tells whether to fsync writes (true for pg_xact and multixact
-        * stuff, false for pg_subtrans and pg_notify).
+        * Which sync handler function to use when handing sync requests over to
+        * the checkpointer.  SYNC_HANDLER_NONE to disable fsync (eg pg_notify).
         */
-       bool            do_fsync;
+       SyncRequestHandler sync_handler;
 
        /*
         * Decide which of two page numbers is "older" for truncation purposes. We
@@ -135,14 +136,15 @@ typedef SlruCtlData *SlruCtl;
 
 extern Size SimpleLruShmemSize(int nslots, int nlsns);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
-                                                 LWLock *ctllock, const char *subdir, int tranche_id);
+                                                 LWLock *ctllock, const char *subdir, int tranche_id,
+                                                 SyncRequestHandler sync_handler);
 extern int     SimpleLruZeroPage(SlruCtl ctl, int pageno);
 extern int     SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
                                                          TransactionId xid);
 extern int     SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
                                                                           TransactionId xid);
 extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
-extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied);
+extern void SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
 extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
 
@@ -151,6 +153,8 @@ typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
 extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data);
 extern void SlruDeleteSegment(SlruCtl ctl, int segno);
 
+extern int     SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path);
+
 /* SlruScanDirectory public callbacks */
 extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename,
                                                                                int segpage, void *data);
index e16ab8e711c40deac71e49019908631b37f35d25..f32e412e751d9cef6d3fc1f041abd592c0f84445 100644 (file)
@@ -34,7 +34,12 @@ typedef enum SyncRequestType
  */
 typedef enum SyncRequestHandler
 {
-       SYNC_HANDLER_MD = 0                     /* md smgr */
+       SYNC_HANDLER_MD = 0,
+       SYNC_HANDLER_CLOG,
+       SYNC_HANDLER_COMMIT_TS,
+       SYNC_HANDLER_MULTIXACT_OFFSET,
+       SYNC_HANDLER_MULTIXACT_MEMBER,
+       SYNC_HANDLER_NONE
 } SyncRequestHandler;
 
 /*
index b1afb345c36851b22b1535d0a098f3a93b8f9d61..9cd1179af63cdcbb78b71906cdc0a7ecad829754 100644 (file)
@@ -2295,12 +2295,12 @@ SlotNumber
 SlruCtl
 SlruCtlData
 SlruErrorCause
-SlruFlush
-SlruFlushData
 SlruPageStatus
 SlruScanCallback
 SlruShared
 SlruSharedData
+SlruWriteAll
+SlruWriteAllData
 SnapBuild
 SnapBuildOnDisk
 SnapBuildState
@@ -2404,6 +2404,7 @@ Syn
 SyncOps
 SyncRepConfigData
 SyncRepStandbyData
+SyncRequestHandler
 SyncRequestType
 SysScanDesc
 SyscacheCallbackFunction