Fast promote mode skips checkpoint at end of recovery.
author: Simon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
committer: Simon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
pg_ctl promote -m fast will skip the checkpoint at the end of recovery so that
we can achieve very fast failover when the apply delay is low. We write a new
WAL record, XLOG_END_OF_RECOVERY, to allow downstream log readers to switch
timeline correctly. If we skip the synchronous end-of-recovery checkpoint, we
request a normal spread checkpoint instead, so that the window for re-recovery
stays small.

Simon Riggs and Kyotaro Horiguchi, with input from Fujii Masao.
Review by Heikki Linnakangas

src/backend/access/rmgrdesc/xlogdesc.c
src/backend/access/transam/xlog.c
src/bin/pg_ctl/pg_ctl.c
src/include/access/xlog_internal.h
src/include/catalog/pg_control.h

index 506b208c9cfa117b91983d481eb83b0393a79107..69012985161dcadd15cd6bd4d730cfc6e2fc9dd1 100644 (file)
@@ -18,6 +18,7 @@
 #include "access/xlog_internal.h"
 #include "catalog/pg_control.h"
 #include "utils/guc.h"
+#include "utils/timestamp.h"
 
 /*
  * GUC support
@@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
        memcpy(&fpw, rec, sizeof(bool));
        appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
    }
+   else if (info == XLOG_END_OF_RECOVERY)
+   {
+       xl_end_of_recovery xlrec;
+
+       memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
+       appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
+                        xlrec.ThisTimeLineID,
+                        timestamptz_to_str(xlrec.end_time));
+   }
    else
        appendStringInfo(buf, "UNKNOWN");
 }
index cf2f6e70cff9e5fa0bf608183c22746d360e45ad..bcd379dca73253b8c786de1b1fc0f0fcf44a2d18 100644 (file)
@@ -66,6 +66,7 @@
 #define RECOVERY_COMMAND_FILE  "recovery.conf"
 #define RECOVERY_COMMAND_DONE  "recovery.done"
 #define PROMOTE_SIGNAL_FILE "promote"
+#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
 
 
 /* User-settable parameters */
@@ -210,6 +211,9 @@ bool StandbyMode = false;
 static char *PrimaryConnInfo = NULL;
 static char *TriggerFile = NULL;
 
+/* whether request for fast promotion has been made yet */
+static bool fast_promote = false;
+
 /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
@@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
 static void XLogReportParameters(void);
 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
 static void LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 
@@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
           int emode, bool fetching_ckpt);
 static void CheckRecoveryConsistency(void);
 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
-                    XLogRecPtr RecPtr, int whichChkpt);
+                    XLogRecPtr RecPtr, int whichChkpti, bool report);
 static bool rescanLatestTimeLine(void);
 static void WriteControlFile(void);
 static void ReadControlFile(void);
@@ -4848,7 +4853,7 @@ StartupXLOG(void)
         * When a backup_label file is present, we want to roll forward from
         * the checkpoint it identifies, rather than using pg_control.
         */
-       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
+       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
        if (record != NULL)
        {
            memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@@ -4890,7 +4895,7 @@ StartupXLOG(void)
         */
        checkPointLoc = ControlFile->checkPoint;
        RedoStartLSN = ControlFile->checkPointCopy.redo;
-       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
+       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
        if (record != NULL)
        {
            ereport(DEBUG1,
@@ -4909,7 +4914,7 @@ StartupXLOG(void)
        else
        {
            checkPointLoc = ControlFile->prevCheckPoint;
-           record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
+           record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
            if (record != NULL)
            {
                ereport(LOG,
@@ -5393,22 +5398,33 @@ StartupXLOG(void)
                }
 
                /*
-                * Before replaying this record, check if it is a shutdown
-                * checkpoint record that causes the current timeline to
-                * change. The checkpoint record is already considered to be
-                * part of the new timeline, so we update ThisTimeLineID
-                * before replaying it. That's important so that replayEndTLI,
-                * which is recorded as the minimum recovery point's TLI if
+                * Before replaying this record, check if this record
+                * causes the current timeline to change. The record is
+                * already considered to be part of the new timeline,
+                * so we update ThisTimeLineID before replaying it.
+                * That's important so that replayEndTLI, which is
+                * recorded as the minimum recovery point's TLI if
                 * recovery stops after this record, is set correctly.
                 */
-               if (record->xl_rmid == RM_XLOG_ID &&
-                   (record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
+               if (record->xl_rmid == RM_XLOG_ID)
                {
-                   CheckPoint  checkPoint;
-                   TimeLineID  newTLI;
+                   TimeLineID  newTLI = ThisTimeLineID;
+                   uint8       info = record->xl_info & ~XLR_INFO_MASK;
+
+                   if (info == XLOG_CHECKPOINT_SHUTDOWN)
+                   {
+                       CheckPoint  checkPoint;
+
+                       memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+                       newTLI = checkPoint.ThisTimeLineID;
+                   }
+                   else if (info == XLOG_END_OF_RECOVERY)
+                   {
+                       xl_end_of_recovery  xlrec;
 
-                   memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-                   newTLI = checkPoint.ThisTimeLineID;
+                       memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+                       newTLI = xlrec.ThisTimeLineID;
+                   }
 
                    if (newTLI != ThisTimeLineID)
                    {
@@ -5729,9 +5745,36 @@ StartupXLOG(void)
         * allows some extra error checking in xlog_redo.
         */
        if (bgwriterLaunched)
-           RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-                             CHECKPOINT_IMMEDIATE |
-                             CHECKPOINT_WAIT);
+       {
+           bool    checkpoint_wait = true;
+
+           /*
+            * If we've been explicitly promoted with the fast option,
+            * end recovery without a checkpoint if possible.
+            */
+           if (fast_promote)
+           {
+               checkPointLoc = ControlFile->prevCheckPoint;
+               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
+               if (record != NULL)
+               {
+                   checkpoint_wait = false;
+                   CreateEndOfRecoveryRecord();
+               }
+           }
+
+           /*
+            * In most cases we will wait for a full checkpoint to complete.
+            *
+            * If not, issue a normal, non-immediate checkpoint but don't wait.
+            */
+           if (checkpoint_wait)
+               RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+                                   CHECKPOINT_IMMEDIATE |
+                                   CHECKPOINT_WAIT);
+           else
+               RequestCheckpoint(0);   /* No flags */
+       }
        else
            CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
 
@@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
  */
 static XLogRecord *
 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
-                    int whichChkpt)
+                    int whichChkpt, bool report)
 {
    XLogRecord *record;
 
    if (!XRecOffIsValid(RecPtr))
    {
+       if (!report)
+           return NULL;
+
        switch (whichChkpt)
        {
            case 1:
@@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
 
    if (record == NULL)
    {
+       if (!report)
+           return NULL;
+
        switch (whichChkpt)
        {
            case 1:
@@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
    LWLockRelease(CheckpointLock);
 }
 
+/*
+ * Mark the end of recovery in WAL without running a full checkpoint, by
+ * inserting an XLOG_END_OF_RECOVERY record that carries the current
+ * timestamp and ThisTimeLineID.
+ *
+ * Must only be called while recovery is still in progress; otherwise an
+ * ERROR is raised.
+ *
+ * We can expect that a restartpoint is likely to be in progress as we
+ * do this, though we are unwilling to wait for it to complete. So be
+ * careful to avoid taking the CheckpointLock anywhere here.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes so there is no concern of concurrent behaviour.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+   xl_end_of_recovery  xlrec;
+   XLogRecData         rdata;
+
+   /* sanity check */
+   if (!RecoveryInProgress())
+       elog(ERROR, "can only be used to end recovery");
+
+   /*
+    * end_time is a TimestampTz and is later formatted with
+    * timestamptz_to_str(), so it must be filled from GetCurrentTimestamp();
+    * a raw time_t from time(NULL) is not a valid TimestampTz value.
+    */
+   xlrec.end_time = GetCurrentTimestamp();
+   xlrec.ThisTimeLineID = ThisTimeLineID;
+
+   /* We are still in recovery, so WAL insertion must be enabled explicitly */
+   LocalSetXLogInsertAllowed();
+
+   START_CRIT_SECTION();
+
+   /* Single-chunk record: just the xl_end_of_recovery payload, no buffer */
+   rdata.data = (char *) &xlrec;
+   rdata.len = sizeof(xl_end_of_recovery);
+   rdata.buffer = InvalidBuffer;
+   rdata.next = NULL;
+
+   (void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
+
+   END_CRIT_SECTION();
+
+   LocalXLogInsertAllowed = -1;        /* return to "check" state */
+}
+
+
 /*
  * Flush all data in shared memory to disk, and fsync
  *
@@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 
        RecoveryRestartPoint(&checkPoint);
    }
+   else if (info == XLOG_END_OF_RECOVERY)
+   {
+       xl_end_of_recovery xlrec;
+
+       memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+       /*
+        * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+        * but this case is rarer and harder to test, so the benefit doesn't
+        * outweigh the potential extra cost of maintenance.
+        */
+
+       /*
+        * We should've already switched to the new TLI before replaying this
+        * record.
+        */
+       if (xlrec.ThisTimeLineID != ThisTimeLineID)
+           ereport(PANIC,
+                   (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
+                           xlrec.ThisTimeLineID, ThisTimeLineID)));
+   }
    else if (info == XLOG_NOOP)
    {
        /* nothing to do here */
@@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
 
    if (IsPromoteTriggered())
    {
-       ereport(LOG,
+       /*
+        * In 9.1 and 9.2 the postmaster unlinked the promote file
+        * inside the signal handler. We now leave the file in place
+        * and let the Startup process do the unlink. This allows
+        * Startup to know whether we're doing fast or normal
+        * promotion. Fast promotion takes precedence.
+        */
+       if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+       {
+           unlink(FAST_PROMOTE_SIGNAL_FILE);
+           unlink(PROMOTE_SIGNAL_FILE);
+           fast_promote = true;
+       }
+       else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+       {
+           unlink(PROMOTE_SIGNAL_FILE);
+           fast_promote = false;
+       }
+
+       /*
+        * We only look for fast promote via the pg_ctl promote option.
+        * It would be possible to extend trigger file support for the
+        * fast promotion option but that wouldn't be backwards compatible
+        * anyway and we're looking to focus further work on the promote
+        * option as the right way to signal end of recovery.
+        */
+       if (fast_promote)
+           ereport(LOG,
+               (errmsg("received fast promote request")));
+       else
+           ereport(LOG,
                (errmsg("received promote request")));
+
        ResetPromoteTriggered();
        triggered = true;
        return true;
@@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
 {
    struct stat stat_buf;
 
-   if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-   {
-       /*
-        * Since we are in a signal handler, it's not safe to elog. We
-        * silently ignore any error from unlink.
-        */
-       unlink(PROMOTE_SIGNAL_FILE);
+   if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
+       stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
        return true;
-   }
+
    return false;
 }
 
index e412d71dcff8c556653569a973469a6fc48cec9e..e086b1244cc1f3552bc9c123c7aed17982246b3b 100644 (file)
@@ -1136,6 +1136,15 @@ do_promote(void)
        exit(1);
    }
 
+   /*
+    * Use two different kinds of promotion file so we can understand
+    * the difference between smart and fast promotion.
+    */
+   if (shutdown_mode >= FAST_MODE)
+       snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
+   else
+       snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
+
    if ((prmfile = fopen(promote_file, "w")) == NULL)
    {
        write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@@ -1799,7 +1808,7 @@ do_help(void)
             "                 [-o \"OPTIONS\"]\n"), progname);
    printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
    printf(_("  %s status  [-D DATADIR]\n"), progname);
-   printf(_("  %s promote [-D DATADIR] [-s]\n"), progname);
+   printf(_("  %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
    printf(_("  %s kill    SIGNALNAME PID\n"), progname);
 #if defined(WIN32) || defined(__CYGWIN__)
    printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@@ -1828,7 +1837,7 @@ do_help(void)
    printf(_("  -o OPTIONS             command line options to pass to postgres\n"
     "                         (PostgreSQL server executable) or initdb\n"));
    printf(_("  -p PATH-TO-POSTGRES    normally not necessary\n"));
-   printf(_("\nOptions for stop or restart:\n"));
+   printf(_("\nOptions for stop, restart or promote:\n"));
    printf(_("  -m, --mode=MODE        MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
 
    printf(_("\nShutdown modes are:\n"));
@@ -1836,6 +1845,10 @@ do_help(void)
    printf(_("  fast        quit directly, with proper shutdown\n"));
    printf(_("  immediate   quit without complete shutdown; will lead to recovery on restart\n"));
 
+   printf(_("\nPromotion modes are:\n"));
+   printf(_("  smart       promote after performing a checkpoint\n"));
+   printf(_("  fast        promote quickly without waiting for checkpoint completion\n"));
+
    printf(_("\nAllowed signal names for kill:\n"));
    printf("  ABRT HUP INT QUIT TERM USR1 USR2\n");
 
@@ -2271,7 +2284,6 @@ main(int argc, char **argv)
        snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
        snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
        snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
-       snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
    }
 
    switch (ctl_command)
index 43e1e60f9bf82aa971938b4dd9bc3946885fe974..ce9957e618f7768352ad2e66416be3d33041a49e 100644 (file)
@@ -217,6 +217,12 @@ typedef struct xl_restore_point
    char        rp_name[MAXFNAMELEN];
 } xl_restore_point;
 
+/*
+ * End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint.
+ * This is the payload of an XLOG_END_OF_RECOVERY record; the field layout
+ * is what gets written to WAL, so do not reorder the members.
+ */
+typedef struct xl_end_of_recovery
+{
+   TimestampTz end_time;           /* when the record was created */
+   TimeLineID  ThisTimeLineID;     /* timeline in effect once recovery ended */
+} xl_end_of_recovery;
 
 /*
  * XLogRecord is defined in xlog.h, but we avoid #including that to keep
index e4a9abe7bc55d21b83a08143c0b5caf3e828b7f5..ec8cea7c86e749c3bde19063664a64bdd56f77ed 100644 (file)
@@ -64,6 +64,7 @@ typedef struct CheckPoint
 #define XLOG_PARAMETER_CHANGE          0x60
 #define XLOG_RESTORE_POINT             0x70
 #define XLOG_FPW_CHANGE                0x80
+#define XLOG_END_OF_RECOVERY           0x90
 
 
 /*