Fast promote mode skips checkpoint at end of recovery.

author Simon Riggs <simon@2ndQuadrant.com>

Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)

committer Simon Riggs <simon@2ndQuadrant.com>

Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
author Simon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
committer Simon Riggs <simon@2ndQuadrant.com>
Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c

index 506b208c9cfa117b91983d481eb83b0393a79107..69012985161dcadd15cd6bd4d730cfc6e2fc9dd1 100644 (file)
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -18,6 +18,7 @@
  #include "access/xlog_internal.h"
  #include "catalog/pg_control.h"
  #include "utils/guc.h"
+#include "utils/timestamp.h"
  
  /*
   * GUC support
@@ -119,6 +120,15 @@ xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
         memcpy(&fpw, rec, sizeof(bool));
         appendStringInfo(buf, "full_page_writes: %s", fpw ? "true" : "false");
     }
+   else if (info == XLOG_END_OF_RECOVERY)
+   {
+       xl_end_of_recovery xlrec;
+
+       memcpy(&xlrec, rec, sizeof(xl_end_of_recovery));
+       appendStringInfo(buf, "end_of_recovery: tli %u; time %s",
+                        xlrec.ThisTimeLineID,
+                        timestamptz_to_str(xlrec.end_time));
+   }
     else
         appendStringInfo(buf, "UNKNOWN");
  }
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c

index cf2f6e70cff9e5fa0bf608183c22746d360e45ad..bcd379dca73253b8c786de1b1fc0f0fcf44a2d18 100644 (file)
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -66,6 +66,7 @@
  #define RECOVERY_COMMAND_FILE  "recovery.conf"
  #define RECOVERY_COMMAND_DONE  "recovery.done"
  #define PROMOTE_SIGNAL_FILE "promote"
+#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
  
  
  /* User-settable parameters */
@@ -210,6 +211,9 @@ bool StandbyMode = false;
  static char *PrimaryConnInfo = NULL;
  static char *TriggerFile = NULL;
  
+/* whether request for fast promotion has been made yet */
+static bool fast_promote = false;
+
  /* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
  static TransactionId recoveryStopXid;
  static TimestampTz recoveryStopTime;
@@ -611,6 +615,7 @@ static void CheckRequiredParameterValues(void);
  static void XLogReportParameters(void);
  static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
  static void LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
  static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
  static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
  
@@ -642,7 +647,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
            int emode, bool fetching_ckpt);
  static void CheckRecoveryConsistency(void);
  static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
-                    XLogRecPtr RecPtr, int whichChkpt);
+                    XLogRecPtr RecPtr, int whichChkpti, bool report);
  static bool rescanLatestTimeLine(void);
  static void WriteControlFile(void);
  static void ReadControlFile(void);
@@ -4848,7 +4853,7 @@ StartupXLOG(void)
          * When a backup_label file is present, we want to roll forward from
          * the checkpoint it identifies, rather than using pg_control.
          */
-       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
+       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
         if (record != NULL)
         {
             memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
@@ -4890,7 +4895,7 @@ StartupXLOG(void)
          */
         checkPointLoc = ControlFile->checkPoint;
         RedoStartLSN = ControlFile->checkPointCopy.redo;
-       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
+       record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
         if (record != NULL)
         {
             ereport(DEBUG1,
@@ -4909,7 +4914,7 @@ StartupXLOG(void)
         else
         {
             checkPointLoc = ControlFile->prevCheckPoint;
-           record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
+           record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
             if (record != NULL)
             {
                 ereport(LOG,
@@ -5393,22 +5398,33 @@ StartupXLOG(void)
                 }
  
                 /*
-                * Before replaying this record, check if it is a shutdown
-                * checkpoint record that causes the current timeline to
-                * change. The checkpoint record is already considered to be
-                * part of the new timeline, so we update ThisTimeLineID
-                * before replaying it. That's important so that replayEndTLI,
-                * which is recorded as the minimum recovery point's TLI if
+                * Before replaying this record, check if this record
+                * causes the current timeline to change. The record is
+                * already considered to be part of the new timeline,
+                * so we update ThisTimeLineID before replaying it.
+                * That's important so that replayEndTLI, which is
+                * recorded as the minimum recovery point's TLI if
                  * recovery stops after this record, is set correctly.
                  */
-               if (record->xl_rmid == RM_XLOG_ID &&
-                   (record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
+               if (record->xl_rmid == RM_XLOG_ID)
                 {
-                   CheckPoint  checkPoint;
-                   TimeLineID  newTLI;
+                   TimeLineID  newTLI = ThisTimeLineID;
+                   uint8       info = record->xl_info & ~XLR_INFO_MASK;
+
+                   if (info == XLOG_CHECKPOINT_SHUTDOWN)
+                   {
+                       CheckPoint  checkPoint;
+
+                       memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+                       newTLI = checkPoint.ThisTimeLineID;
+                   }
+                   else if (info == XLOG_END_OF_RECOVERY)
+                   {
+                       xl_end_of_recovery  xlrec;
  
-                   memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-                   newTLI = checkPoint.ThisTimeLineID;
+                       memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+                       newTLI = xlrec.ThisTimeLineID;
+                   }
  
                     if (newTLI != ThisTimeLineID)
                     {
@@ -5729,9 +5745,36 @@ StartupXLOG(void)
          * allows some extra error checking in xlog_redo.
          */
         if (bgwriterLaunched)
-           RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-                             CHECKPOINT_IMMEDIATE |
-                             CHECKPOINT_WAIT);
+       {
+           bool    checkpoint_wait = true;
+
+           /*
+            * If we've been explicitly promoted with the fast option,
+            * end recovery without a checkpoint if possible.
+            */
+           if (fast_promote)
+           {
+               checkPointLoc = ControlFile->prevCheckPoint;
+               record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
+               if (record != NULL)
+               {
+                   checkpoint_wait = false;
+                   CreateEndOfRecoveryRecord();
+               }
+           }
+
+           /*
+            * In most cases we will wait for a full checkpoint to complete.
+            *
+            * If not, issue a normal, non-immediate checkpoint but don't wait.
+            */
+           if (checkpoint_wait)
+               RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+                                   CHECKPOINT_IMMEDIATE |
+                                   CHECKPOINT_WAIT);
+           else
+               RequestCheckpoint(0);   /* No flags */
+       }
         else
             CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
  
@@ -6060,12 +6103,15 @@ LocalSetXLogInsertAllowed(void)
   */
  static XLogRecord *
  ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
-                    int whichChkpt)
+                    int whichChkpt, bool report)
  {
     XLogRecord *record;
  
     if (!XRecOffIsValid(RecPtr))
     {
+       if (!report)
+           return NULL;
+
         switch (whichChkpt)
         {
             case 1:
@@ -6088,6 +6134,9 @@ ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
  
     if (record == NULL)
     {
+       if (!report)
+           return NULL;
+
         switch (whichChkpt)
         {
             case 1:
@@ -6882,6 +6931,44 @@ CreateCheckPoint(int flags)
     LWLockRelease(CheckpointLock);
  }
  
+/*
+ * Insert an XLOG_END_OF_RECOVERY record to mark the end of recovery in WAL
+ * without running a full checkpoint.  A restartpoint is likely to be in
+ * progress as we do this, and we are unwilling to wait for it to complete,
+ * so be careful to avoid taking the CheckpointLock anywhere here.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes so there is no concern of concurrent behaviour.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+   xl_end_of_recovery  xlrec;
+   XLogRecData         rdata;
+
+   /* sanity check */
+   if (!RecoveryInProgress())
+       elog(ERROR, "can only be used to end recovery");
+
+   /* end_time is a TimestampTz, which is not interchangeable with time_t */
+   xlrec.end_time = GetCurrentTimestamp();
+   xlrec.ThisTimeLineID = ThisTimeLineID;
+
+   LocalSetXLogInsertAllowed();
+   START_CRIT_SECTION();
+
+   rdata.data = (char *) &xlrec;
+   rdata.len = sizeof(xl_end_of_recovery);
+   rdata.buffer = InvalidBuffer;
+   rdata.next = NULL;
+
+   (void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
+
+   END_CRIT_SECTION();
+
+   LocalXLogInsertAllowed = -1;        /* return to "check" state */
+}
+
  /*
   * Flush all data in shared memory to disk, and fsync
   *
@@ -7613,6 +7700,27 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
  
         RecoveryRestartPoint(&checkPoint);
     }
+   else if (info == XLOG_END_OF_RECOVERY)
+   {
+       xl_end_of_recovery xlrec;
+
+       memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+       /*
+        * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+        * but this case is rarer and harder to test, so the benefit doesn't
+        * outweigh the potential extra cost of maintenance.
+        */
+
+       /*
+        * We should've already switched to the new TLI before replaying this
+        * record.
+        */
+       if (xlrec.ThisTimeLineID != ThisTimeLineID)
+           ereport(PANIC,
+                   (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
+                           xlrec.ThisTimeLineID, ThisTimeLineID)));
+   }
     else if (info == XLOG_NOOP)
     {
         /* nothing to do here */
@@ -9405,8 +9513,39 @@ CheckForStandbyTrigger(void)
  
     if (IsPromoteTriggered())
     {
-       ereport(LOG,
+       /*
+        * In 9.1 and 9.2 the postmaster unlinked the promote file
+        * inside the signal handler. We now leave the file in place
+        * and let the Startup process do the unlink. This allows
+        * Startup to know whether we're doing fast or normal
+        * promotion. Fast promotion takes precedence.
+        */
+       if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+       {
+           unlink(FAST_PROMOTE_SIGNAL_FILE);
+           unlink(PROMOTE_SIGNAL_FILE);
+           fast_promote = true;
+       }
+       else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+       {
+           unlink(PROMOTE_SIGNAL_FILE);
+           fast_promote = false;
+       }
+
+       /*
+        * We only look for fast promote via the pg_ctl promote option.
+        * It would be possible to extend trigger file support for the
+        * fast promotion option but that wouldn't be backwards compatible
+        * anyway and we're looking to focus further work on the promote
+        * option as the right way to signal end of recovery.
+        */
+       if (fast_promote)
+           ereport(LOG,
+               (errmsg("received fast promote request")));
+       else
+           ereport(LOG,
                 (errmsg("received promote request")));
+
         ResetPromoteTriggered();
         triggered = true;
         return true;
@@ -9435,15 +9574,10 @@ CheckPromoteSignal(void)
  {
     struct stat stat_buf;
  
-   if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-   {
-       /*
-        * Since we are in a signal handler, it's not safe to elog. We
-        * silently ignore any error from unlink.
-        */
-       unlink(PROMOTE_SIGNAL_FILE);
+   if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
+       stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
         return true;
-   }
+
     return false;
  }
  
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c

index e412d71dcff8c556653569a973469a6fc48cec9e..e086b1244cc1f3552bc9c123c7aed17982246b3b 100644 (file)
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -1136,6 +1136,15 @@ do_promote(void)
         exit(1);
     }
  
+   /*
+    * Use two different kinds of promotion file so we can understand
+    * the difference between smart and fast promotion.
+    */
+   if (shutdown_mode >= FAST_MODE)
+       snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
+   else
+       snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
+
     if ((prmfile = fopen(promote_file, "w")) == NULL)
     {
         write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
@@ -1799,7 +1808,7 @@ do_help(void)
              "                 [-o \"OPTIONS\"]\n"), progname);
     printf(_("  %s reload  [-D DATADIR] [-s]\n"), progname);
     printf(_("  %s status  [-D DATADIR]\n"), progname);
-   printf(_("  %s promote [-D DATADIR] [-s]\n"), progname);
+   printf(_("  %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
     printf(_("  %s kill    SIGNALNAME PID\n"), progname);
  #if defined(WIN32) || defined(__CYGWIN__)
     printf(_("  %s register   [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
@@ -1828,7 +1837,7 @@ do_help(void)
     printf(_("  -o OPTIONS             command line options to pass to postgres\n"
      "                         (PostgreSQL server executable) or initdb\n"));
     printf(_("  -p PATH-TO-POSTGRES    normally not necessary\n"));
-   printf(_("\nOptions for stop or restart:\n"));
+   printf(_("\nOptions for stop, restart or promote:\n"));
     printf(_("  -m, --mode=MODE        MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
  
     printf(_("\nShutdown modes are:\n"));
@@ -1836,6 +1845,10 @@ do_help(void)
     printf(_("  fast        quit directly, with proper shutdown\n"));
     printf(_("  immediate   quit without complete shutdown; will lead to recovery on restart\n"));
  
+   printf(_("\nPromotion modes are:\n"));
+   printf(_("  smart       promote after performing a checkpoint\n"));
+   printf(_("  fast        promote quickly without waiting for checkpoint completion\n"));
+
     printf(_("\nAllowed signal names for kill:\n"));
     printf("  ABRT HUP INT QUIT TERM USR1 USR2\n");
  
@@ -2271,7 +2284,6 @@ main(int argc, char **argv)
         snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
         snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
         snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
-       snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
     }
  
     switch (ctl_command)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h

index 43e1e60f9bf82aa971938b4dd9bc3946885fe974..ce9957e618f7768352ad2e66416be3d33041a49e 100644 (file)
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -217,6 +217,12 @@ typedef struct xl_restore_point
     char        rp_name[MAXFNAMELEN];
  } xl_restore_point;
  
+/* End of recovery mark, when we don't do an END_OF_RECOVERY checkpoint */
+typedef struct xl_end_of_recovery
+{
+   TimestampTz end_time;       /* time at which the record was created */
+   TimeLineID  ThisTimeLineID; /* timeline ID in effect at that point */
+} xl_end_of_recovery;
  
  /*
   * XLogRecord is defined in xlog.h, but we avoid #including that to keep
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h

index e4a9abe7bc55d21b83a08143c0b5caf3e828b7f5..ec8cea7c86e749c3bde19063664a64bdd56f77ed 100644 (file)
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -64,6 +64,7 @@ typedef struct CheckPoint
  #define XLOG_PARAMETER_CHANGE          0x60
  #define XLOG_RESTORE_POINT             0x70
  #define XLOG_FPW_CHANGE                0x80
+#define XLOG_END_OF_RECOVERY           0x90
  
  
  /*
author	Simon Riggs <simon@2ndQuadrant.com>
	Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
committer	Simon Riggs <simon@2ndQuadrant.com>
	Tue, 29 Jan 2013 00:06:15 +0000 (00:06 +0000)
src/backend/access/rmgrdesc/xlogdesc.c		patch \| blob \| blame \| history
src/backend/access/transam/xlog.c		patch \| blob \| blame \| history
src/bin/pg_ctl/pg_ctl.c		patch \| blob \| blame \| history
src/include/access/xlog_internal.h		patch \| blob \| blame \| history
src/include/catalog/pg_control.h		patch \| blob \| blame \| history