#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
#define PROMOTE_SIGNAL_FILE "promote"
+#define FAST_PROMOTE_SIGNAL_FILE "fast_promote"
/* User-settable parameters */
static char *PrimaryConnInfo = NULL;
static char *TriggerFile = NULL;
+/* whether request for fast promotion has been made yet */
+static bool fast_promote = false;
+
/* if recoveryStopsHere returns true, it saves actual stop xid/time/name here */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI);
static void LocalSetXLogInsertAllowed(void);
+static void CreateEndOfRecoveryRecord(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
+/* "report" = false makes ReadCheckpointRecord return NULL early on an invalid or unreadable record */
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
- XLogRecPtr RecPtr, int whichChkpt);
+ XLogRecPtr RecPtr, int whichChkpt, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
+ record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
if (record != NULL)
{
memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
*/
checkPointLoc = ControlFile->checkPoint;
RedoStartLSN = ControlFile->checkPointCopy.redo;
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
+ record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
if (record != NULL)
{
ereport(DEBUG1,
else
{
checkPointLoc = ControlFile->prevCheckPoint;
- record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
+ record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
if (record != NULL)
{
ereport(LOG,
}
/*
- * Before replaying this record, check if it is a shutdown
- * checkpoint record that causes the current timeline to
- * change. The checkpoint record is already considered to be
- * part of the new timeline, so we update ThisTimeLineID
- * before replaying it. That's important so that replayEndTLI,
- * which is recorded as the minimum recovery point's TLI if
+ * Before replaying this record, check if this record
+ * causes the current timeline to change. The record is
+ * already considered to be part of the new timeline,
+ * so we update ThisTimeLineID before replaying it.
+ * That's important so that replayEndTLI, which is
+ * recorded as the minimum recovery point's TLI if
* recovery stops after this record, is set correctly.
*/
- if (record->xl_rmid == RM_XLOG_ID &&
- (record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN)
+ if (record->xl_rmid == RM_XLOG_ID)
{
- CheckPoint checkPoint;
- TimeLineID newTLI;
+ TimeLineID newTLI = ThisTimeLineID;
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_CHECKPOINT_SHUTDOWN)
+ {
+ CheckPoint checkPoint;
+
+ memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+ newTLI = checkPoint.ThisTimeLineID;
+ }
+ else if (info == XLOG_END_OF_RECOVERY)
+ {
+ xl_end_of_recovery xlrec;
- memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
- newTLI = checkPoint.ThisTimeLineID;
+ memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+ newTLI = xlrec.ThisTimeLineID;
+ }
if (newTLI != ThisTimeLineID)
{
* allows some extra error checking in xlog_redo.
*/
if (bgwriterLaunched)
- RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
- CHECKPOINT_IMMEDIATE |
- CHECKPOINT_WAIT);
+ {
+ bool checkpoint_wait = true;
+
+ /*
+ * If we've been explicitly promoted with the fast option,
+ * end recovery without a checkpoint if possible.
+ */
+ if (fast_promote)
+ {
+ checkPointLoc = ControlFile->prevCheckPoint;
+ record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, false);
+ if (record != NULL)
+ {
+ checkpoint_wait = false;
+ CreateEndOfRecoveryRecord();
+ }
+ }
+
+ /*
+ * In most cases we will wait for a full checkpoint to complete.
+ *
+ * If not, issue a normal, non-immediate checkpoint but don't wait.
+ */
+ if (checkpoint_wait)
+ RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+ CHECKPOINT_IMMEDIATE |
+ CHECKPOINT_WAIT);
+ else
+ RequestCheckpoint(0); /* No flags */
+ }
else
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
*/
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
- int whichChkpt)
+ int whichChkpt, bool report)
{
XLogRecord *record;
if (!XRecOffIsValid(RecPtr))
{
+ if (!report)
+ return NULL;
+
switch (whichChkpt)
{
case 1:
if (record == NULL)
{
+ if (!report)
+ return NULL;
+
switch (whichChkpt)
{
case 1:
LWLockRelease(CheckpointLock);
}
+/*
+ * CreateEndOfRecoveryRecord
+ *
+ * Mark the end of recovery in WAL by inserting an XLOG_END_OF_RECOVERY
+ * record, without running a full checkpoint.  The record carries the
+ * current time and ThisTimeLineID.
+ *
+ * We can expect that a restartpoint is likely to be in progress as we
+ * do this, though we are unwilling to wait for it to complete.  So be
+ * careful to avoid taking the CheckpointLock anywhere here.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes, so there is no concern about concurrent
+ * behaviour.
+ *
+ * Raises ERROR if called when recovery is not in progress.
+ *
+ * NOTE(review): end_time is filled from time(NULL); confirm this matches
+ * the declared type and units of xl_end_of_recovery.end_time.
+ *
+ * NOTE(review): the inserted record is not explicitly flushed here, and
+ * the follow-up checkpoint request does not wait; confirm the caller does
+ * not rely on the record being durable on return.
+ */
+static void
+CreateEndOfRecoveryRecord(void)
+{
+	xl_end_of_recovery xlrec;
+	XLogRecData rdata;
+
+	/* sanity check */
+	if (!RecoveryInProgress())
+		elog(ERROR, "can only be used to end recovery");
+
+	xlrec.end_time = time(NULL);
+	xlrec.ThisTimeLineID = ThisTimeLineID;
+
+	/* Enable WAL insertion for this process before inserting the record. */
+	LocalSetXLogInsertAllowed();
+
+	START_CRIT_SECTION();
+
+	/* Single-element rdata chain holding just the record payload. */
+	rdata.data = (char *) &xlrec;
+	rdata.len = sizeof(xl_end_of_recovery);
+	rdata.buffer = InvalidBuffer;
+	rdata.next = NULL;
+
+	(void) XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY, &rdata);
+
+	END_CRIT_SECTION();
+
+	LocalXLogInsertAllowed = -1;	/* return to "check" state */
+}
+
+
/*
* Flush all data in shared memory to disk, and fsync
*
RecoveryRestartPoint(&checkPoint);
}
+	else if (info == XLOG_END_OF_RECOVERY)
+	{
+		/* Replay of an end-of-recovery record: only validates the timeline. */
+		xl_end_of_recovery xlrec;
+
+		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
+
+		/*
+		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+		 * but this case is rarer and harder to test, so the benefit doesn't
+		 * outweigh the potential extra cost of maintenance.
+		 */
+
+		/*
+		 * We should've already switched to the new TLI before replaying this
+		 * record.  The message names the end-of-recovery record explicitly so
+		 * a mismatch here is not mistaken for a checkpoint-record mismatch.
+		 */
+		if (xlrec.ThisTimeLineID != ThisTimeLineID)
+			ereport(PANIC,
+					(errmsg("unexpected timeline ID %u (should be %u) in end-of-recovery record",
+							xlrec.ThisTimeLineID, ThisTimeLineID)));
+	}
else if (info == XLOG_NOOP)
{
/* nothing to do here */
if (IsPromoteTriggered())
{
- ereport(LOG,
+ /*
+ * In 9.1 and 9.2 the postmaster unlinked the promote file
+ * inside the signal handler. We now leave the file in place
+ * and let the Startup process do the unlink. This allows
+ * Startup to know whether we're doing fast or normal
+ * promotion. Fast promotion takes precedence.
+ */
+ if (stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ unlink(FAST_PROMOTE_SIGNAL_FILE);
+ unlink(PROMOTE_SIGNAL_FILE);
+ fast_promote = true;
+ }
+ else if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+ {
+ unlink(PROMOTE_SIGNAL_FILE);
+ fast_promote = false;
+ }
+
+ /*
+ * We only look for fast promote via the pg_ctl promote option.
+ * It would be possible to extend trigger file support for the
+ * fast promotion option but that wouldn't be backwards compatible
+ * anyway and we're looking to focus further work on the promote
+ * option as the right way to signal end of recovery.
+ */
+ if (fast_promote)
+ ereport(LOG,
+ (errmsg("received fast promote request")));
+ else
+ ereport(LOG,
(errmsg("received promote request")));
+
ResetPromoteTriggered();
triggered = true;
return true;
{
struct stat stat_buf;
- if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
- {
- /*
- * Since we are in a signal handler, it's not safe to elog. We
- * silently ignore any error from unlink.
- */
- unlink(PROMOTE_SIGNAL_FILE);
+ if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
+ stat(FAST_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
return true;
- }
+
return false;
}
exit(1);
}
+ /*
+ * Use two different kinds of promotion file so we can understand
+ * the difference between smart and fast promotion.
+ */
+ if (shutdown_mode >= FAST_MODE)
+ snprintf(promote_file, MAXPGPATH, "%s/fast_promote", pg_data);
+ else
+ snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
+
if ((prmfile = fopen(promote_file, "w")) == NULL)
{
write_stderr(_("%s: could not create promote signal file \"%s\": %s\n"),
" [-o \"OPTIONS\"]\n"), progname);
printf(_(" %s reload [-D DATADIR] [-s]\n"), progname);
printf(_(" %s status [-D DATADIR]\n"), progname);
- printf(_(" %s promote [-D DATADIR] [-s]\n"), progname);
+ printf(_(" %s promote [-D DATADIR] [-s] [-m PROMOTION-MODE]\n"), progname);
printf(_(" %s kill SIGNALNAME PID\n"), progname);
#if defined(WIN32) || defined(__CYGWIN__)
printf(_(" %s register [-N SERVICENAME] [-U USERNAME] [-P PASSWORD] [-D DATADIR]\n"
printf(_(" -o OPTIONS command line options to pass to postgres\n"
" (PostgreSQL server executable) or initdb\n"));
printf(_(" -p PATH-TO-POSTGRES normally not necessary\n"));
- printf(_("\nOptions for stop or restart:\n"));
+ printf(_("\nOptions for stop, restart or promote:\n"));
printf(_(" -m, --mode=MODE MODE can be \"smart\", \"fast\", or \"immediate\"\n"));
printf(_("\nShutdown modes are:\n"));
printf(_(" fast quit directly, with proper shutdown\n"));
printf(_(" immediate quit without complete shutdown; will lead to recovery on restart\n"));
+ printf(_("\nPromotion modes are:\n"));
+ printf(_(" smart promote after performing a checkpoint\n"));
+ printf(_(" fast promote quickly without waiting for checkpoint completion\n"));
+
printf(_("\nAllowed signal names for kill:\n"));
printf(" ABRT HUP INT QUIT TERM USR1 USR2\n");
snprintf(pid_file, MAXPGPATH, "%s/postmaster.pid", pg_data);
snprintf(backup_file, MAXPGPATH, "%s/backup_label", pg_data);
snprintf(recovery_file, MAXPGPATH, "%s/recovery.conf", pg_data);
- snprintf(promote_file, MAXPGPATH, "%s/promote", pg_data);
}
switch (ctl_command)