summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Heikki Linnakangas 2009-01-23 12:31:41 +0000
committer Heikki Linnakangas 2009-01-23 12:31:41 +0000
commit f4fd26775b8b6e7090774bb3d794b8529771fce5 (patch)
tree 49c733474d89bae67b234791cff20cbf25dc6dc9
parent 29f760c92ccd341410269ec2cdb423d4eade51b5 (diff)
Import Simon's recovery infrastructure patch v9recoveryinfrav9
-rw-r--r-- src/backend/access/transam/clog.c 3
-rw-r--r-- src/backend/access/transam/multixact.c 14
-rw-r--r-- src/backend/access/transam/subtrans.c 3
-rw-r--r-- src/backend/access/transam/xact.c 3
-rw-r--r-- src/backend/access/transam/xlog.c 723
-rw-r--r-- src/backend/postmaster/bgwriter.c 420
-rw-r--r-- src/backend/postmaster/postmaster.c 62
-rw-r--r-- src/backend/storage/buffer/README 9
-rw-r--r-- src/bin/pg_controldata/pg_controldata.c 3
-rw-r--r-- src/bin/pg_resetxlog/pg_resetxlog.c 2
-rw-r--r-- src/include/access/xlog.h 14
-rw-r--r-- src/include/access/xlog_internal.h 4
-rw-r--r-- src/include/catalog/pg_control.h 5
-rw-r--r-- src/include/postmaster/bgwriter.h 6
-rw-r--r-- src/include/storage/pmsignal.h 1
15 files changed, 959 insertions, 313 deletions
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 528a219db4..5bd72154c5 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -475,6 +475,9 @@ ZeroCLOGPage(int pageno, bool writeXlog)
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * We access just a single clog page, so this action is atomic and safe
+ * for use if other processes are active during recovery.
*/
void
StartupCLOG(void)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 7314341101..881a588d69 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1413,8 +1413,11 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we
* may already have replayed WAL data into the SLRU files.
*
- * We don't need any locks here, really; the SLRU locks are taken
- * only because slru.c expects to be called with locks held.
+ * We want this operation to be atomic to ensure that other processes can
+ * use MultiXact while we complete recovery. We access one page only from the
+ * offset and members buffers, so once locks are acquired they will not be
+ * dropped and re-acquired by SLRU code. So we take both locks at start, then
+ * hold them all the way to the end.
*/
void
StartupMultiXact(void)
@@ -1426,6 +1429,7 @@ StartupMultiXact(void)
/* Clean up offsets state */
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+ LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
* Initialize our idea of the latest page number.
@@ -1452,10 +1456,7 @@ StartupMultiXact(void)
MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
}
- LWLockRelease(MultiXactOffsetControlLock);
-
/* And the same for members */
- LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
/*
* Initialize our idea of the latest page number.
@@ -1483,6 +1484,7 @@ StartupMultiXact(void)
}
LWLockRelease(MultiXactMemberControlLock);
+ LWLockRelease(MultiXactOffsetControlLock);
/*
* Initialize lastTruncationPoint to invalid, ensuring that the first
@@ -1543,7 +1545,7 @@ CheckPointMultiXact(void)
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
*/
- if (!InRecovery)
+ if (!IsRecoveryProcessingMode())
TruncateMultiXact();
TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 0dbd2166be..eaad23182a 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -226,6 +226,9 @@ ZeroSUBTRANSPage(int pageno)
*
* oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
* if there are none.
+ *
+ * Note that this is not atomic and is not yet safe to perform while other
+ * processes might access subtrans.
*/
void
StartupSUBTRANS(TransactionId oldestActiveXID)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index c94e2a2251..d0ed3c0318 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -394,6 +394,9 @@ AssignTransactionId(TransactionState s)
bool isSubXact = (s->parent != NULL);
ResourceOwner currentOwner;
+ if (IsRecoveryProcessingMode())
+ elog(FATAL, "cannot assign TransactionIds during recovery");
+
/* Assert that caller didn't screw up */
Assert(!TransactionIdIsValid(s->transactionId));
Assert(s->state == TRANS_INPROGRESS);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bd6035d4a6..7e480e2fb2 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -115,7 +115,8 @@ CheckpointStatsData CheckpointStats;
/*
* ThisTimeLineID will be same in all backends --- it identifies current
- * WAL timeline for the database system.
+ * WAL timeline for the database system. Zero is always a bug, so we
+ * start with that to allow us to spot any errors.
*/
TimeLineID ThisTimeLineID = 0;
@@ -125,6 +126,10 @@ bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
static bool InArchiveRecovery = false;
+/* Local copy of shared RecoveryProcessingMode state */
+static bool LocalRecoveryProcessingMode = true;
+static bool knownProcessingMode = false;
+
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
@@ -143,6 +148,9 @@ static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static bool recoveryStopAfter;
+/* is the database proven consistent yet? */
+bool reachedSafeStartPoint = false;
+
/*
* During normal operation, the only timeline we care about is ThisTimeLineID.
* During recovery, however, things are more complicated. To simplify life
@@ -242,10 +250,30 @@ static XLogRecPtr RedoRecPtr;
* ControlFileLock: must be held to read/update control file or create
* new log file.
*
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
+ * we get just one of those at any time. In 8.4+ recovery, both startup and
+ * bgwriter processes may take restartpoints, so this locking must be strict
+ * to ensure there are no mistakes.
+ *
+ * In 8.4 we progress through a number of states at startup. Initially, the
+ * postmaster is in PM_STARTUP state and spawns the Startup process. We then
+ * progress until the database is in a consistent state, then if we are in
+ * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts
+ * up and takes over responsibility for performing restartpoints. We then
+ * progress until the end of recovery when we enter PM_RUN state upon
+ * termination of the Startup process. In summary:
+ *
+ * PM_STARTUP state: Startup process performs restartpoints
+ * PM_RECOVERY state: bgwriter process performs restartpoints
+ * PM_RUN state: bgwriter process performs checkpoints
*
+ * These transitions are fairly delicate, with many things that need to
+ * happen at the same time in order to change state successfully throughout
+ * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
+ * prove the databases are in a consistent state. Changing from PM_RECOVERY
+ * to PM_RUN happens whenever recovery ends, which could be forced upon us
+ * externally or it can occur because of damage or termination of the WAL
+ * sequence.
*----------
*/
@@ -287,11 +315,18 @@ typedef struct XLogCtlWrite
/*
* Total shared-memory state for XLOG.
+ *
+ * This small structure is accessed by many backends, so we take care to
+ * pad out the parts of the structure so they can be accessed by separate
+ * CPUs without causing false sharing cache flushes. Padding is generous
+ * to allow for a wide variety of CPU architectures.
*/
+#define XLOGCTL_BUFFER_SPACING 128
typedef struct XLogCtlData
{
/* Protected by WALInsertLock: */
XLogCtlInsert Insert;
+ char InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
/* Protected by info_lck: */
XLogwrtRqst LogwrtRqst;
@@ -299,9 +334,16 @@ typedef struct XLogCtlData
uint32 ckptXidEpoch; /* nextXID & epoch of latest checkpoint */
TransactionId ckptXid;
XLogRecPtr asyncCommitLSN; /* LSN of newest async commit */
+ /* add data structure padding for above info_lck declarations */
+ char InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst)
+ - sizeof(XLogwrtResult)
+ - sizeof(uint32)
+ - sizeof(TransactionId)
+ - sizeof(XLogRecPtr)];
/* Protected by WALWriteLock: */
XLogCtlWrite Write;
+ char WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
/*
* These values do not change after startup, although the pointed-to pages
@@ -313,6 +355,24 @@ typedef struct XLogCtlData
int XLogCacheBlck; /* highest allocated xlog buffer index */
TimeLineID ThisTimeLineID;
+ /*
+ * IsRecoveryProcessingMode shows whether the postmaster is in a
+ * postmaster state earlier than PM_RUN, or not. This is a globally
+ * accessible state to allow EXEC_BACKEND case.
+ *
+ * We also retain a local state variable InRecovery. InRecovery=true
+ * means the code is being executed by Startup process and therefore
+ * always during Recovery Processing Mode. This allows us to identify
+ * code executed *during* Recovery Processing Mode but not necessarily
+ * by Startup process itself.
+ *
+ * Protected by mode_lck
+ */
+ bool SharedRecoveryProcessingMode;
+ slock_t mode_lck;
+
+ char InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+
slock_t info_lck; /* locks shared variables shown above */
} XLogCtlData;
@@ -399,8 +459,10 @@ static void XLogArchiveCleanup(const char *xlog);
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI,
uint32 endLogId, uint32 endLogSeg);
+static void exitRecovery(void);
static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+static XLogRecPtr GetRedoLocationForCheckpoint(void);
static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
XLogRecPtr *lsn, BkpBlock *bkpb);
@@ -483,6 +545,11 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ bool isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+
+ /* cross-check on whether we should be here or not */
+ if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ elog(FATAL, "cannot make new WAL entries during recovery");
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
@@ -1729,8 +1796,7 @@ XLogFlush(XLogRecPtr record)
XLogRecPtr WriteRqstPtr;
XLogwrtRqst WriteRqst;
- /* Disabled during REDO */
- if (InRedo)
+ if (IsRecoveryProcessingMode())
return;
/* Quick exit if already known flushed */
@@ -1818,9 +1884,9 @@ XLogFlush(XLogRecPtr record)
* the bad page is encountered again during recovery then we would be
* unable to restart the database at all! (This scenario has actually
* happened in the field several times with 7.1 releases. Note that we
- * cannot get here while InRedo is true, but if the bad page is brought in
- * and marked dirty during recovery then CreateCheckPoint will try to
- * flush it at the end of recovery.)
+ * cannot get here while IsRecoveryProcessingMode() is true, but if the bad
+ * page is brought in and marked dirty during recovery, then any checkpoint
+ * performed at the end of recovery will try to flush it.)
*
* The current approach is to ERROR under normal conditions, but only
* WARNING during recovery, so that the system can be brought up even if
@@ -1830,7 +1896,7 @@ XLogFlush(XLogRecPtr record)
* and so we will not force a restart for a bad LSN on a data page.
*/
if (XLByteLT(LogwrtResult.Flush, record))
- elog(InRecovery ? WARNING : ERROR,
+ elog(ERROR,
"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2103,7 +2169,8 @@ XLogFileInit(uint32 log, uint32 seg,
unlink(tmppath);
}
- elog(DEBUG2, "done creating and filling new WAL file");
+ XLogFileName(tmppath, ThisTimeLineID, log, seg);
+ elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
/* Set flag to tell caller there was no existent file */
*use_existent = false;
@@ -2409,6 +2476,28 @@ XLogFileRead(uint32 log, uint32 seg, int emode)
xlogfname);
set_ps_display(activitymsg, false);
+ /*
+ * Calculate and write out a new safeStartPoint. This defines
+ * the latest LSN that might appear on-disk while we apply
+ * the WAL records in this file. If we crash during recovery
+ * we must reach this point again before we can prove
+ * database consistency. Not a restartpoint! Restart points
+ * define where we should start recovery from, if we crash.
+ */
+ if (InArchiveRecovery)
+ {
+ uint32 nextLog = log;
+ uint32 nextSeg = seg;
+
+ NextLogSeg(nextLog, nextSeg);
+
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->minSafeStartPoint.xlogid = nextLog;
+ ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize;
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+ }
+
return fd;
}
if (errno != ENOENT) /* unexpected failure? */
@@ -4283,6 +4372,7 @@ XLOGShmemInit(void)
XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
SpinLockInit(&XLogCtl->info_lck);
+ SpinLockInit(&XLogCtl->mode_lck);
/*
* If we are not in bootstrap mode, pg_control should already exist. Read
@@ -4593,12 +4683,12 @@ readRecoveryCommandFile(void)
* does nothing if a recovery_target is not also set
*/
if (!parse_bool(tok2, &recoveryLogRestartpoints))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
ereport(LOG,
- (errmsg("log_restartpoints = %s", tok2)));
- }
+ (errmsg("log_restartpoints = %s", tok2)));
+ }
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
@@ -4733,15 +4823,13 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
unlink(recoveryPath); /* ignore any error */
/*
- * Rename the config file out of the way, so that we don't accidentally
- * re-enter archive recovery mode in a subsequent crash.
+ * As of 8.4 we no longer rename the recovery.conf file out of the
+ * way until after we have performed a full checkpoint. This ensures
+ * that any crash between now and the end of the checkpoint does not
+ * attempt to restart from a WAL file that is no longer available to us.
+ * As soon as we remove recovery.conf we lose our restore_command and
+ * cannot reaccess WAL files from the archive.
*/
- unlink(RECOVERY_COMMAND_DONE);
- if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
- ereport(FATAL,
- (errcode_for_file_access(),
- errmsg("could not rename file \"%s\" to \"%s\": %m",
- RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
ereport(LOG,
(errmsg("archive recovery complete")));
@@ -4876,6 +4964,7 @@ StartupXLOG(void)
CheckPoint checkPoint;
bool wasShutdown;
bool reachedStopPoint = false;
+ bool performedRecovery = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
LastRec,
@@ -4888,6 +4977,8 @@ StartupXLOG(void)
uint32 freespace;
TransactionId oldestActiveXID;
+ XLogCtl->SharedRecoveryProcessingMode = true;
+
/*
* Read control file and check XLOG status looks valid.
*
@@ -5108,9 +5199,15 @@ StartupXLOG(void)
if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
ControlFile->minRecoveryPoint = minRecoveryLoc;
ControlFile->time = (pg_time_t) time(NULL);
+ /* No need to hold ControlFileLock yet, we aren't up far enough */
UpdateControlFile();
/*
+ * Reset pgstat data, because it may be invalid after recovery.
+ */
+ pgstat_reset_all();
+
+ /*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
@@ -5217,6 +5314,32 @@ StartupXLOG(void)
LastRec = ReadRecPtr;
+ /*
+ * Have we reached our safe starting point? If so, we can
+ * signal Postmaster to enter consistent recovery mode.
+ *
+ * There are two points in the log we must pass. The first is
+ * the minRecoveryPoint, which is the LSN at the time the
+ * base backup was taken that we are about to roll forward from.
+ * If recovery has ever crashed or was stopped there is
+ * another point also: minSafeStartPoint, which is the
+ * latest LSN that recovery could have reached prior to crash.
+ */
+ if (!reachedSafeStartPoint &&
+ XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) &&
+ XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ {
+ reachedSafeStartPoint = true;
+ if (InArchiveRecovery)
+ {
+ ereport(LOG,
+ (errmsg("consistent recovery state reached at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ if (IsUnderPostmaster)
+ SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ }
+ }
+
record = ReadRecord(NULL, LOG);
} while (record != NULL && recoveryContinue);
@@ -5238,6 +5361,7 @@ StartupXLOG(void)
/* there are no WAL records following the checkpoint */
ereport(LOG,
(errmsg("redo is not required")));
+ reachedSafeStartPoint = true;
}
}
@@ -5251,9 +5375,9 @@ StartupXLOG(void)
/*
* Complain if we did not roll forward far enough to render the backup
- * dump consistent.
+ * dump consistent and start safely.
*/
- if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+ if (InRecovery && !reachedSafeStartPoint)
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
@@ -5375,39 +5499,14 @@ StartupXLOG(void)
XLogCheckInvalidPages();
/*
- * Reset pgstat data, because it may be invalid after recovery.
+ * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
+ * a shutdown checkpoint here, but we ask bgwriter to do that now.
*/
- pgstat_reset_all();
+ exitRecovery();
- /*
- * Perform a checkpoint to update all our recovery activity to disk.
- *
- * Note that we write a shutdown checkpoint rather than an on-line
- * one. This is not particularly critical, but since we may be
- * assigning a new TLI, using a shutdown checkpoint allows us to have
- * the rule that TLI only changes in shutdown checkpoints, which
- * allows some extra error checking in xlog_redo.
- */
- CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+ performedRecovery = true;
}
- /*
- * Preallocate additional log files, if wanted.
- */
- PreallocXlogFiles(EndOfLog);
-
- /*
- * Okay, we're officially UP.
- */
- InRecovery = false;
-
- ControlFile->state = DB_IN_PRODUCTION;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
-
- /* start the archive_timeout timer running */
- XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
-
/* initialize shared-memory copy of latest checkpoint XID/epoch */
XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
@@ -5441,6 +5540,89 @@ StartupXLOG(void)
readRecordBuf = NULL;
readRecordBufSize = 0;
}
+
+ /*
+ * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery.
+ * This could add minutes to the startup time, so we want bgwriter
+ * to perform it. This then frees the Startup process to complete so we can
+ * allow transactions and WAL inserts. We still write a checkpoint, but
+ * it will be an online checkpoint. Online checkpoints have a redo
+ * location that can be prior to the actual checkpoint record. So we want
+ * to derive that redo location *before* we let anybody else write WAL,
+ * otherwise we might miss some WAL records if we crash.
+ */
+ if (performedRecovery)
+ {
+ XLogRecPtr redo;
+
+ /*
+ * We must grab the pointer before anybody writes WAL
+ */
+ redo = GetRedoLocationForCheckpoint();
+
+ /*
+ * Tell the bgwriter
+ */
+ SetRedoLocationForArchiveCheckpoint(redo);
+
+ /*
+ * Okay, we can come up now. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+
+ /*
+ * Now request checkpoint
+ */
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ }
+ else
+ {
+ /*
+ * No recovery, so let's just get on with it.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ ControlFile->state = DB_IN_PRODUCTION;
+ ControlFile->time = (pg_time_t) time(NULL);
+ UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Okay, we're officially UP.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+ }
+
+ /* start the archive_timeout timer running */
+ XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+
+}
+
+/*
+ * IsRecoveryProcessingMode()
+ *
+ * Fast test for whether we're still in recovery or not. We test the shared
+ * state each time only until we leave recovery mode. After that we never
+ * look again, relying upon the settings of our local state variables. This
+ * is designed to avoid the need for a separate initialisation step.
+ */
+bool
+IsRecoveryProcessingMode(void)
+{
+ if (knownProcessingMode && !LocalRecoveryProcessingMode)
+ return false;
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->mode_lck);
+ LocalRecoveryProcessingMode = XLogCtl->SharedRecoveryProcessingMode;
+ SpinLockRelease(&xlogctl->mode_lck);
+ }
+
+ knownProcessingMode = true;
+
+ return LocalRecoveryProcessingMode;
}
/*
@@ -5698,20 +5880,24 @@ ShutdownXLOG(int code, Datum arg)
static void
LogCheckpointStart(int flags)
{
- elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
- (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
- (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
- (flags & CHECKPOINT_FORCE) ? " force" : "",
- (flags & CHECKPOINT_WAIT) ? " wait" : "",
- (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
- (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
+ if (flags & CHECKPOINT_RESTARTPOINT)
+ elog(LOG, "restartpoint starting:%s",
+ (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
+ else
+ elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+ (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+ (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+ (flags & CHECKPOINT_FORCE) ? " force" : "",
+ (flags & CHECKPOINT_WAIT) ? " wait" : "",
+ (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
+ (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
}
/*
* Log end of a checkpoint.
*/
static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(int flags)
{
long write_secs,
sync_secs,
@@ -5734,17 +5920,26 @@ LogCheckpointEnd(void)
CheckpointStats.ckpt_sync_end_t,
&sync_secs, &sync_usecs);
- elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
- "%d transaction log file(s) added, %d removed, %d recycled; "
- "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
- CheckpointStats.ckpt_bufs_written,
- (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
- CheckpointStats.ckpt_segs_added,
- CheckpointStats.ckpt_segs_removed,
- CheckpointStats.ckpt_segs_recycled,
- write_secs, write_usecs / 1000,
- sync_secs, sync_usecs / 1000,
- total_secs, total_usecs / 1000);
+ if (flags & CHECKPOINT_RESTARTPOINT)
+ elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ CheckpointStats.ckpt_bufs_written,
+ (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+ write_secs, write_usecs / 1000,
+ sync_secs, sync_usecs / 1000,
+ total_secs, total_usecs / 1000);
+ else
+ elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+ "%d transaction log file(s) added, %d removed, %d recycled; "
+ "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+ CheckpointStats.ckpt_bufs_written,
+ (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+ CheckpointStats.ckpt_segs_added,
+ CheckpointStats.ckpt_segs_removed,
+ CheckpointStats.ckpt_segs_recycled,
+ write_secs, write_usecs / 1000,
+ sync_secs, sync_usecs / 1000,
+ total_secs, total_usecs / 1000);
}
/*
@@ -5769,17 +5964,16 @@ CreateCheckPoint(int flags)
XLogRecPtr recptr;
XLogCtlInsert *Insert = &XLogCtl->Insert;
XLogRecData rdata;
- uint32 freespace;
uint32 _logId;
uint32 _logSeg;
TransactionId *inCommitXids;
int nInCommit;
+ bool leavingArchiveRecovery = false;
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
- * (This is just pro forma, since in the present system structure there is
- * only one process that is allowed to issue checkpoints at any given
- * time.)
+ * That shouldn't be happening, but checkpoints are an important aspect
+ * of our resilience, so we take no chances.
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
@@ -5794,15 +5988,24 @@ CreateCheckPoint(int flags)
CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
/*
+ * Find out if this is the first checkpoint after archive recovery.
+ */
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
+ LWLockRelease(ControlFileLock);
+
+ /*
* Use a critical section to force system panic if we have trouble.
*/
START_CRIT_SECTION();
if (shutdown)
{
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->state = DB_SHUTDOWNING;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
}
/*
@@ -5817,91 +6020,69 @@ CreateCheckPoint(int flags)
checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.time = (pg_time_t) time(NULL);
- /*
- * We must hold WALInsertLock while examining insert state to determine
- * the checkpoint REDO pointer.
- */
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ if (leavingArchiveRecovery)
+ checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
+ else
+ {
+ /*
+ * We must hold WALInsertLock while examining insert state to determine
+ * the checkpoint REDO pointer.
+ */
+ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- /*
- * If this isn't a shutdown or forced checkpoint, and we have not inserted
- * any XLOG records since the start of the last checkpoint, skip the
- * checkpoint. The idea here is to avoid inserting duplicate checkpoints
- * when the system is idle. That wastes log space, and more importantly it
- * exposes us to possible loss of both current and previous checkpoint
- * records if the machine crashes just as we're writing the update.
- * (Perhaps it'd make even more sense to checkpoint only when the previous
- * checkpoint record is in a different xlog page?)
- *
- * We have to make two tests to determine that nothing has happened since
- * the start of the last checkpoint: current insertion point must match
- * the end of the last checkpoint record, and its redo pointer must point
- * to itself.
- */
- if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
- {
- XLogRecPtr curInsert;
-
- INSERT_RECPTR(curInsert, Insert, Insert->curridx);
- if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
- curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
- MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
- ControlFile->checkPoint.xlogid ==
- ControlFile->checkPointCopy.redo.xlogid &&
- ControlFile->checkPoint.xrecoff ==
- ControlFile->checkPointCopy.redo.xrecoff)
+ /*
+ * If this isn't a shutdown or forced checkpoint, and we have not inserted
+ * any XLOG records since the start of the last checkpoint, skip the
+ * checkpoint. The idea here is to avoid inserting duplicate checkpoints
+ * when the system is idle. That wastes log space, and more importantly it
+ * exposes us to possible loss of both current and previous checkpoint
+ * records if the machine crashes just as we're writing the update.
+ * (Perhaps it'd make even more sense to checkpoint only when the previous
+ * checkpoint record is in a different xlog page?)
+ *
+ * We have to make two tests to determine that nothing has happened since
+ * the start of the last checkpoint: current insertion point must match
+ * the end of the last checkpoint record, and its redo pointer must point
+ * to itself.
+ */
+ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
{
- LWLockRelease(WALInsertLock);
- LWLockRelease(CheckpointLock);
- END_CRIT_SECTION();
- return;
+ XLogRecPtr curInsert;
+
+ INSERT_RECPTR(curInsert, Insert, Insert->curridx);
+ if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
+ curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
+ MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
+ ControlFile->checkPoint.xlogid ==
+ ControlFile->checkPointCopy.redo.xlogid &&
+ ControlFile->checkPoint.xrecoff ==
+ ControlFile->checkPointCopy.redo.xrecoff)
+ {
+ LWLockRelease(WALInsertLock);
+ LWLockRelease(CheckpointLock);
+ END_CRIT_SECTION();
+ return;
+ }
}
- }
-
- /*
- * Compute new REDO record ptr = location of next XLOG record.
- *
- * NB: this is NOT necessarily where the checkpoint record itself will be,
- * since other backends may insert more XLOG records while we're off doing
- * the buffer flush work. Those XLOG records are logically after the
- * checkpoint, even though physically before it. Got that?
- */
- freespace = INSERT_FREESPACE(Insert);
- if (freespace < SizeOfXLogRecord)
- {
- (void) AdvanceXLInsertBuffer(false);
- /* OK to ignore update return flag, since we will do flush anyway */
- freespace = INSERT_FREESPACE(Insert);
- }
- INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
- /*
- * Here we update the shared RedoRecPtr for future XLogInsert calls; this
- * must be done while holding the insert lock AND the info_lck.
- *
- * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
- * pointing past where it really needs to point. This is okay; the only
- * consequence is that XLogInsert might back up whole buffers that it
- * didn't really need to. We can't postpone advancing RedoRecPtr because
- * XLogInserts that happen while we are dumping buffers must assume that
- * their buffer changes are not included in the checkpoint.
- */
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
+ /*
+ * Compute new REDO record ptr = location of next XLOG record.
+ *
+ * NB: this is NOT necessarily where the checkpoint record itself will be,
+ * since other backends may insert more XLOG records while we're off doing
+ * the buffer flush work. Those XLOG records are logically after the
+ * checkpoint, even though physically before it. Got that?
+ */
+ checkPoint.redo = GetRedoLocationForCheckpoint();
- SpinLockAcquire(&xlogctl->info_lck);
- RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
- SpinLockRelease(&xlogctl->info_lck);
+ /*
+ * Now we can release WAL insert lock, allowing other xacts to proceed
+ * while we are flushing disk buffers.
+ */
+ LWLockRelease(WALInsertLock);
}
/*
- * Now we can release WAL insert lock, allowing other xacts to proceed
- * while we are flushing disk buffers.
- */
- LWLockRelease(WALInsertLock);
-
- /*
* If enabled, log checkpoint start. We postpone this until now so as not
* to log anything if we decided to skip the checkpoint.
*/
@@ -6010,18 +6191,43 @@ CreateCheckPoint(int flags)
XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
/*
- * Update the control file.
+ * Update the control file. In 8.4, this routine becomes the primary
+ * point for recording changes of state in the control file at the
+ * end of recovery. Postmaster state already shows us being in
+ * normal running mode, but it is only after this point that a crash
+ * will no longer force us to perform recovery all over again. Note
+ * that this is executed by bgwriter after the Startup process has exited.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
+ else
+ ControlFile->state = DB_IN_PRODUCTION;
+
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+
LWLockRelease(ControlFileLock);
+ if (leavingArchiveRecovery)
+ {
+ /*
+ * Rename the config file out of the way, so that we don't accidentally
+ * re-enter archive recovery mode in a subsequent crash. Prior to
+ * 8.4 this step was performed at end of exitArchiveRecovery().
+ */
+ unlink(RECOVERY_COMMAND_DONE);
+ if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+ }
+
/* Update shared-memory copy of checkpoint XID/epoch */
{
/* use volatile pointer to prevent code rearrangement */
@@ -6068,12 +6274,11 @@ CreateCheckPoint(int flags)
* in subtrans.c). During recovery, though, we mustn't do this because
* StartupSUBTRANS hasn't been called yet.
*/
- if (!InRecovery)
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
- LogCheckpointEnd();
+ LogCheckpointEnd(flags);
TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
NBuffers, CheckpointStats.ckpt_segs_added,
@@ -6083,6 +6288,51 @@ CreateCheckPoint(int flags)
LWLockRelease(CheckpointLock);
}
+/*
+ * GetRedoLocationForCheckpoint()
+ *
+ * When !IsRecoveryProcessingMode() this must be called while holding
+ * WALInsertLock.
+ */
+static XLogRecPtr
+GetRedoLocationForCheckpoint()
+{
+ XLogCtlInsert *Insert = &XLogCtl->Insert;
+ uint32 freespace;
+ XLogRecPtr redo;
+
+ freespace = INSERT_FREESPACE(Insert);
+ if (freespace < SizeOfXLogRecord)
+ {
+ (void) AdvanceXLInsertBuffer(false);
+ /* OK to ignore update return flag, since we will do flush anyway */
+ freespace = INSERT_FREESPACE(Insert);
+ }
+ INSERT_RECPTR(redo, Insert, Insert->curridx);
+
+ /*
+ * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+ * must be done while holding the insert lock AND the info_lck.
+ *
+ * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+ * pointing past where it really needs to point. This is okay; the only
+ * consequence is that XLogInsert might back up whole buffers that it
+ * didn't really need to. We can't postpone advancing RedoRecPtr because
+ * XLogInserts that happen while we are dumping buffers must assume that
+ * their buffer changes are not included in the checkpoint.
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ return redo;
+}
+
/*
* Flush all data in shared memory to disk, and fsync
*
@@ -6147,29 +6397,69 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
}
}
+ RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint);
+}
+
+/*
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ * once we have reachedSafeStartPoint. We use bgwriter's shared memory
+ * area wherever we call it from, to keep better code structure.
+ */
+void
+CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
+{
+ if (recoveryLogRestartpoints)
+ {
+ /*
+ * Prepare to accumulate statistics.
+ */
+
+ MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+ LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
+ }
+
/*
- * OK, force data out to disk
+ * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+ * We rely on this lock to ensure that the startup process doesn't exit
+ * Recovery while we are half way through a restartpoint.
*/
- CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+ CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
/*
- * Update pg_control so that any subsequent crash will restart from this
- * checkpoint. Note: ReadRecPtr gives the XLOG address of the checkpoint
- * record itself.
+ * Update pg_control, using current time
*/
+ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
ControlFile->prevCheckPoint = ControlFile->checkPoint;
- ControlFile->checkPoint = ReadRecPtr;
- ControlFile->checkPointCopy = *checkPoint;
+ ControlFile->checkPoint = ReadPtr;
+ ControlFile->checkPointCopy = *restartPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
+ LWLockRelease(ControlFileLock);
+
+ /*
+ * Currently, there is no need to truncate pg_subtrans during recovery.
+ * If we did do that, we will need to have called StartupSUBTRANS()
+ * already and then TruncateSUBTRANS() would go here.
+ */
+
+ /* All real work is done, but log before releasing lock. */
+ if (recoveryLogRestartpoints)
+ LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X",
- checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+ restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
+
if (recoveryLastXTime)
ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
- (errmsg("last completed transaction was at log time %s",
- timestamptz_to_str(recoveryLastXTime))));
+ (errmsg("last completed transaction was at log time %s",
+ timestamptz_to_str(recoveryLastXTime))));
+
+ LWLockRelease(CheckpointLock);
}
/*
@@ -6234,7 +6524,63 @@ RequestXLogSwitch(void)
}
/*
- * XLOG resource manager's routines
+ * exitRecovery()
+ *
+ * Exit recovery state and write an XLOG_RECOVERY_END record. This is the
+ * only record type that can record a change of timelineID. We assume
+ * caller has already set ThisTimeLineID, if appropriate.
+ */
+static void
+exitRecovery(void)
+{
+ XLogRecData rdata;
+
+ rdata.buffer = InvalidBuffer;
+ rdata.data = (char *) (&ThisTimeLineID);
+ rdata.len = sizeof(TimeLineID);
+ rdata.next = NULL;
+
+ /*
+ * If a restartpoint is in progress, we will not be able to successfully
+ * acquire CheckpointLock. If bgwriter is still in progress then send
+ * a second signal to nudge bgwriter to go faster so we can avoid delay.
+ * Then wait for lock, so we know the restartpoint has completed. We do
+ * this because we don't want to interrupt the restartpoint half way
+ * through, which might leave us in a mess and we want to be robust. We're
+ * going to checkpoint soon anyway, so it's not wasted effort.
+ */
+ if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+ LWLockRelease(CheckpointLock);
+ else
+ {
+ RequestRestartPointCompletion();
+ ereport(LOG,
+ (errmsg("startup process waiting for restartpoint to complete")));
+ LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+ LWLockRelease(CheckpointLock);
+ }
+
+ /*
+ * This is the only type of WAL message that can be inserted during
+ * recovery. This ensures that we don't allow others to get access
+ * until after we have changed state.
+ */
+ (void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
+
+ /*
+ * We don't XLogFlush() here otherwise we'll end up zeroing the WAL
+ * file ourselves. So just let bgwriter's forthcoming checkpoint do
+ * that for us.
+ */
+
+ InRecovery = false;
+}
+
+/*
+ * XLOG resource manager's routines.
+ *
+ * Definitions of message info are in include/catalog/pg_control.h,
+ * though not all messages relate to control file processing.
*/
void
xlog_redo(XLogRecPtr lsn, XLogRecord *record)
@@ -6272,21 +6618,38 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
/*
- * TLI may change in a shutdown checkpoint, but it shouldn't decrease
+ * TLI no longer changes at shutdown checkpoint, since as of 8.4,
+ * shutdown checkpoints only occur at shutdown. Much less confusing.
*/
- if (checkPoint.ThisTimeLineID != ThisTimeLineID)
+
+ RecoveryRestartPoint(&checkPoint);
+ }
+ else if (info == XLOG_RECOVERY_END)
+ {
+ TimeLineID tli;
+
+ memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+
+ /*
+ * TLI may change when recovery ends, but it shouldn't decrease.
+ *
+ * This is the only WAL record that can tell us to change timelineID
+ * while we process WAL records.
+ *
+ * We can *choose* to stop recovery at any point, generating a
+ * new timelineID which is recorded using this record type.
+ */
+ if (tli != ThisTimeLineID)
{
- if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
+ if (tli < ThisTimeLineID ||
!list_member_int(expectedTLIs,
- (int) checkPoint.ThisTimeLineID))
+ (int) tli))
ereport(PANIC,
- (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- checkPoint.ThisTimeLineID, ThisTimeLineID)));
+ (errmsg("unexpected timeline ID %u (after %u) at recovery end record",
+ tli, ThisTimeLineID)));
/* Following WAL records should be run with new TLI */
- ThisTimeLineID = checkPoint.ThisTimeLineID;
+ ThisTimeLineID = tli;
}
-
- RecoveryRestartPoint(&checkPoint);
}
else if (info == XLOG_CHECKPOINT_ONLINE)
{
@@ -6309,7 +6672,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
- /* TLI should not change in an on-line checkpoint */
+ /* TLI must not change at a checkpoint */
if (checkPoint.ThisTimeLineID != ThisTimeLineID)
ereport(PANIC,
(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 6a0cd4eebf..3163fd3c1b 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -49,6 +49,7 @@
#include <unistd.h>
#include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
@@ -129,6 +130,13 @@ typedef struct
int ckpt_flags; /* checkpoint flags, as defined in xlog.h */
+ /*
+ * When the Startup process wants bgwriter to perform a restartpoint, it
+ * sets these fields so that we can update the control file afterwards.
+ */
+ XLogRecPtr ReadPtr; /* Requested log pointer */
+ CheckPoint restartPoint; /* restartPoint data for ControlFile */
+
uint32 num_backend_writes; /* counts non-bgwriter buffer writes */
int num_requests; /* current # of requests */
@@ -165,7 +173,7 @@ static bool ckpt_active = false;
/* these values are valid when ckpt_active is true: */
static pg_time_t ckpt_start_time;
-static XLogRecPtr ckpt_start_recptr;
+static XLogRecPtr ckpt_start_recptr; /* not used if IsRecoveryProcessingMode */
static double ckpt_cached_elapsed;
static pg_time_t last_checkpoint_time;
@@ -197,6 +205,7 @@ BackgroundWriterMain(void)
{
sigjmp_buf local_sigjmp_buf;
MemoryContext bgwriter_context;
+ bool BgWriterRecoveryMode;
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
@@ -355,16 +364,17 @@ BackgroundWriterMain(void)
*/
PG_SETMASK(&UnBlockSig);
+ BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+ if (BgWriterRecoveryMode)
+ elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
+ BgWriterShmem->bgwriter_pid);
+
/*
* Loop forever
*/
for (;;)
{
- bool do_checkpoint = false;
- int flags = 0;
- pg_time_t now;
- int elapsed_secs;
-
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
@@ -382,118 +392,204 @@ BackgroundWriterMain(void)
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- if (checkpoint_requested)
- {
- checkpoint_requested = false;
- do_checkpoint = true;
- BgWriterStats.m_requested_checkpoints++;
- }
- if (shutdown_requested)
- {
- /*
- * From here on, elog(ERROR) should end with exit(1), not send
- * control back to the sigsetjmp block above
- */
- ExitOnAnyError = true;
- /* Close down the database */
- ShutdownXLOG(0, 0);
- /* Normal exit from the bgwriter is here */
- proc_exit(0); /* done */
- }
- /*
- * Force a checkpoint if too much time has elapsed since the last one.
- * Note that we count a timed checkpoint in stats only when this
- * occurs without an external request, but we set the CAUSE_TIME flag
- * bit even if there is also an external request.
- */
- now = (pg_time_t) time(NULL);
- elapsed_secs = now - last_checkpoint_time;
- if (elapsed_secs >= CheckPointTimeout)
- {
- if (!do_checkpoint)
- BgWriterStats.m_timed_checkpoints++;
- do_checkpoint = true;
- flags |= CHECKPOINT_CAUSE_TIME;
- }
-
- /*
- * Do a checkpoint if requested, otherwise do one cycle of
- * dirty-buffer writing.
- */
- if (do_checkpoint)
- {
- /* use volatile pointer to prevent code rearrangement */
- volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-
- /*
- * Atomically fetch the request flags to figure out what kind of a
- * checkpoint we should perform, and increase the started-counter
- * to acknowledge that we've started a new checkpoint.
- */
- SpinLockAcquire(&bgs->ckpt_lck);
- flags |= bgs->ckpt_flags;
- bgs->ckpt_flags = 0;
- bgs->ckpt_started++;
- SpinLockRelease(&bgs->ckpt_lck);
-
- /*
- * We will warn if (a) too soon since last checkpoint (whatever
- * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
- * since the last checkpoint start. Note in particular that this
- * implementation will not generate warnings caused by
- * CheckPointTimeout < CheckPointWarning.
- */
- if ((flags & CHECKPOINT_CAUSE_XLOG) &&
- elapsed_secs < CheckPointWarning)
- ereport(LOG,
- (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
- elapsed_secs),
- errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
-
- /*
- * Initialize bgwriter-private variables used during checkpoint.
- */
- ckpt_active = true;
- ckpt_start_recptr = GetInsertRecPtr();
- ckpt_start_time = now;
- ckpt_cached_elapsed = 0;
-
- /*
- * Do the checkpoint.
- */
- CreateCheckPoint(flags);
+ if (BgWriterRecoveryMode)
+ {
+ if (shutdown_requested)
+ {
+ /*
+ * From here on, elog(ERROR) should end with exit(1), not send
+ * control back to the sigsetjmp block above
+ */
+ ExitOnAnyError = true;
+ /* Normal exit from the bgwriter is here */
+ proc_exit(0); /* done */
+ }
+
+ if (!IsRecoveryProcessingMode())
+ {
+ elog(DEBUG2, "bgwriter changing from recovery to normal mode");
+
+ InitXLOGAccess();
+ BgWriterRecoveryMode = false;
+
+ /*
+ * Start time-driven events from now
+ */
+ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
+
+ /*
+ * Notice that we do *not* act on a checkpoint_requested
+ * state at this point. We have changed mode, so we wish to
+ * perform a checkpoint not a restartpoint.
+ */
+ continue;
+ }
+
+ if (checkpoint_requested)
+ {
+ XLogRecPtr ReadPtr;
+ CheckPoint restartPoint;
+
+ checkpoint_requested = false;
+
+ /*
+ * Initialize bgwriter-private variables used during checkpoint.
+ */
+ ckpt_active = true;
+ ckpt_start_time = (pg_time_t) time(NULL);
+ ckpt_cached_elapsed = 0;
+
+ /*
+ * Get the requested values from shared memory that the
+ * Startup process has put there for us.
+ */
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ ReadPtr = BgWriterShmem->ReadPtr;
+ memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ /* Use smoothed writes, until interrupted if ever */
+ CreateRestartPoint(ReadPtr, &restartPoint, 0);
+
+ /*
+ * After any checkpoint, close all smgr files. This is so we
+ * won't hang onto smgr references to deleted files indefinitely.
+ */
+ smgrcloseall();
+
+ ckpt_active = false;
+ checkpoint_requested = false;
+ }
+ else
+ {
+ /* Clean buffers dirtied by recovery */
+ BgBufferSync();
+
+ /* Nap for the configured time. */
+ BgWriterNap();
+ }
+ }
+ else /* Normal processing */
+ {
+ bool do_checkpoint = false;
+ int flags = 0;
+ pg_time_t now;
+ int elapsed_secs;
+
+ if (checkpoint_requested)
+ {
+ checkpoint_requested = false;
+ do_checkpoint = true;
+ BgWriterStats.m_requested_checkpoints++;
+ }
+ if (shutdown_requested)
+ {
+ /*
+ * From here on, elog(ERROR) should end with exit(1), not send
+ * control back to the sigsetjmp block above
+ */
+ ExitOnAnyError = true;
+ /* Close down the database */
+ ShutdownXLOG(0, 0);
+ /* Normal exit from the bgwriter is here */
+ proc_exit(0); /* done */
+ }
/*
- * After any checkpoint, close all smgr files. This is so we
- * won't hang onto smgr references to deleted files indefinitely.
+ * Force a checkpoint if too much time has elapsed since the last one.
+ * Note that we count a timed checkpoint in stats only when this
+ * occurs without an external request, but we set the CAUSE_TIME flag
+ * bit even if there is also an external request.
*/
- smgrcloseall();
+ now = (pg_time_t) time(NULL);
+ elapsed_secs = now - last_checkpoint_time;
+ if (elapsed_secs >= CheckPointTimeout)
+ {
+ if (!do_checkpoint)
+ BgWriterStats.m_timed_checkpoints++;
+ do_checkpoint = true;
+ flags |= CHECKPOINT_CAUSE_TIME;
+ }
/*
- * Indicate checkpoint completion to any waiting backends.
+ * Do a checkpoint if requested, otherwise do one cycle of
+ * dirty-buffer writing.
*/
- SpinLockAcquire(&bgs->ckpt_lck);
- bgs->ckpt_done = bgs->ckpt_started;
- SpinLockRelease(&bgs->ckpt_lck);
+ if (do_checkpoint)
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+ /*
+ * Atomically fetch the request flags to figure out what kind of a
+ * checkpoint we should perform, and increase the started-counter
+ * to acknowledge that we've started a new checkpoint.
+ */
+ SpinLockAcquire(&bgs->ckpt_lck);
+ flags |= bgs->ckpt_flags;
+ bgs->ckpt_flags = 0;
+ bgs->ckpt_started++;
+ SpinLockRelease(&bgs->ckpt_lck);
+
+ /*
+ * We will warn if (a) too soon since last checkpoint (whatever
+ * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
+ * since the last checkpoint start. Note in particular that this
+ * implementation will not generate warnings caused by
+ * CheckPointTimeout < CheckPointWarning.
+ */
+ if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+ elapsed_secs < CheckPointWarning)
+ ereport(LOG,
+ (errmsg("checkpoints are occurring too frequently (%d seconds apart)",
+ elapsed_secs),
+ errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
+
+ /*
+ * Initialize bgwriter-private variables used during checkpoint.
+ */
+ ckpt_active = true;
+ ckpt_start_recptr = GetInsertRecPtr();
+ ckpt_start_time = now;
+ ckpt_cached_elapsed = 0;
+
+ /*
+ * Do the checkpoint.
+ */
+ CreateCheckPoint(flags);
+
+ /*
+ * After any checkpoint, close all smgr files. This is so we
+ * won't hang onto smgr references to deleted files indefinitely.
+ */
+ smgrcloseall();
+
+ /*
+ * Indicate checkpoint completion to any waiting backends.
+ */
+ SpinLockAcquire(&bgs->ckpt_lck);
+ bgs->ckpt_done = bgs->ckpt_started;
+ SpinLockRelease(&bgs->ckpt_lck);
+
+ ckpt_active = false;
+
+ /*
+ * Note we record the checkpoint start time not end time as
+ * last_checkpoint_time. This is so that time-driven checkpoints
+ * happen at a predictable spacing.
+ */
+ last_checkpoint_time = now;
+ }
+ else
+ BgBufferSync();
- ckpt_active = false;
+ /* Check for archive_timeout and switch xlog files if necessary. */
+ CheckArchiveTimeout();
- /*
- * Note we record the checkpoint start time not end time as
- * last_checkpoint_time. This is so that time-driven checkpoints
- * happen at a predictable spacing.
- */
- last_checkpoint_time = now;
+ /* Nap for the configured time. */
+ BgWriterNap();
}
- else
- BgBufferSync();
-
- /* Check for archive_timeout and switch xlog files if necessary. */
- CheckArchiveTimeout();
-
- /* Nap for the configured time. */
- BgWriterNap();
}
}
@@ -586,7 +682,8 @@ BgWriterNap(void)
(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
break;
pg_usleep(1000000L);
- AbsorbFsyncRequests();
+ if (!IsRecoveryProcessingMode())
+ AbsorbFsyncRequests();
udelay -= 1000000L;
}
@@ -640,6 +737,19 @@ CheckpointWriteDelay(int flags, double progress)
if (!am_bg_writer)
return;
+ /* Perform minimal duties during recovery and skip wait if requested */
+ if (IsRecoveryProcessingMode())
+ {
+ BgBufferSync();
+
+ if (!shutdown_requested &&
+ !checkpoint_requested &&
+ IsCheckpointOnSchedule(progress))
+ BgWriterNap();
+
+ return;
+ }
+
/*
* Perform the usual bgwriter duties and take a nap, unless we're behind
* schedule, in which case we just try to catch up as quickly as possible.
@@ -714,16 +824,19 @@ IsCheckpointOnSchedule(double progress)
* However, it's good enough for our purposes, we're only calculating an
* estimate anyway.
*/
- recptr = GetInsertRecPtr();
- elapsed_xlogs =
- (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
- ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
- CheckPointSegments;
-
- if (progress < elapsed_xlogs)
+ if (!IsRecoveryProcessingMode())
{
- ckpt_cached_elapsed = elapsed_xlogs;
- return false;
+ recptr = GetInsertRecPtr();
+ elapsed_xlogs =
+ (((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+ ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+ CheckPointSegments;
+
+ if (progress < elapsed_xlogs)
+ {
+ ckpt_cached_elapsed = elapsed_xlogs;
+ return false;
+ }
}
/*
@@ -989,6 +1102,77 @@ RequestCheckpoint(int flags)
}
/*
+ * Always runs in Startup process (see xlog.c)
+ */
+void
+RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+{
+ /*
+ * Should we just do it ourselves?
+ */
+ if (!IsPostmasterEnvironment || !sendToBGWriter)
+ {
+ CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE);
+ return;
+ }
+
+ /*
+ * Push requested values into shared memory, then signal to request restartpoint.
+ */
+ if (BgWriterShmem->bgwriter_pid == 0)
+ elog(LOG, "could not request restartpoint because bgwriter not running");
+
+#ifdef NOT_USED
+ elog(LOG, "tli = %u nextXidEpoch = %u nextXid = %u nextOid = %u",
+ restartPoint->ThisTimeLineID,
+ restartPoint->nextXidEpoch,
+ restartPoint->nextXid,
+ restartPoint->nextOid);
+#endif
+
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ BgWriterShmem->ReadPtr = ReadPtr;
+ memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ elog(LOG, "could not signal for restartpoint: %m");
+}
+
+/*
+ * Sends another checkpoint request signal to bgwriter, which causes it
+ * to avoid smoothed writes and continue processing as if it had been
+ * called with CHECKPOINT_IMMEDIATE. This is used at the end of recovery.
+ */
+void
+RequestRestartPointCompletion(void)
+{
+ if (BgWriterShmem->bgwriter_pid != 0 &&
+ kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ elog(LOG, "could not signal for restartpoint immediate: %m");
+}
+
+XLogRecPtr
+GetRedoLocationForArchiveCheckpoint(void)
+{
+ XLogRecPtr redo;
+
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ redo = BgWriterShmem->ReadPtr;
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ return redo;
+}
+
+void
+SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
+{
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ BgWriterShmem->ReadPtr = redo;
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+}
+
+/*
* ForwardFsyncRequest
* Forward a file-fsync request from a backend to the bgwriter
*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 3380b806f6..5cb84be4b8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -254,6 +254,11 @@ typedef enum
{
PM_INIT, /* postmaster starting */
PM_STARTUP, /* waiting for startup subprocess */
+ PM_RECOVERY, /* consistent recovery mode; state only
+ * entered for archive and streaming recovery,
+ * and only after the point where
+ * all data is in a consistent state.
+ */
PM_RUN, /* normal "database is alive" state */
PM_WAIT_BACKUP, /* waiting for online backup mode to end */
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
@@ -1302,7 +1307,7 @@ ServerLoop(void)
* state that prevents it, start one. It doesn't matter if this
* fails, we'll just try again later.
*/
- if (BgWriterPID == 0 && pmState == PM_RUN)
+ if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
BgWriterPID = StartBackgroundWriter();
/*
@@ -2116,7 +2121,7 @@ reaper(SIGNAL_ARGS)
if (pid == StartupPID)
{
StartupPID = 0;
- Assert(pmState == PM_STARTUP);
+ Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
@@ -2157,11 +2162,11 @@ reaper(SIGNAL_ARGS)
load_role();
/*
- * Crank up the background writer. It doesn't matter if this
- * fails, we'll just try again later.
+ * Check whether we need to start background writer, if not
+ * already running.
*/
- Assert(BgWriterPID == 0);
- BgWriterPID = StartBackgroundWriter();
+ if (BgWriterPID == 0)
+ BgWriterPID = StartBackgroundWriter();
/*
* Likewise, start other special children as needed. In a restart
@@ -3847,6 +3852,51 @@ sigusr1_handler(SIGNAL_ARGS)
PG_SETMASK(&BlockSig);
+ if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ {
+ Assert(pmState == PM_STARTUP);
+
+ /*
+ * Go to shutdown mode if a shutdown request was pending.
+ */
+ if (Shutdown > NoShutdown)
+ {
+ pmState = PM_WAIT_BACKENDS;
+ /* PostmasterStateMachine logic does the rest */
+ }
+ else
+ {
+ /*
+ * Startup process has entered recovery
+ */
+ pmState = PM_RECOVERY;
+
+ /*
+ * Load the flat authorization file into postmaster's cache. The
+ * startup process won't have recomputed this from the database yet,
+ * so it may change following recovery.
+ */
+ load_role();
+
+ /*
+ * Crank up the background writer. It doesn't matter if this
+ * fails, we'll just try again later.
+ */
+ Assert(BgWriterPID == 0);
+ BgWriterPID = StartBackgroundWriter();
+
+ /*
+ * Likewise, start other special children as needed.
+ */
+ Assert(PgStatPID == 0);
+ PgStatPID = pgstat_start();
+
+ /* XXX at this point we could accept read-only connections */
+ ereport(DEBUG1,
+ (errmsg("database system is in consistent recovery mode")));
+ }
+ }
+
if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
{
/*
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 62b22bd1db..a7b81e37a7 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -268,3 +268,12 @@ out (and anyone else who flushes buffer contents to disk must do so too).
This ensures that the page image transferred to disk is reasonably consistent.
We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.
+
+As of 8.4, background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs an
+identical service to normal processing, except that checkpoints it
+writes are technically restartpoints. Flushing outstanding WAL for dirty
+buffers is also skipped, though there shouldn't ever be new WAL entries
+at that time in any case. We could choose to start background writer
+immediately but we hold off until we can prove the database is in a
+consistent state so that postmaster has a single, clean state change.
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 4ea849d7f1..3bba50ab83 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -197,6 +197,9 @@ main(int argc, char *argv[])
printf(_("Minimum recovery ending location: %X/%X\n"),
ControlFile.minRecoveryPoint.xlogid,
ControlFile.minRecoveryPoint.xrecoff);
+ printf(_("Minimum safe starting location: %X/%X\n"),
+ ControlFile.minSafeStartPoint.xlogid,
+ ControlFile.minSafeStartPoint.xrecoff);
printf(_("Maximum data alignment: %u\n"),
ControlFile.maxAlign);
/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 51cdde1145..b20d4bd4dd 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -603,6 +603,8 @@ RewriteControlFile(void)
ControlFile.prevCheckPoint.xrecoff = 0;
ControlFile.minRecoveryPoint.xlogid = 0;
ControlFile.minRecoveryPoint.xrecoff = 0;
+ ControlFile.minSafeStartPoint.xlogid = 0;
+ ControlFile.minSafeStartPoint.xrecoff = 0;
/* Now we can force the recorded xlog seg size to the right thing. */
ControlFile.xlog_seg_size = XLogSegSize;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6913f7c800..cf787c8df6 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -133,7 +133,16 @@ typedef struct XLogRecData
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
-extern bool InRecovery;
+
+/*
+ * Prior to 8.4, all activity during recovery was carried out by Startup
+ * process. This local variable continues to be used in many parts of the
+ * code to indicate actions taken by RecoveryManagers. Other processes who
+ * potentially perform work during recovery should check
+ * IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c
+ */
+extern bool InRecovery;
+
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
@@ -166,6 +175,7 @@ extern bool XLOG_DEBUG;
/* These indicate the cause of a checkpoint request */
#define CHECKPOINT_CAUSE_XLOG 0x0010 /* XLOG consumption */
#define CHECKPOINT_CAUSE_TIME 0x0020 /* Elapsed time */
+#define CHECKPOINT_RESTARTPOINT 0x0040 /* Restartpoint during recovery */
/* Checkpoint statistics */
typedef struct CheckpointStatsData
@@ -199,6 +209,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
+extern bool IsRecoveryProcessingMode(void);
+
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
extern void XLOGShmemInit(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 5675bfbcbd..4830a5ce74 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -17,6 +17,7 @@
#define XLOG_INTERNAL_H
#include "access/xlog.h"
+#include "catalog/pg_control.h"
#include "fmgr.h"
#include "pgtime.h"
#include "storage/block.h"
@@ -245,6 +246,9 @@ extern const RmgrData RmgrTable[];
extern pg_time_t GetLastSegSwitchTime(void);
extern XLogRecPtr RequestXLogSwitch(void);
+extern void CreateRestartPoint(const XLogRecPtr ReadPtr,
+ const CheckPoint *restartPoint, int flags);
+
/*
* These aren't in xlog.h because I'd rather not include fmgr.h there.
*/
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 400f32c749..e69c8ec553 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
/* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION 843
+#define PG_CONTROL_VERSION 847
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
@@ -46,7 +46,7 @@ typedef struct CheckPoint
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
-
+#define XLOG_RECOVERY_END 0x50
/* System status indicator */
typedef enum DBState
@@ -102,6 +102,7 @@ typedef struct ControlFileData
CheckPoint checkPointCopy; /* copy of last check point record */
XLogRecPtr minRecoveryPoint; /* must replay xlog to here */
+ XLogRecPtr minSafeStartPoint; /* safe point after recovery crashes */
/*
* This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index c1c9d7f580..d4b389e927 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -12,6 +12,7 @@
#ifndef _BGWRITER_H
#define _BGWRITER_H
+#include "catalog/pg_control.h"
#include "storage/block.h"
#include "storage/relfilenode.h"
@@ -25,6 +26,11 @@ extern double CheckPointCompletionTarget;
extern void BackgroundWriterMain(void);
extern void RequestCheckpoint(int flags);
+extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
+extern void RequestRestartPointCompletion(void);
+extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void);
+extern void SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
+
extern void CheckpointWriteDelay(int flags, double progress);
extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 3101092cbd..1904187bfd 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -22,6 +22,7 @@
*/
typedef enum
{
+ PMSIGNAL_RECOVERY_START, /* move to PM_RECOVERY state */
PMSIGNAL_PASSWORD_CHANGE, /* pg_auth file has changed */
PMSIGNAL_WAKEN_ARCHIVER, /* send a NOTIFY signal to xlog archiver */
PMSIGNAL_ROTATE_LOGFILE, /* send SIGUSR1 to syslogger to rotate logfile */