From f4fd26775b8b6e7090774bb3d794b8529771fce5 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@enterprisedb.com>
Date: Fri, 23 Jan 2009 14:31:41 +0200
Subject: [PATCH] Import Simon's recovery infrastructure patch v9

---
 src/backend/access/transam/clog.c       |   3 +
 src/backend/access/transam/multixact.c  |  14 +-
 src/backend/access/transam/subtrans.c   |   3 +
 src/backend/access/transam/xact.c       |   3 +
 src/backend/access/transam/xlog.c       | 723 ++++++++++++++++++------
 src/backend/postmaster/bgwriter.c       | 420 ++++++++++----
 src/backend/postmaster/postmaster.c     |  62 +-
 src/backend/storage/buffer/README       |   9 +
 src/bin/pg_controldata/pg_controldata.c |   3 +
 src/bin/pg_resetxlog/pg_resetxlog.c     |   2 +
 src/include/access/xlog.h               |  14 +-
 src/include/access/xlog_internal.h      |   4 +
 src/include/catalog/pg_control.h        |   5 +-
 src/include/postmaster/bgwriter.h       |   6 +
 src/include/storage/pmsignal.h          |   1 +
 15 files changed, 959 insertions(+), 313 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 528a219db4..5bd72154c5 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -475,6 +475,9 @@ ZeroCLOGPage(int pageno, bool writeXlog)
 /*
  * This must be called ONCE during postmaster or standalone-backend startup,
  * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ *
+ * We access just a single clog page, so this action is atomic and safe
+ * for use if other processes are active during recovery.
  */
 void
 StartupCLOG(void)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 7314341101..881a588d69 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1413,8 +1413,11 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact.	Note that we
  * may already have replayed WAL data into the SLRU files.
  *
- * We don't need any locks here, really; the SLRU locks are taken
- * only because slru.c expects to be called with locks held.
+ * We want this operation to be atomic to ensure that other processes can 
+ * use MultiXact while we complete recovery. We access one page only from the
+ * offset and members buffers, so once locks are acquired they will not be
+ * dropped and re-acquired by SLRU code. So we take both locks at start, then
+ * hold them all the way to the end.
  */
 void
 StartupMultiXact(void)
@@ -1426,6 +1429,7 @@ StartupMultiXact(void)
 
 	/* Clean up offsets state */
 	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
 
 	/*
 	 * Initialize our idea of the latest page number.
@@ -1452,10 +1456,7 @@ StartupMultiXact(void)
 		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
 	}
 
-	LWLockRelease(MultiXactOffsetControlLock);
-
 	/* And the same for members */
-	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
 
 	/*
 	 * Initialize our idea of the latest page number.
@@ -1483,6 +1484,7 @@ StartupMultiXact(void)
 	}
 
 	LWLockRelease(MultiXactMemberControlLock);
+	LWLockRelease(MultiXactOffsetControlLock);
 
 	/*
 	 * Initialize lastTruncationPoint to invalid, ensuring that the first
@@ -1543,7 +1545,7 @@ CheckPointMultiXact(void)
 	 * SimpleLruTruncate would get confused.  It seems best not to risk
 	 * removing any data during recovery anyway, so don't truncate.
 	 */
-	if (!InRecovery)
+	if (!IsRecoveryProcessingMode())
 		TruncateMultiXact();
 
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 0dbd2166be..eaad23182a 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -226,6 +226,9 @@ ZeroSUBTRANSPage(int pageno)
  *
  * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
  * if there are none.
+ *
+ * Note that this is not atomic and is not yet safe to perform while other
+ * processes might access subtrans.
  */
 void
 StartupSUBTRANS(TransactionId oldestActiveXID)
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index c94e2a2251..d0ed3c0318 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -394,6 +394,9 @@ AssignTransactionId(TransactionState s)
 	bool		isSubXact = (s->parent != NULL);
 	ResourceOwner currentOwner;
 
+	if (IsRecoveryProcessingMode())
+		elog(FATAL, "cannot assign TransactionIds during recovery");
+
 	/* Assert that caller didn't screw up */
 	Assert(!TransactionIdIsValid(s->transactionId));
 	Assert(s->state == TRANS_INPROGRESS);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bd6035d4a6..7e480e2fb2 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -115,7 +115,8 @@ CheckpointStatsData CheckpointStats;
 
 /*
  * ThisTimeLineID will be same in all backends --- it identifies current
- * WAL timeline for the database system.
+ * WAL timeline for the database system. Zero is always a bug, so we 
+ * start with that to allow us to spot any errors.
  */
 TimeLineID	ThisTimeLineID = 0;
 
@@ -125,6 +126,10 @@ bool		InRecovery = false;
 /* Are we recovering using offline XLOG archives? */
 static bool InArchiveRecovery = false;
 
+/* Local copy of shared RecoveryProcessingMode state */
+static bool LocalRecoveryProcessingMode = true;
+static bool knownProcessingMode = false;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
@@ -143,6 +148,9 @@ static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
 static bool recoveryStopAfter;
 
+/* is the database proven consistent yet? */
+bool	reachedSafeStartPoint = false;
+
 /*
  * During normal operation, the only timeline we care about is ThisTimeLineID.
  * During recovery, however, things are more complicated.  To simplify life
@@ -242,10 +250,30 @@ static XLogRecPtr RedoRecPtr;
  * ControlFileLock: must be held to read/update control file or create
  * new log file.
  *
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
+ * we get just one of those at any time. In 8.4+ recovery, both startup and
+ * bgwriter processes may take restartpoints, so this locking must be strict 
+ * to ensure there are no mistakes.
+ *
+ * In 8.4 we progress through a number of states at startup. Initially, the
+ * postmaster is in PM_STARTUP state and spawns the Startup process. We then
+ * progress until the database is in a consistent state, then if we are in
+ * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts
+ * up and takes over responsibility for performing restartpoints. We then
+ * progress until the end of recovery when we enter PM_RUN state upon
+ * termination of the Startup process. In summary:
+ * 
+ * PM_STARTUP state:	Startup process performs restartpoints
+ * PM_RECOVERY state:	bgwriter process performs restartpoints
+ * PM_RUN state: 		bgwriter process performs checkpoints
  *
+ * These transitions are fairly delicate, with many things that need to
+ * happen at the same time in order to change state successfully throughout
+ * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
+ * prove the databases are in a consistent state. Changing from PM_RECOVERY
+ * to PM_RUN happens whenever recovery ends, which could be forced upon us
+ * externally or it can occur because of damage or termination of the WAL
+ * sequence.
  *----------
  */
 
@@ -287,11 +315,18 @@ typedef struct XLogCtlWrite
 
 /*
  * Total shared-memory state for XLOG.
+ *
+ * This small structure is accessed by many backends, so we take care to
+ * pad out the parts of the structure so they can be accessed by separate
+ * CPUs without causing false sharing cache flushes. Padding is generous
+ * to allow for a wide variety of CPU architectures.
  */
+#define	XLOGCTL_BUFFER_SPACING	128
 typedef struct XLogCtlData
 {
 	/* Protected by WALInsertLock: */
 	XLogCtlInsert Insert;
+	char	InsertPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlInsert)];
 
 	/* Protected by info_lck: */
 	XLogwrtRqst LogwrtRqst;
@@ -299,9 +334,16 @@ typedef struct XLogCtlData
 	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
 	TransactionId ckptXid;
 	XLogRecPtr	asyncCommitLSN; /* LSN of newest async commit */
+	/* add data structure padding for above info_lck declarations */
+	char	InfoPadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogwrtRqst) 
+											- sizeof(XLogwrtResult)
+											- sizeof(uint32)
+											- sizeof(TransactionId)
+											- sizeof(XLogRecPtr)];
 
 	/* Protected by WALWriteLock: */
 	XLogCtlWrite Write;
+	char	WritePadding[XLOGCTL_BUFFER_SPACING - sizeof(XLogCtlWrite)];
 
 	/*
 	 * These values do not change after startup, although the pointed-to pages
@@ -313,6 +355,24 @@ typedef struct XLogCtlData
 	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
 	TimeLineID	ThisTimeLineID;
 
+	/*
+	 * IsRecoveryProcessingMode shows whether the postmaster is in a
+	 * postmaster state earlier than PM_RUN, or not. This is a globally
+	 * accessible state to allow EXEC_BACKEND case.
+	 *
+	 * We also retain a local state variable InRecovery. InRecovery=true
+	 * means the code is being executed by Startup process and therefore
+	 * always during Recovery Processing Mode. This allows us to identify
+	 * code executed *during* Recovery Processing Mode but not necessarily
+	 * by Startup process itself.
+	 *
+	 * Protected by mode_lck
+	 */
+	bool		SharedRecoveryProcessingMode;
+	slock_t		mode_lck;
+
+	char		InfoLockPadding[XLOGCTL_BUFFER_SPACING];
+
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
@@ -399,8 +459,10 @@ static void XLogArchiveCleanup(const char *xlog);
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI,
 					uint32 endLogId, uint32 endLogSeg);
+static void exitRecovery(void);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+static XLogRecPtr GetRedoLocationForCheckpoint(void);
 
 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 				XLogRecPtr *lsn, BkpBlock *bkpb);
@@ -483,6 +545,11 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+
+	/* cross-check on whether we should be here or not */
+	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+		elog(FATAL, "cannot make new WAL entries during recovery");
 
 	/* info's high bits are reserved for use by me */
 	if (info & XLR_INFO_MASK)
@@ -1729,8 +1796,7 @@ XLogFlush(XLogRecPtr record)
 	XLogRecPtr	WriteRqstPtr;
 	XLogwrtRqst WriteRqst;
 
-	/* Disabled during REDO */
-	if (InRedo)
+	if (IsRecoveryProcessingMode())
 		return;
 
 	/* Quick exit if already known flushed */
@@ -1818,9 +1884,9 @@ XLogFlush(XLogRecPtr record)
 	 * the bad page is encountered again during recovery then we would be
 	 * unable to restart the database at all!  (This scenario has actually
 	 * happened in the field several times with 7.1 releases. Note that we
-	 * cannot get here while InRedo is true, but if the bad page is brought in
-	 * and marked dirty during recovery then CreateCheckPoint will try to
-	 * flush it at the end of recovery.)
+	 * cannot get here while IsRecoveryProcessingMode() is true, but if the bad
+	 * page is brought in and marked dirty during recovery then a checkpoint
+	 * performed at the end of recovery will try to flush it.)
 	 *
 	 * The current approach is to ERROR under normal conditions, but only
 	 * WARNING during recovery, so that the system can be brought up even if
@@ -1830,7 +1896,7 @@ XLogFlush(XLogRecPtr record)
 	 * and so we will not force a restart for a bad LSN on a data page.
 	 */
 	if (XLByteLT(LogwrtResult.Flush, record))
-		elog(InRecovery ? WARNING : ERROR,
+		elog(ERROR,
 		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
 			 record.xlogid, record.xrecoff,
 			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2103,7 +2169,8 @@ XLogFileInit(uint32 log, uint32 seg,
 		unlink(tmppath);
 	}
 
-	elog(DEBUG2, "done creating and filling new WAL file");
+	XLogFileName(tmppath, ThisTimeLineID, log, seg);
+	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
 
 	/* Set flag to tell caller there was no existent file */
 	*use_existent = false;
@@ -2409,6 +2476,28 @@ XLogFileRead(uint32 log, uint32 seg, int emode)
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
+			/* 
+			 * Calculate and write out a new safeStartPoint. This defines
+			 * the latest LSN that might appear on-disk while we apply
+			 * the WAL records in this file. If we crash during recovery
+			 * we must reach this point again before we can prove
+			 * database consistency. Not a restartpoint! Restart points
+			 * define where we should start recovery from, if we crash.
+			 */
+			if (InArchiveRecovery)
+			{
+				uint32 nextLog = log;
+				uint32 nextSeg = seg;
+
+				NextLogSeg(nextLog, nextSeg);
+
+				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+				ControlFile->minSafeStartPoint.xlogid = nextLog;
+				ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize;
+				UpdateControlFile();
+				LWLockRelease(ControlFileLock);
+			}
+
 			return fd;
 		}
 		if (errno != ENOENT)	/* unexpected failure? */
@@ -4283,6 +4372,7 @@ XLOGShmemInit(void)
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
 	XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
 	SpinLockInit(&XLogCtl->info_lck);
+	SpinLockInit(&XLogCtl->mode_lck);
 
 	/*
 	 * If we are not in bootstrap mode, pg_control should already exist. Read
@@ -4593,12 +4683,12 @@ readRecoveryCommandFile(void)
 			 * does nothing if a recovery_target is not also set
 			 */
 			if (!parse_bool(tok2, &recoveryLogRestartpoints))
-				  ereport(ERROR,
-							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
 			ereport(LOG,
-					(errmsg("log_restartpoints = %s", tok2)));
-		}
+				(errmsg("log_restartpoints = %s", tok2)));
+ 		}
 		else
 			ereport(FATAL,
 					(errmsg("unrecognized recovery parameter \"%s\"",
@@ -4733,15 +4823,13 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 	unlink(recoveryPath);		/* ignore any error */
 
 	/*
-	 * Rename the config file out of the way, so that we don't accidentally
-	 * re-enter archive recovery mode in a subsequent crash.
+	 * As of 8.4 we no longer rename the recovery.conf file out of the
+	 * way until after we have performed a full checkpoint. This ensures
+	 * that any crash between now and the end of the checkpoint does not
+	 * attempt to restart from a WAL file that is no longer available to us.
+	 * As soon as we remove recovery.conf we lose our restore_command and
+	 * cannot re-access WAL files from the archive.
 	 */
-	unlink(RECOVERY_COMMAND_DONE);
-	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
-		ereport(FATAL,
-				(errcode_for_file_access(),
-				 errmsg("could not rename file \"%s\" to \"%s\": %m",
-						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
 
 	ereport(LOG,
 			(errmsg("archive recovery complete")));
@@ -4876,6 +4964,7 @@ StartupXLOG(void)
 	CheckPoint	checkPoint;
 	bool		wasShutdown;
 	bool		reachedStopPoint = false;
+	bool		performedRecovery = false;
 	bool		haveBackupLabel = false;
 	XLogRecPtr	RecPtr,
 				LastRec,
@@ -4888,6 +4977,8 @@ StartupXLOG(void)
 	uint32		freespace;
 	TransactionId oldestActiveXID;
 
+	XLogCtl->SharedRecoveryProcessingMode = true;
+
 	/*
 	 * Read control file and check XLOG status looks valid.
 	 *
@@ -5108,8 +5199,14 @@ StartupXLOG(void)
 		if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
 			ControlFile->minRecoveryPoint = minRecoveryLoc;
 		ControlFile->time = (pg_time_t) time(NULL);
+		/* No need to hold ControlFileLock yet, we aren't up far enough */
 		UpdateControlFile();
 
+		/*
+		 * Reset pgstat data, because it may be invalid after recovery.
+		 */
+		pgstat_reset_all();
+
 		/*
 		 * If there was a backup label file, it's done its job and the info
 		 * has now been propagated into pg_control.  We must get rid of the
@@ -5217,6 +5314,32 @@ StartupXLOG(void)
 
 				LastRec = ReadRecPtr;
 
+				/*
+				 * Have we reached our safe starting point? If so, we can
+				 * signal Postmaster to enter consistent recovery mode.
+				 *
+				 * There are two points in the log we must pass. The first is
+				 * the minRecoveryPoint, which is the LSN at the time the
+				 * base backup was taken that we are about to roll forward from.
+				 * If recovery has ever crashed or was stopped there is
+				 * another point also: minSafeStartPoint, which is the
+				 * latest LSN that recovery could have reached prior to crash.
+				 */
+				if (!reachedSafeStartPoint && 
+					 XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) && 
+					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+				{
+					reachedSafeStartPoint = true;
+					if (InArchiveRecovery)
+					{
+						ereport(LOG,
+							(errmsg("consistent recovery state reached at %X/%X",
+								EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+						if (IsUnderPostmaster)
+							SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+					}
+				}
+
 				record = ReadRecord(NULL, LOG);
 			} while (record != NULL && recoveryContinue);
 
@@ -5238,6 +5361,7 @@ StartupXLOG(void)
 			/* there are no WAL records following the checkpoint */
 			ereport(LOG,
 					(errmsg("redo is not required")));
+			reachedSafeStartPoint = true;
 		}
 	}
 
@@ -5251,9 +5375,9 @@ StartupXLOG(void)
 
 	/*
 	 * Complain if we did not roll forward far enough to render the backup
-	 * dump consistent.
+	 * dump consistent and start safely.
 	 */
-	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+	if (InRecovery && !reachedSafeStartPoint)
 	{
 		if (reachedStopPoint)	/* stopped because of stop request */
 			ereport(FATAL,
@@ -5375,39 +5499,14 @@ StartupXLOG(void)
 		XLogCheckInvalidPages();
 
 		/*
-		 * Reset pgstat data, because it may be invalid after recovery.
+		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
+		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
 		 */
-		pgstat_reset_all();
+		exitRecovery();
 
-		/*
-		 * Perform a checkpoint to update all our recovery activity to disk.
-		 *
-		 * Note that we write a shutdown checkpoint rather than an on-line
-		 * one. This is not particularly critical, but since we may be
-		 * assigning a new TLI, using a shutdown checkpoint allows us to have
-		 * the rule that TLI only changes in shutdown checkpoints, which
-		 * allows some extra error checking in xlog_redo.
-		 */
-		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+		performedRecovery = true;
 	}
 
-	/*
-	 * Preallocate additional log files, if wanted.
-	 */
-	PreallocXlogFiles(EndOfLog);
-
-	/*
-	 * Okay, we're officially UP.
-	 */
-	InRecovery = false;
-
-	ControlFile->state = DB_IN_PRODUCTION;
-	ControlFile->time = (pg_time_t) time(NULL);
-	UpdateControlFile();
-
-	/* start the archive_timeout timer running */
-	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
-
 	/* initialize shared-memory copy of latest checkpoint XID/epoch */
 	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
 	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
@@ -5441,6 +5540,89 @@ StartupXLOG(void)
 		readRecordBuf = NULL;
 		readRecordBufSize = 0;
 	}
+
+	/*
+	 * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery.
+	 * This could add minutes to the startup time, so we want bgwriter
+	 * to perform it. This then frees the Startup process to complete so we can
+	 * allow transactions and WAL inserts. We still write a checkpoint, but
+	 * it will be an online checkpoint. Online checkpoints have a redo
+	 * location that can be prior to the actual checkpoint record. So we want
+	 * to derive that redo location *before* we let anybody else write WAL,
+	 * otherwise we might miss some WAL records if we crash.
+	 */
+	if (performedRecovery)
+	{
+		XLogRecPtr	redo;
+
+		/* 
+		 * We must grab the pointer before anybody writes WAL 
+		 */
+		redo = GetRedoLocationForCheckpoint();
+
+		/* 
+		 * Tell the bgwriter
+		 */
+		SetRedoLocationForArchiveCheckpoint(redo);
+
+		/*
+		 * Okay, we can come up now. Allow others to write WAL.
+		 */
+		XLogCtl->SharedRecoveryProcessingMode = false;
+
+		/*
+		 * Now request checkpoint
+		 */
+		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+	}
+	else
+	{
+		/*
+		 * No recovery, so let's just get on with it.
+		 */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->state = DB_IN_PRODUCTION;
+		ControlFile->time = (pg_time_t) time(NULL);
+		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
+
+		/*
+		 * Okay, we're officially UP.
+		 */
+		XLogCtl->SharedRecoveryProcessingMode = false;
+	}
+
+	/* start the archive_timeout timer running */
+	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+
+}
+
+/*
+ * IsRecoveryProcessingMode()
+ *
+ * Fast test for whether we're still in recovery or not. The shared flag is
+ * read (under mode_lck) on every call until it is first seen to be false;
+ * after that we answer from the cached local copy and never look again.
+ * Caching lazily avoids the need for a separate initialisation step.
+ */
+bool
+IsRecoveryProcessingMode(void)
+{
+	if (knownProcessingMode && !LocalRecoveryProcessingMode)
+		return false;
+
+	{
+		/* access shared state only via volatile pointer, inside the lock */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		SpinLockAcquire(&xlogctl->mode_lck);
+		LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+		SpinLockRelease(&xlogctl->mode_lck);
+	}
+
+	knownProcessingMode = true;
+
+	return LocalRecoveryProcessingMode;
 }
 
 /*
@@ -5698,20 +5880,24 @@ ShutdownXLOG(int code, Datum arg)
 static void
 LogCheckpointStart(int flags)
 {
-	elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
-		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
-		 (flags & CHECKPOINT_FORCE) ? " force" : "",
-		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
-		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
-		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
+	if (flags & CHECKPOINT_RESTARTPOINT)
+		elog(LOG, "restartpoint starting:%s",
+			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
+	else
+		elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+			 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+			 (flags & CHECKPOINT_FORCE) ? " force" : "",
+			 (flags & CHECKPOINT_WAIT) ? " wait" : "",
+			 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
+			 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
 }
 
 /*
  * Log end of a checkpoint.
  */
 static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(int flags)
 {
 	long		write_secs,
 				sync_secs,
@@ -5734,17 +5920,26 @@ LogCheckpointEnd(void)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);
 
-	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-		 "%d transaction log file(s) added, %d removed, %d recycled; "
-		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-		 CheckpointStats.ckpt_bufs_written,
-		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-		 CheckpointStats.ckpt_segs_added,
-		 CheckpointStats.ckpt_segs_removed,
-		 CheckpointStats.ckpt_segs_recycled,
-		 write_secs, write_usecs / 1000,
-		 sync_secs, sync_usecs / 1000,
-		 total_secs, total_usecs / 1000);
+	if (flags & CHECKPOINT_RESTARTPOINT)
+		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
+	else
+		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+			 "%d transaction log file(s) added, %d removed, %d recycled; "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 CheckpointStats.ckpt_segs_added,
+			 CheckpointStats.ckpt_segs_removed,
+			 CheckpointStats.ckpt_segs_recycled,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
 }
 
 /*
@@ -5769,17 +5964,16 @@ CreateCheckPoint(int flags)
 	XLogRecPtr	recptr;
 	XLogCtlInsert *Insert = &XLogCtl->Insert;
 	XLogRecData rdata;
-	uint32		freespace;
 	uint32		_logId;
 	uint32		_logSeg;
 	TransactionId *inCommitXids;
 	int			nInCommit;
+	bool		leavingArchiveRecovery = false;
 
 	/*
 	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-	 * (This is just pro forma, since in the present system structure there is
-	 * only one process that is allowed to issue checkpoints at any given
-	 * time.)
+	 * Concurrent checkpoints shouldn't happen, but checkpoints are an important
+	 * aspect of our resilience, so we take no chances.
 	 */
 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
 
@@ -5793,6 +5987,13 @@ CreateCheckPoint(int flags)
 	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
 	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
 
+	/*
+	 * Find out if this is the first checkpoint after archive recovery.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
+	LWLockRelease(ControlFileLock);
+
 	/*
 	 * Use a critical section to force system panic if we have trouble.
 	 */
@@ -5800,9 +6001,11 @@ CreateCheckPoint(int flags)
 
 	if (shutdown)
 	{
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 		ControlFile->state = DB_SHUTDOWNING;
 		ControlFile->time = (pg_time_t) time(NULL);
 		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
 	}
 
 	/*
@@ -5817,90 +6020,68 @@ CreateCheckPoint(int flags)
 	checkPoint.ThisTimeLineID = ThisTimeLineID;
 	checkPoint.time = (pg_time_t) time(NULL);
 
-	/*
-	 * We must hold WALInsertLock while examining insert state to determine
-	 * the checkpoint REDO pointer.
-	 */
-	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+	if (leavingArchiveRecovery)
+		checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
+	else
+	{
+		/*
+		 * We must hold WALInsertLock while examining insert state to determine
+		 * the checkpoint REDO pointer.
+		 */
+		LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 
-	/*
-	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
-	 * any XLOG records since the start of the last checkpoint, skip the
-	 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
-	 * when the system is idle. That wastes log space, and more importantly it
-	 * exposes us to possible loss of both current and previous checkpoint
-	 * records if the machine crashes just as we're writing the update.
-	 * (Perhaps it'd make even more sense to checkpoint only when the previous
-	 * checkpoint record is in a different xlog page?)
-	 *
-	 * We have to make two tests to determine that nothing has happened since
-	 * the start of the last checkpoint: current insertion point must match
-	 * the end of the last checkpoint record, and its redo pointer must point
-	 * to itself.
-	 */
-	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
-	{
-		XLogRecPtr	curInsert;
-
-		INSERT_RECPTR(curInsert, Insert, Insert->curridx);
-		if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
-			curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
-			MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
-			ControlFile->checkPoint.xlogid ==
-			ControlFile->checkPointCopy.redo.xlogid &&
-			ControlFile->checkPoint.xrecoff ==
-			ControlFile->checkPointCopy.redo.xrecoff)
+		/*
+		 * If this isn't a shutdown or forced checkpoint, and we have not inserted
+		 * any XLOG records since the start of the last checkpoint, skip the
+		 * checkpoint.	The idea here is to avoid inserting duplicate checkpoints
+		 * when the system is idle. That wastes log space, and more importantly it
+		 * exposes us to possible loss of both current and previous checkpoint
+		 * records if the machine crashes just as we're writing the update.
+		 * (Perhaps it'd make even more sense to checkpoint only when the previous
+		 * checkpoint record is in a different xlog page?)
+		 *
+		 * We have to make two tests to determine that nothing has happened since
+		 * the start of the last checkpoint: current insertion point must match
+		 * the end of the last checkpoint record, and its redo pointer must point
+		 * to itself.
+		 */
+		if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
 		{
-			LWLockRelease(WALInsertLock);
-			LWLockRelease(CheckpointLock);
-			END_CRIT_SECTION();
-			return;
+			XLogRecPtr	curInsert;
+
+			INSERT_RECPTR(curInsert, Insert, Insert->curridx);
+			if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
+				curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
+				MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
+				ControlFile->checkPoint.xlogid ==
+				ControlFile->checkPointCopy.redo.xlogid &&
+				ControlFile->checkPoint.xrecoff ==
+				ControlFile->checkPointCopy.redo.xrecoff)
+			{
+				LWLockRelease(WALInsertLock);
+				LWLockRelease(CheckpointLock);
+				END_CRIT_SECTION();
+				return;
+			}
 		}
-	}
-
-	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
-	 */
-	freespace = INSERT_FREESPACE(Insert);
-	if (freespace < SizeOfXLogRecord)
-	{
-		(void) AdvanceXLInsertBuffer(false);
-		/* OK to ignore update return flag, since we will do flush anyway */
-		freespace = INSERT_FREESPACE(Insert);
-	}
-	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
 
-	/*
-	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-	 * must be done while holding the insert lock AND the info_lck.
-	 *
-	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
-	 * pointing past where it really needs to point.  This is okay; the only
-	 * consequence is that XLogInsert might back up whole buffers that it
-	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
-	 * XLogInserts that happen while we are dumping buffers must assume that
-	 * their buffer changes are not included in the checkpoint.
-	 */
-	{
-		/* use volatile pointer to prevent code rearrangement */
-		volatile XLogCtlData *xlogctl = XLogCtl;
+		/*
+		 * Compute new REDO record ptr = location of next XLOG record.
+		 *
+		 * NB: this is NOT necessarily where the checkpoint record itself will be,
+		 * since other backends may insert more XLOG records while we're off doing
+		 * the buffer flush work.  Those XLOG records are logically after the
+		 * checkpoint, even though physically before it.  Got that?
+		 */
+		checkPoint.redo = GetRedoLocationForCheckpoint();
 
-		SpinLockAcquire(&xlogctl->info_lck);
-		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
-		SpinLockRelease(&xlogctl->info_lck);
+		/*
+		 * Now we can release WAL insert lock, allowing other xacts to proceed
+		 * while we are flushing disk buffers.
+		 */
+		LWLockRelease(WALInsertLock);
 	}
 
-	/*
-	 * Now we can release WAL insert lock, allowing other xacts to proceed
-	 * while we are flushing disk buffers.
-	 */
-	LWLockRelease(WALInsertLock);
-
 	/*
 	 * If enabled, log checkpoint start.  We postpone this until now so as not
 	 * to log anything if we decided to skip the checkpoint.
@@ -6010,18 +6191,43 @@ CreateCheckPoint(int flags)
 	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
 
 	/*
-	 * Update the control file.
+	 * Update the control file. In 8.4, this routine becomes the primary
+	 * point for recording changes of state in the control file at the
+	 * end of recovery. Postmaster state already shows us being in
+	 * normal running mode, but it is only after this point that a crash
+	 * will no longer force us to perform recovery again.  Note
+	 * that this is executed by bgwriter after the death of Startup process.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
 	if (shutdown)
 		ControlFile->state = DB_SHUTDOWNED;
+	else
+		ControlFile->state = DB_IN_PRODUCTION;
+
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
 	ControlFile->checkPoint = ProcLastRecPtr;
 	ControlFile->checkPointCopy = checkPoint;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
+
 	LWLockRelease(ControlFileLock);
 
+	if (leavingArchiveRecovery)
+	{
+		/*
+		 * Rename the config file out of the way, so that we don't accidentally
+		 * re-enter archive recovery mode in a subsequent crash. Prior to
+		 * 8.4 this step was performed at end of exitArchiveRecovery().
+		 */
+		unlink(RECOVERY_COMMAND_DONE);
+		if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not rename file \"%s\" to \"%s\": %m",
+							RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+	}
+
 	/* Update shared-memory copy of checkpoint XID/epoch */
 	{
 		/* use volatile pointer to prevent code rearrangement */
@@ -6068,12 +6274,11 @@ CreateCheckPoint(int flags)
 	 * in subtrans.c).	During recovery, though, we mustn't do this because
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
-	if (!InRecovery)
-		TruncateSUBTRANS(GetOldestXmin(true, false));
+	TruncateSUBTRANS(GetOldestXmin(true, false));
 
 	/* All real work is done, but log before releasing lock. */
 	if (log_checkpoints)
-		LogCheckpointEnd();
+		LogCheckpointEnd(flags);
 
         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                 NBuffers, CheckpointStats.ckpt_segs_added,
@@ -6083,6 +6288,51 @@ CreateCheckPoint(int flags)
 	LWLockRelease(CheckpointLock);
 }
 
+/*
+ * GetRedoLocationForCheckpoint()
+ *
+ * Returns the new REDO pointer after storing it in the shared RedoRecPtr.
+ * When !IsRecoveryProcessingMode() the caller must hold WALInsertLock.
+ */
+static XLogRecPtr
+GetRedoLocationForCheckpoint()
+{
+	XLogCtlInsert  *Insert = &XLogCtl->Insert;
+	uint32			freespace;
+	XLogRecPtr		redo;
+
+	freespace = INSERT_FREESPACE(Insert);
+	if (freespace < SizeOfXLogRecord)
+	{
+		(void) AdvanceXLInsertBuffer(false);
+		/* OK to ignore update return flag, since we will do flush anyway */
+		freespace = INSERT_FREESPACE(Insert);
+	}
+	INSERT_RECPTR(redo, Insert, Insert->curridx);
+
+	/*
+	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+	 * must be done while holding the insert lock AND the info_lck.
+	 *
+	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+	 * pointing past where it really needs to point.  This is okay; the only
+	 * consequence is that XLogInsert might back up whole buffers that it
+	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
+	 * XLogInserts that happen while we are dumping buffers must assume that
+	 * their buffer changes are not included in the checkpoint.
+	 */
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		SpinLockAcquire(&xlogctl->info_lck);
+		RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+		SpinLockRelease(&xlogctl->info_lck);
+	}
+
+	return redo;
+}
+
 /*
  * Flush all data in shared memory to disk, and fsync
  *
@@ -6147,29 +6397,69 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
 			}
 	}
 
+	RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint);
+}
+
+/*
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ * once we have reachedSafeStartPoint. We use bgwriter's shared memory
+ * area wherever we call it from, to keep better code structure.
+ */
+void
+CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
+{
+	if (recoveryLogRestartpoints)
+	{
+		/*
+		 * Prepare to accumulate statistics.
+		 */
+
+		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+		LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
+	}
+
 	/*
-	 * OK, force data out to disk
+	 * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+	 * We rely on this lock to ensure that the startup process doesn't exit
+	 * Recovery while we are half way through a restartpoint.
 	 */
-	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+	CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
 
 	/*
-	 * Update pg_control so that any subsequent crash will restart from this
-	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
-	 * record itself.
+	 * Update pg_control, using current time
 	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
-	ControlFile->checkPoint = ReadRecPtr;
-	ControlFile->checkPointCopy = *checkPoint;
+	ControlFile->checkPoint = ReadPtr;
+	ControlFile->checkPointCopy = *restartPoint;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
+
+	/*
+	 * Currently, there is no need to truncate pg_subtrans during recovery.
+	 * If we did that, we would need to have called StartupSUBTRANS()
+	 * already, and the TruncateSUBTRANS() call would go here.
+	 */
+
+	/* All real work is done, but log before releasing lock. */
+	if (recoveryLogRestartpoints)
+		LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
 
 	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
 			(errmsg("recovery restart point at %X/%X",
-					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+					restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
+
 	if (recoveryLastXTime)
 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-				(errmsg("last completed transaction was at log time %s",
-						timestamptz_to_str(recoveryLastXTime))));
+			(errmsg("last completed transaction was at log time %s",
+					timestamptz_to_str(recoveryLastXTime))));
+
+	LWLockRelease(CheckpointLock);
 }
 
 /*
@@ -6234,7 +6524,63 @@ RequestXLogSwitch(void)
 }
 
 /*
- * XLOG resource manager's routines
+ * exitRecovery()
+ *
+ * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
+ * only record type that can record a change of timelineID. We assume
+ * caller has already set ThisTimeLineID, if appropriate.
+ */
+static void
+exitRecovery(void)
+{
+	XLogRecData rdata;
+
+	rdata.buffer = InvalidBuffer;
+	rdata.data = (char *) (&ThisTimeLineID);
+	rdata.len = sizeof(TimeLineID);
+	rdata.next = NULL;
+
+	/*
+	 * If a restartpoint is in progress, we will not be able to acquire
+	 * CheckpointLock immediately. In that case send bgwriter a second
+	 * signal to nudge it to go faster so we can avoid delay, then wait
+	 * for the lock, so we know the restartpoint has completed. We do
+	 * this because we don't want to interrupt the restartpoint half way
+	 * through, which might leave us in a mess and we want to be robust. We're
+	 * going to checkpoint soon anyway, so it's not wasted effort.
+	 */
+	if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+		LWLockRelease(CheckpointLock);
+	else
+	{
+		RequestRestartPointCompletion();
+		ereport(LOG,
+			(errmsg("startup process waiting for restartpoint to complete")));
+		LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+		LWLockRelease(CheckpointLock);
+	}	
+
+	/*
+	 * This is the only type of WAL message that can be inserted during
+	 * recovery. This ensures that we don't allow others to get access
+	 * until after we have changed state.
+	 */
+	(void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
+
+	/*
+	 * We don't XLogFlush() here otherwise we'll end up zeroing the WAL
+	 * file ourselves. So just let bgwriter's forthcoming checkpoint do
+	 * that for us.
+	 */
+
+	InRecovery = false;
+}
+
+/*
+ * XLOG resource manager's routines.
+ *
+ * Definitions of message info are in include/catalog/pg_control.h,
+ * though not all messages relate to control file processing.
  */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
@@ -6272,21 +6618,38 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 
 		/*
-		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
+		 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
+		 * shutdown checkpoints only occur at shutdown. Much less confusing.
 		 */
-		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
+
+		RecoveryRestartPoint(&checkPoint);
+	}
+	else if (info == XLOG_RECOVERY_END)
+	{
+		TimeLineID	tli;
+
+		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+
+		/*
+		 * TLI may change when recovery ends, but it shouldn't decrease.
+		 *
+		 * This is the only WAL record that can tell us to change timelineID
+		 * while we process WAL records. 
+		 *
+		 * We can *choose* to stop recovery at any point, generating a
+		 * new timelineID which is recorded using this record type.
+		 */
+		if (tli != ThisTimeLineID)
 		{
-			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
+			if (tli < ThisTimeLineID ||
 				!list_member_int(expectedTLIs,
-								 (int) checkPoint.ThisTimeLineID))
+								 (int) tli))
 				ereport(PANIC,
-						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-								checkPoint.ThisTimeLineID, ThisTimeLineID)));
+						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
+								tli, ThisTimeLineID)));
 			/* Following WAL records should be run with new TLI */
-			ThisTimeLineID = checkPoint.ThisTimeLineID;
+			ThisTimeLineID = tli;
 		}
-
-		RecoveryRestartPoint(&checkPoint);
 	}
 	else if (info == XLOG_CHECKPOINT_ONLINE)
 	{
@@ -6309,7 +6672,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 
-		/* TLI should not change in an on-line checkpoint */
+		/* TLI must not change at a checkpoint */
 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
 			ereport(PANIC,
 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 6a0cd4eebf..3163fd3c1b 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -49,6 +49,7 @@
 #include <unistd.h>
 
 #include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -129,6 +130,13 @@ typedef struct
 
 	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
 
+	/* 
+	 * When the Startup process wants bgwriter to perform a restartpoint, it 
+	 * sets these fields so that we can update the control file afterwards.
+	 */
+	XLogRecPtr	ReadPtr;		/* Requested log pointer */
+	CheckPoint  restartPoint;	/* restartPoint data for ControlFile */
+
 	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
 
 	int			num_requests;	/* current # of requests */
@@ -165,7 +173,7 @@ static bool ckpt_active = false;
 
 /* these values are valid when ckpt_active is true: */
 static pg_time_t ckpt_start_time;
-static XLogRecPtr ckpt_start_recptr;
+static XLogRecPtr ckpt_start_recptr;	/* not used if IsRecoveryProcessingMode */
 static double ckpt_cached_elapsed;
 
 static pg_time_t last_checkpoint_time;
@@ -197,6 +205,7 @@ BackgroundWriterMain(void)
 {
 	sigjmp_buf	local_sigjmp_buf;
 	MemoryContext bgwriter_context;
+	bool		BgWriterRecoveryMode;
 
 	BgWriterShmem->bgwriter_pid = MyProcPid;
 	am_bg_writer = true;
@@ -355,16 +364,17 @@ BackgroundWriterMain(void)
 	 */
 	PG_SETMASK(&UnBlockSig);
 
+	BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+	if (BgWriterRecoveryMode)
+		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+			BgWriterShmem->bgwriter_pid);
+
 	/*
 	 * Loop forever
 	 */
 	for (;;)
 	{
-		bool		do_checkpoint = false;
-		int			flags = 0;
-		pg_time_t	now;
-		int			elapsed_secs;
-
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
@@ -382,118 +392,204 @@ BackgroundWriterMain(void)
 			got_SIGHUP = false;
 			ProcessConfigFile(PGC_SIGHUP);
 		}
-		if (checkpoint_requested)
-		{
-			checkpoint_requested = false;
-			do_checkpoint = true;
-			BgWriterStats.m_requested_checkpoints++;
-		}
-		if (shutdown_requested)
-		{
-			/*
-			 * From here on, elog(ERROR) should end with exit(1), not send
-			 * control back to the sigsetjmp block above
-			 */
-			ExitOnAnyError = true;
-			/* Close down the database */
-			ShutdownXLOG(0, 0);
-			/* Normal exit from the bgwriter is here */
-			proc_exit(0);		/* done */
-		}
 
-		/*
-		 * Force a checkpoint if too much time has elapsed since the last one.
-		 * Note that we count a timed checkpoint in stats only when this
-		 * occurs without an external request, but we set the CAUSE_TIME flag
-		 * bit even if there is also an external request.
-		 */
-		now = (pg_time_t) time(NULL);
-		elapsed_secs = now - last_checkpoint_time;
-		if (elapsed_secs >= CheckPointTimeout)
-		{
-			if (!do_checkpoint)
-				BgWriterStats.m_timed_checkpoints++;
-			do_checkpoint = true;
-			flags |= CHECKPOINT_CAUSE_TIME;
-		}
-
-		/*
-		 * Do a checkpoint if requested, otherwise do one cycle of
-		 * dirty-buffer writing.
-		 */
-		if (do_checkpoint)
-		{
-			/* use volatile pointer to prevent code rearrangement */
-			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-
-			/*
-			 * Atomically fetch the request flags to figure out what kind of a
-			 * checkpoint we should perform, and increase the started-counter
-			 * to acknowledge that we've started a new checkpoint.
-			 */
-			SpinLockAcquire(&bgs->ckpt_lck);
-			flags |= bgs->ckpt_flags;
-			bgs->ckpt_flags = 0;
-			bgs->ckpt_started++;
-			SpinLockRelease(&bgs->ckpt_lck);
-
-			/*
-			 * We will warn if (a) too soon since last checkpoint (whatever
-			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
-			 * since the last checkpoint start.  Note in particular that this
-			 * implementation will not generate warnings caused by
-			 * CheckPointTimeout < CheckPointWarning.
-			 */
-			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
-				elapsed_secs < CheckPointWarning)
-				ereport(LOG,
-						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
-								elapsed_secs),
-						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
-
-			/*
-			 * Initialize bgwriter-private variables used during checkpoint.
-			 */
-			ckpt_active = true;
-			ckpt_start_recptr = GetInsertRecPtr();
-			ckpt_start_time = now;
-			ckpt_cached_elapsed = 0;
-
-			/*
-			 * Do the checkpoint.
-			 */
-			CreateCheckPoint(flags);
+ 		if (BgWriterRecoveryMode)
+  		{
+ 			if (shutdown_requested)
+ 			{
+ 				/*
+ 				 * From here on, elog(ERROR) should end with exit(1), not send
+ 				 * control back to the sigsetjmp block above
+ 				 */
+ 				ExitOnAnyError = true;
+ 				/* Normal exit from the bgwriter is here */
+ 				proc_exit(0);		/* done */
+ 			}
+ 
+ 			if (!IsRecoveryProcessingMode())
+ 			{
+ 				elog(DEBUG2, "bgwriter changing from recovery to normal mode");
+ 
+ 				InitXLOGAccess();
+ 				BgWriterRecoveryMode = false;
+ 
+ 				/*
+ 				 * Start time-driven events from now
+ 				 */
+ 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
+ 
+ 				/* 
+ 				 * Notice that we do *not* act on a checkpoint_requested
+ 				 * state at this point. We have changed mode, so we wish to
+ 				 * perform a checkpoint not a restartpoint.
+ 				 */
+ 				continue;
+ 			}
+ 
+ 			if (checkpoint_requested)
+ 			{
+ 				XLogRecPtr		ReadPtr;
+ 				CheckPoint		restartPoint;
+ 
+ 				checkpoint_requested = false;
+ 
+ 				/*
+ 				 * Initialize bgwriter-private variables used during checkpoint.
+ 				 */
+ 				ckpt_active = true;
+ 				ckpt_start_time = (pg_time_t) time(NULL);
+ 				ckpt_cached_elapsed = 0;
+ 
+ 				/*
+ 				 * Get the requested values from shared memory that the 
+ 				 * Startup process has put there for us.
+ 				 */
+ 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 				ReadPtr = BgWriterShmem->ReadPtr;
+ 				memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
+ 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 
+ 				/* Use smoothed writes, until interrupted if ever */
+ 				CreateRestartPoint(ReadPtr, &restartPoint, 0);
+ 
+ 				/*
+ 				 * After any checkpoint, close all smgr files.	This is so we
+ 				 * won't hang onto smgr references to deleted files indefinitely.
+ 				 */
+ 				smgrcloseall();
+ 
+ 				ckpt_active = false;
+ 				checkpoint_requested = false;
+ 			}
+ 			else
+ 			{
+ 				/* Clean buffers dirtied by recovery */
+ 				BgBufferSync();
+ 
+ 				/* Nap for the configured time. */
+ 				BgWriterNap();
+ 			}
+  		}
+		else	/* Normal processing */
+  		{
+			bool		do_checkpoint = false;
+			int			flags = 0;
+			pg_time_t	now;
+			int			elapsed_secs;
+
+			if (checkpoint_requested)
+			{
+				checkpoint_requested = false;
+				do_checkpoint = true;
+				BgWriterStats.m_requested_checkpoints++;
+			}
+			if (shutdown_requested)
+			{
+				/*
+				 * From here on, elog(ERROR) should end with exit(1), not send
+				 * control back to the sigsetjmp block above
+				 */
+				ExitOnAnyError = true;
+				/* Close down the database */
+				ShutdownXLOG(0, 0);
+				/* Normal exit from the bgwriter is here */
+				proc_exit(0);		/* done */
+			}
 
 			/*
-			 * After any checkpoint, close all smgr files.	This is so we
-			 * won't hang onto smgr references to deleted files indefinitely.
+			 * Force a checkpoint if too much time has elapsed since the last one.
+			 * Note that we count a timed checkpoint in stats only when this
+			 * occurs without an external request, but we set the CAUSE_TIME flag
+			 * bit even if there is also an external request.
 			 */
-			smgrcloseall();
+			now = (pg_time_t) time(NULL);
+			elapsed_secs = now - last_checkpoint_time;
+			if (elapsed_secs >= CheckPointTimeout)
+			{
+				if (!do_checkpoint)
+					BgWriterStats.m_timed_checkpoints++;
+				do_checkpoint = true;
+				flags |= CHECKPOINT_CAUSE_TIME;
+			}
 
 			/*
-			 * Indicate checkpoint completion to any waiting backends.
+			 * Do a checkpoint if requested, otherwise do one cycle of
+			 * dirty-buffer writing.
 			 */
-			SpinLockAcquire(&bgs->ckpt_lck);
-			bgs->ckpt_done = bgs->ckpt_started;
-			SpinLockRelease(&bgs->ckpt_lck);
+			if (do_checkpoint)
+			{
+				/* use volatile pointer to prevent code rearrangement */
+				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+				/*
+				 * Atomically fetch the request flags to figure out what kind of a
+				 * checkpoint we should perform, and increase the started-counter
+				 * to acknowledge that we've started a new checkpoint.
+				 */
+				SpinLockAcquire(&bgs->ckpt_lck);
+				flags |= bgs->ckpt_flags;
+				bgs->ckpt_flags = 0;
+				bgs->ckpt_started++;
+				SpinLockRelease(&bgs->ckpt_lck);
+
+				/*
+				 * We will warn if (a) too soon since last checkpoint (whatever
+				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
+				 * since the last checkpoint start.  Note in particular that this
+				 * implementation will not generate warnings caused by
+				 * CheckPointTimeout < CheckPointWarning.
+				 */
+				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+					elapsed_secs < CheckPointWarning)
+					ereport(LOG,
+							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
+									elapsed_secs),
+							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
+
+				/*
+				 * Initialize bgwriter-private variables used during checkpoint.
+				 */
+				ckpt_active = true;
+				ckpt_start_recptr = GetInsertRecPtr();
+				ckpt_start_time = now;
+				ckpt_cached_elapsed = 0;
+
+				/*
+				 * Do the checkpoint.
+				 */
+				CreateCheckPoint(flags);
+
+				/*
+				 * After any checkpoint, close all smgr files.	This is so we
+				 * won't hang onto smgr references to deleted files indefinitely.
+				 */
+				smgrcloseall();
+
+				/*
+				 * Indicate checkpoint completion to any waiting backends.
+				 */
+				SpinLockAcquire(&bgs->ckpt_lck);
+				bgs->ckpt_done = bgs->ckpt_started;
+				SpinLockRelease(&bgs->ckpt_lck);
+
+				ckpt_active = false;
+
+				/*
+				 * Note we record the checkpoint start time not end time as
+				 * last_checkpoint_time.  This is so that time-driven checkpoints
+				 * happen at a predictable spacing.
+				 */
+				last_checkpoint_time = now;
+			}
+			else
+				BgBufferSync();
 
-			ckpt_active = false;
+			/* Check for archive_timeout and switch xlog files if necessary. */
+			CheckArchiveTimeout();
 
-			/*
-			 * Note we record the checkpoint start time not end time as
-			 * last_checkpoint_time.  This is so that time-driven checkpoints
-			 * happen at a predictable spacing.
-			 */
-			last_checkpoint_time = now;
+			/* Nap for the configured time. */
+			BgWriterNap();
 		}
-		else
-			BgBufferSync();
-
-		/* Check for archive_timeout and switch xlog files if necessary. */
-		CheckArchiveTimeout();
-
-		/* Nap for the configured time. */
-		BgWriterNap();
 	}
 }
 
@@ -586,7 +682,8 @@ BgWriterNap(void)
 		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
 			break;
 		pg_usleep(1000000L);
-		AbsorbFsyncRequests();
+		if (!IsRecoveryProcessingMode())
+			AbsorbFsyncRequests();
 		udelay -= 1000000L;
 	}
 
@@ -640,6 +737,19 @@ CheckpointWriteDelay(int flags, double progress)
 	if (!am_bg_writer)
 		return;
 
+	/* Perform minimal duties during recovery and skip wait if requested */
+	if (IsRecoveryProcessingMode())
+	{
+		BgBufferSync();
+
+		if (!shutdown_requested &&
+			!checkpoint_requested &&
+			IsCheckpointOnSchedule(progress))
+			BgWriterNap();
+
+		return;
+	}
+
 	/*
 	 * Perform the usual bgwriter duties and take a nap, unless we're behind
 	 * schedule, in which case we just try to catch up as quickly as possible.
@@ -714,16 +824,19 @@ IsCheckpointOnSchedule(double progress)
 	 * However, it's good enough for our purposes, we're only calculating an
 	 * estimate anyway.
 	 */
-	recptr = GetInsertRecPtr();
-	elapsed_xlogs =
-		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
-		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
-		CheckPointSegments;
-
-	if (progress < elapsed_xlogs)
+	if (!IsRecoveryProcessingMode())
 	{
-		ckpt_cached_elapsed = elapsed_xlogs;
-		return false;
+		recptr = GetInsertRecPtr();
+		elapsed_xlogs =
+			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+			CheckPointSegments;
+
+		if (progress < elapsed_xlogs)
+		{
+			ckpt_cached_elapsed = elapsed_xlogs;
+			return false;
+		}
 	}
 
 	/*
@@ -988,6 +1101,77 @@ RequestCheckpoint(int flags)
 	}
 }
 
+/*
+ * RequestRestartPoint
+ *		Ask for a restartpoint; always runs in Startup process (see xlog.c).
+ *
+ * Outside a postmaster environment, or when sendToBGWriter is false, the
+ * restartpoint is performed right here with CHECKPOINT_IMMEDIATE.  Otherwise
+ * ReadPtr and *restartPoint are published in BgWriterShmem under ckpt_lck
+ * and bgwriter is signalled with SIGINT.
+ */
+void
+RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+{
+	pid_t		bgwriter_pid;
+
+	/* Should we just do it ourselves? */
+	if (!IsPostmasterEnvironment || !sendToBGWriter)
+	{
+		CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE);
+		return;
+	}
+
+	/* Push requested values into shared memory for bgwriter to pick up. */
+	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+	BgWriterShmem->ReadPtr = ReadPtr;
+	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+	/*
+	 * Signal bgwriter, but never call kill() with pid 0: that would deliver
+	 * SIGINT to every process in our process group, not just bgwriter.
+	 */
+	bgwriter_pid = BgWriterShmem->bgwriter_pid;
+	if (bgwriter_pid == 0)
+		elog(LOG, "could not request restartpoint because bgwriter not running");
+	else if (kill(bgwriter_pid, SIGINT) != 0)
+		elog(LOG, "could not signal for restartpoint: %m");
+}
+
+/*
+ * Used at the end of recovery: signal bgwriter once more so that it abandons
+ * smoothed writes and proceeds as if called with CHECKPOINT_IMMEDIATE.
+ */
+void
+RequestRestartPointCompletion(void)
+{
+	if (BgWriterShmem->bgwriter_pid == 0)
+		return;
+	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+		elog(LOG, "could not signal for restartpoint immediate: %m");
+}
+
+/* Returns the ReadPtr currently held in bgwriter shared memory. */
+XLogRecPtr
+GetRedoLocationForArchiveCheckpoint(void)
+{
+	XLogRecPtr	result;
+
+	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+	result = BgWriterShmem->ReadPtr;
+	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+	return result;
+}
+
+void
+SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
+{
+	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+	BgWriterShmem->ReadPtr = redo;	/* also set by RequestRestartPoint() */
+	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+}
+
 /*
  * ForwardFsyncRequest
  *		Forward a file-fsync request from a backend to the bgwriter
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 3380b806f6..5cb84be4b8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -254,6 +254,11 @@ typedef enum
 {
 	PM_INIT,					/* postmaster starting */
 	PM_STARTUP,					/* waiting for startup subprocess */
+	PM_RECOVERY,				/* consistent recovery mode; state only
+								 * entered for archive and streaming recovery,
+								 * and only after the point where
+								 * all data is in a consistent state.
+								 */
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
@@ -1302,7 +1307,7 @@ ServerLoop(void)
 		 * state that prevents it, start one.  It doesn't matter if this
 		 * fails, we'll just try again later.
 		 */
-		if (BgWriterPID == 0 && pmState == PM_RUN)
+		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
 			BgWriterPID = StartBackgroundWriter();
 
 		/*
@@ -2116,7 +2121,7 @@ reaper(SIGNAL_ARGS)
 		if (pid == StartupPID)
 		{
 			StartupPID = 0;
-			Assert(pmState == PM_STARTUP);
+			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
 
 			/* FATAL exit of startup is treated as catastrophic */
 			if (!EXIT_STATUS_0(exitstatus))
@@ -2157,11 +2162,11 @@ reaper(SIGNAL_ARGS)
 			load_role();
 
 			/*
-			 * Crank up the background writer.	It doesn't matter if this
-			 * fails, we'll just try again later.
+			 * Check whether we need to start background writer, if not
+			 * already running.
 			 */
-			Assert(BgWriterPID == 0);
-			BgWriterPID = StartBackgroundWriter();
+			if (BgWriterPID == 0)
+				BgWriterPID = StartBackgroundWriter();
 
 			/*
 			 * Likewise, start other special children as needed.  In a restart
@@ -3847,6 +3852,51 @@ sigusr1_handler(SIGNAL_ARGS)
 
 	PG_SETMASK(&BlockSig);
 
+	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+	{
+		Assert(pmState == PM_STARTUP);
+
+		/*
+		 * Go to shutdown mode if a shutdown request was pending.
+		 */
+		if (Shutdown > NoShutdown)
+		{
+			pmState = PM_WAIT_BACKENDS;
+			/* PostmasterStateMachine logic does the rest */
+		}
+		else
+		{
+			/*
+			 * Startup process has entered recovery
+			 */
+			pmState = PM_RECOVERY;
+
+			/*
+			 * Load the flat authorization file into postmaster's cache. The
+			 * startup process won't have recomputed this from the database yet,
+			 * so it may change following recovery.
+			 */
+			load_role();
+
+			/*
+			 * Crank up the background writer.	It doesn't matter if this
+			 * fails, we'll just try again later.
+			 */
+			Assert(BgWriterPID == 0);
+			BgWriterPID = StartBackgroundWriter();
+
+			/*
+			 * Likewise, start other special children as needed.
+			 */
+			Assert(PgStatPID == 0);
+			PgStatPID = pgstat_start();
+
+			/* XXX at this point we could accept read-only connections */
+			ereport(DEBUG1,
+				 (errmsg("database system is in consistent recovery mode")));
+		}
+	}
+
 	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
 	{
 		/*
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 62b22bd1db..a7b81e37a7 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -268,3 +268,12 @@ out (and anyone else who flushes buffer contents to disk must do so too).
 This ensures that the page image transferred to disk is reasonably consistent.
 We might miss a hint-bit update or two but that isn't a problem, for the same
 reasons mentioned under buffer access rules.
+
+As of 8.4, background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs an
+identical service to normal processing, except that checkpoints it
+writes are technically restartpoints. Flushing outstanding WAL for dirty
+buffers is also skipped, though there shouldn't ever be new WAL entries
+at that time in any case. We could choose to start background writer
+immediately but we hold off until we can prove the database is in a 
+consistent state so that postmaster has a single, clean state change.
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 4ea849d7f1..3bba50ab83 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -197,6 +197,9 @@ main(int argc, char *argv[])
 	printf(_("Minimum recovery ending location:     %X/%X\n"),
 		   ControlFile.minRecoveryPoint.xlogid,
 		   ControlFile.minRecoveryPoint.xrecoff);
+	printf(_("Minimum safe starting location:       %X/%X\n"),
+		   ControlFile.minSafeStartPoint.xlogid,
+		   ControlFile.minSafeStartPoint.xrecoff);
 	printf(_("Maximum data alignment:               %u\n"),
 		   ControlFile.maxAlign);
 	/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 51cdde1145..b20d4bd4dd 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -603,6 +603,8 @@ RewriteControlFile(void)
 	ControlFile.prevCheckPoint.xrecoff = 0;
 	ControlFile.minRecoveryPoint.xlogid = 0;
 	ControlFile.minRecoveryPoint.xrecoff = 0;
+	ControlFile.minSafeStartPoint.xlogid = 0;
+	ControlFile.minSafeStartPoint.xrecoff = 0;
 
 	/* Now we can force the recorded xlog seg size to the right thing. */
 	ControlFile.xlog_seg_size = XLogSegSize;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6913f7c800..cf787c8df6 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -133,7 +133,16 @@ typedef struct XLogRecData
 } XLogRecData;
 
 extern TimeLineID ThisTimeLineID;		/* current TLI */
-extern bool InRecovery;
+
+/* 
+ * Prior to 8.4, all activity during recovery was carried out by the Startup
+ * process. This local variable continues to be used in many parts of the
+ * code to indicate actions taken by RecoveryManagers. Other processes that
+ * potentially perform work during recovery should check
+ * IsRecoveryProcessingMode(); see XLogCtl notes in xlog.c.
+ */
+extern bool InRecovery;	
+										
 extern XLogRecPtr XactLastRecEnd;
 
 /* these variables are GUC parameters related to XLOG */
@@ -166,6 +175,7 @@ extern bool XLOG_DEBUG;
 /* These indicate the cause of a checkpoint request */
 #define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
 #define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+#define CHECKPOINT_RESTARTPOINT	0x0040	/* Restartpoint during recovery */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
@@ -199,6 +209,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
 
+extern bool IsRecoveryProcessingMode(void);
+
 extern void UpdateControlFile(void);
 extern Size XLOGShmemSize(void);
 extern void XLOGShmemInit(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 5675bfbcbd..4830a5ce74 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -17,6 +17,7 @@
 #define XLOG_INTERNAL_H
 
 #include "access/xlog.h"
+#include "catalog/pg_control.h"
 #include "fmgr.h"
 #include "pgtime.h"
 #include "storage/block.h"
@@ -245,6 +246,9 @@ extern const RmgrData RmgrTable[];
 extern pg_time_t GetLastSegSwitchTime(void);
 extern XLogRecPtr RequestXLogSwitch(void);
 
+extern void CreateRestartPoint(const XLogRecPtr ReadPtr, 
+				const CheckPoint *restartPoint, int flags);
+
 /*
  * These aren't in xlog.h because I'd rather not include fmgr.h there.
  */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 400f32c749..e69c8ec553 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	843
+#define PG_CONTROL_VERSION	847
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -46,7 +46,7 @@ typedef struct CheckPoint
 #define XLOG_NOOP						0x20
 #define XLOG_NEXTOID					0x30
 #define XLOG_SWITCH						0x40
-
+#define XLOG_RECOVERY_END			0x50
 
 /* System status indicator */
 typedef enum DBState
@@ -102,6 +102,7 @@ typedef struct ControlFileData
 	CheckPoint	checkPointCopy; /* copy of last check point record */
 
 	XLogRecPtr	minRecoveryPoint;		/* must replay xlog to here */
+	XLogRecPtr	minSafeStartPoint;		/* safe point after recovery crashes */
 
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index c1c9d7f580..d4b389e927 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -12,6 +12,7 @@
 #ifndef _BGWRITER_H
 #define _BGWRITER_H
 
+#include "catalog/pg_control.h"
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
@@ -25,6 +26,11 @@ extern double CheckPointCompletionTarget;
 extern void BackgroundWriterMain(void);
 
 extern void RequestCheckpoint(int flags);
+extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
+extern void RequestRestartPointCompletion(void);
+extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void);
+extern void SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
+
 extern void CheckpointWriteDelay(int flags, double progress);
 
 extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 3101092cbd..1904187bfd 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -22,6 +22,7 @@
  */
 typedef enum
 {
+	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
 	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
 	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
 	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */
-- 
2.39.5