author    Heikki Linnakangas  2009-01-23 12:34:24 +0000
committer Heikki Linnakangas  2009-01-23 12:34:24 +0000
commit    25d33629c3019fe96988610211b45d195f72c70a (patch)
tree      42821ba7b65ae97b389aba804164bd75cd274718
parent    f4fd26775b8b6e7090774bb3d794b8529771fce5 (diff)

    Import Simon's hot standby patch v9d.  (hotstandbyv9d)
-rw-r--r--  doc/src/sgml/config.sgml | 32
-rw-r--r--  doc/src/sgml/func.sgml | 187
-rw-r--r--  src/backend/access/gin/ginxlog.c | 4
-rw-r--r--  src/backend/access/gist/gistxlog.c | 4
-rw-r--r--  src/backend/access/heap/heapam.c | 160
-rw-r--r--  src/backend/access/heap/pruneheap.c | 18
-rw-r--r--  src/backend/access/index/genam.c | 13
-rw-r--r--  src/backend/access/index/indexam.c | 7
-rw-r--r--  src/backend/access/nbtree/README | 21
-rw-r--r--  src/backend/access/nbtree/nbtinsert.c | 2
-rw-r--r--  src/backend/access/nbtree/nbtpage.c | 40
-rw-r--r--  src/backend/access/nbtree/nbtree.c | 18
-rw-r--r--  src/backend/access/nbtree/nbtxlog.c | 146
-rw-r--r--  src/backend/access/transam/README | 39
-rw-r--r--  src/backend/access/transam/clog.c | 4
-rw-r--r--  src/backend/access/transam/multixact.c | 4
-rw-r--r--  src/backend/access/transam/rmgr.c | 3
-rw-r--r--  src/backend/access/transam/slru.c | 19
-rw-r--r--  src/backend/access/transam/subtrans.c | 85
-rw-r--r--  src/backend/access/transam/transam.c | 3
-rw-r--r--  src/backend/access/transam/twophase.c | 5
-rw-r--r--  src/backend/access/transam/varsup.c | 10
-rw-r--r--  src/backend/access/transam/xact.c | 740
-rw-r--r--  src/backend/access/transam/xlog.c | 1097
-rw-r--r--  src/backend/catalog/storage.c | 3
-rw-r--r--  src/backend/commands/dbcommands.c | 32
-rw-r--r--  src/backend/commands/discard.c | 3
-rw-r--r--  src/backend/commands/indexcmds.c | 2
-rw-r--r--  src/backend/commands/lockcmds.c | 10
-rw-r--r--  src/backend/commands/sequence.c | 7
-rw-r--r--  src/backend/commands/tablespace.c | 75
-rw-r--r--  src/backend/commands/user.c | 1
-rw-r--r--  src/backend/commands/vacuum.c | 29
-rw-r--r--  src/backend/commands/vacuumlazy.c | 42
-rw-r--r--  src/backend/postmaster/bgwriter.c | 198
-rw-r--r--  src/backend/postmaster/postmaster.c | 52
-rw-r--r--  src/backend/storage/buffer/README | 15
-rw-r--r--  src/backend/storage/buffer/bufmgr.c | 100
-rw-r--r--  src/backend/storage/ipc/procarray.c | 1093
-rw-r--r--  src/backend/storage/ipc/sinvaladt.c | 10
-rw-r--r--  src/backend/storage/lmgr/lock.c | 59
-rw-r--r--  src/backend/storage/lmgr/lwlock.c | 15
-rw-r--r--  src/backend/storage/lmgr/proc.c | 222
-rw-r--r--  src/backend/tcop/postgres.c | 33
-rw-r--r--  src/backend/tcop/utility.c | 32
-rw-r--r--  src/backend/utils/adt/txid.c | 6
-rw-r--r--  src/backend/utils/cache/inval.c | 642
-rw-r--r--  src/backend/utils/error/elog.c | 17
-rw-r--r--  src/backend/utils/init/flatfiles.c | 37
-rw-r--r--  src/backend/utils/init/postinit.c | 10
-rw-r--r--  src/backend/utils/misc/guc.c | 34
-rw-r--r--  src/backend/utils/time/snapmgr.c | 5
-rw-r--r--  src/backend/utils/time/tqual.c | 62
-rw-r--r--  src/include/access/heapam.h | 4
-rw-r--r--  src/include/access/htup.h | 18
-rw-r--r--  src/include/access/nbtree.h | 51
-rw-r--r--  src/include/access/relscan.h | 1
-rw-r--r--  src/include/access/rmgr.h | 1
-rw-r--r--  src/include/access/subtrans.h | 3
-rw-r--r--  src/include/access/transam.h | 3
-rw-r--r--  src/include/access/xact.h | 62
-rw-r--r--  src/include/access/xlog.h | 16
-rw-r--r--  src/include/access/xlog_internal.h | 14
-rw-r--r--  src/include/catalog/pg_control.h | 12
-rw-r--r--  src/include/catalog/pg_proc.h | 25
-rw-r--r--  src/include/miscadmin.h | 6
-rw-r--r--  src/include/postmaster/bgwriter.h | 2
-rw-r--r--  src/include/storage/bufmgr.h | 9
-rw-r--r--  src/include/storage/lwlock.h | 2
-rw-r--r--  src/include/storage/proc.h | 27
-rw-r--r--  src/include/storage/procarray.h | 36
-rw-r--r--  src/include/storage/sinval.h | 38
-rw-r--r--  src/include/storage/sinvaladt.h | 2
-rw-r--r--  src/include/utils/flatfiles.h | 7
-rw-r--r--  src/include/utils/inval.h | 6
-rw-r--r--  src/include/utils/snapshot.h | 79
-rw-r--r--  src/test/regress/parallel_schedule | 4
-rw-r--r--  src/test/regress/serial_schedule | 2
78 files changed, 5369 insertions(+), 568 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index ec9a46d778..63d93022bb 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -370,6 +370,12 @@ SET ENABLE_SEQSCAN TO OFF;
allows. See <xref linkend="sysvipc"> for information on how to
adjust those parameters, if necessary.
</para>
+
+ <para>
+ When running a standby server, it is strongly recommended that you
+ set this parameter to the same value as, or higher than, on the master
+ server; otherwise, queries on the standby server may fail.
+ </para>
</listitem>
</varlistentry>
@@ -5383,6 +5389,32 @@ plruby.use_strict = true # generates error: unknown class name
</listitem>
</varlistentry>
+ <varlistentry id="guc-trace-recovery-messages" xreflabel="trace_recovery_messages">
+ <term><varname>trace_recovery_messages</varname> (<type>string</type>)</term>
+ <indexterm>
+ <primary><varname>trace_recovery_messages</> configuration parameter</primary>
+ </indexterm>
+ <listitem>
+ <para>
+ Controls which message levels are written to the server log
+ for system modules needed for recovery processing. This allows
+ the user to override the normal setting of log_min_messages,
+ but only for specific messages. This is intended for use in
+ debugging Hot Standby.
+ Valid values are <literal>DEBUG5</>, <literal>DEBUG4</>,
+ <literal>DEBUG3</>, <literal>DEBUG2</>, <literal>DEBUG1</>,
+ <literal>INFO</>, <literal>NOTICE</>, <literal>WARNING</>,
+ <literal>ERROR</>, <literal>LOG</>, <literal>FATAL</>, and
+ <literal>PANIC</>. Each level includes all the levels that
+ follow it. The later the level, the fewer messages are sent
+ to the log. The default is <literal>WARNING</>. Note that
+ <literal>LOG</> has a different rank here than in
+ <varname>client_min_messages</>.
+ This parameter can only be set in the postgresql.conf file.
+ </para>
+ </listitem>
+ </varlistentry>
+
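
The elog.c side of this change (17 lines in the diffstat above) is not shown in this excerpt. As a hedged sketch of how such a GUC is typically consulted, a trace_recovery() helper can promote recovery-related debug messages to LOG when trace_recovery_messages asks for them, and otherwise leave the level untouched so normal log_min_messages filtering applies:

    /*
     * Sketch only, assuming a trace_recovery_messages variable holding the
     * configured level: return LOG in place of a debug level when the GUC
     * requests tracing at that level, else return the level unchanged.
     */
    int
    trace_recovery(int trace_level)
    {
        if (trace_level < LOG &&
            trace_level >= trace_recovery_messages)
            return LOG;

        return trace_level;
    }

Callers can then write elog(trace_recovery(DEBUG2), ...), the pattern used later in this patch.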
<varlistentry id="guc-zero-damaged-pages" xreflabel="zero_damaged_pages">
<term><varname>zero_damaged_pages</varname> (<type>boolean</type>)</term>
<indexterm>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 1900d6a5fc..c835ddc069 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -12894,6 +12894,193 @@ postgres=# select * from pg_xlogfile_name_offset(pg_stop_backup());
<xref linkend="continuous-archiving">.
</para>
+ <indexterm>
+ <primary>pg_is_in_recovery</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_last_recovered_xact_timestamp</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_last_recovered_xid</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_last_recovered_xlog_location</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_recovery_pause</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_recovery_continue</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_recovery_pause_xid</primary>
+ </indexterm>
+ <indexterm>
+ <primary>pg_recovery_pause_time</primary>
+ </indexterm>
+  <indexterm>
+   <primary>pg_recovery_stop</primary>
+  </indexterm>
+  <indexterm>
+   <primary>pg_recovery_advance</primary>
+  </indexterm>
+
+ <para>
+ The functions shown in <xref
+ linkend="functions-admin-recovery-table"> assist in archive recovery.
+ Except for the first three functions, these are restricted to superusers.
+ All of these functions can only be executed during recovery.
+ </para>
+
+ <table id="functions-admin-recovery-table">
+ <title>Recovery Control Functions</title>
+ <tgroup cols="3">
+ <thead>
+ <row><entry>Name</entry> <entry>Return Type</entry> <entry>Description</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>
+ <literal><function>pg_is_in_recovery</function>()</literal>
+ </entry>
+ <entry><type>bool</type></entry>
+ <entry>True if recovery is still in progress.</entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_last_recovered_xact_timestamp</function>()</literal>
+ </entry>
+ <entry><type>timestamp with time zone</type></entry>
+      <entry>Returns the original completion timestamp, with time zone, of the
+      last recovered transaction. If recovery is still in progress this
+      will increase monotonically; if recovery is complete then this
+      value will remain static at the value of the last transaction applied
+      during that recovery. When the server has been started normally this
+      will return a default value.
+ </entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_last_recovered_xid</function>()</literal>
+ </entry>
+ <entry><type>integer</type></entry>
+      <entry>Returns the transaction id (32-bit) of the last completed
+      transaction in the current recovery. Later-numbered transaction ids may
+      already have completed, so the value could in some cases be lower than
+      the last time this function executed. If recovery is complete then this
+      value will remain static at the value of the last transaction applied
+      during that recovery. When the server has been started normally this
+      will return InvalidXid (zero).
+ </entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_last_recovered_xlog_location</function>()</literal>
+ </entry>
+ <entry><type>text</type></entry>
+      <entry>Returns the transaction log location of the last WAL record
+      in the current recovery. If recovery is still in progress this
+      will increase monotonically. If recovery is complete then this value will
+      remain static at the value of the last WAL record applied during that
+      recovery. When the server has been started normally this will return
+      InvalidXLogRecPtr (0/0).
+ </entry>
+ </row>
+
+ <row>
+ <entry>
+ <literal><function>pg_recovery_pause</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+ <entry>Pause recovery processing, unconditionally.</entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_recovery_continue</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+ <entry>If recovery is paused, continue processing.</entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_recovery_stop</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+ <entry>End recovery and begin normal processing.</entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_recovery_pause_xid</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+      <entry>Continue recovery until the specified xid completes (if it is
+      ever seen), then pause recovery.
+ </entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_recovery_pause_time</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+      <entry>Continue recovery until a transaction with the specified
+      timestamp completes (if one is ever seen), then pause recovery.
+ </entry>
+ </row>
+ <row>
+ <entry>
+ <literal><function>pg_recovery_advance</function>()</literal>
+ </entry>
+ <entry><type>void</type></entry>
+      <entry>Advance recovery by the specified number of records, then pause.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ <function>pg_recovery_pause</> and <function>pg_recovery_continue</> allow
+ a superuser to control the progress of recovery on the database server.
+    While recovery is paused, queries can be executed to determine how far
+    forward recovery should progress. Recovery can never go backwards,
+    because previous values are overwritten. If the superuser wishes recovery
+ to complete and normal processing mode to start, execute
+ <function>pg_recovery_stop</>.
+ </para>
+
+ <para>
+ Variations of the pause function exist, mainly to allow PITR to dynamically
+ control where it should progress to. <function>pg_recovery_pause_xid</> and
+ <function>pg_recovery_pause_time</> allow the specification of a trial
+ recovery target, similarly to <xref linkend="recovery-config-settings">.
+ Recovery will then progress to the specified point and then pause, rather
+ than stopping permanently, allowing assessment of whether this is the
+ desired stopping point for recovery.
+ </para>
+
+ <para>
+ <function>pg_recovery_advance</> allows recovery to progress record by
+    record, for very careful analysis or debugging. The step size can be one or
+    more records. If recovery is not yet paused, <function>pg_recovery_advance</>
+    will process the specified number of records and then pause. If recovery
+ is already paused, recovery will continue for another N records before
+ pausing again.
+ </para>
+
+ <para>
+    If you pause recovery while the server is waiting for a WAL file when
+    operating in standby mode, the pause will apparently have no effect until
+    the file arrives. Once the server begins processing WAL records again it
+    will notice the pause request and act upon it. This is not a bug.
+ </para>
+
+ <para>
+    Pausing recovery will also prevent restartpoints from starting, since they
+    are triggered by events in the WAL stream. In all other ways processing
+    will continue; for example, the background writer will continue to clean
+    shared_buffers while recovery is paused.
+ </para>
+
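
The pause behaviour described above follows from the pause flag being checked only between WAL records. The patch's actual replay loop in xlog.c is not shown in this excerpt; a rough sketch, with hypothetical helper names, of the control flow implied by the preceding paragraphs:

    /*
     * Hypothetical sketch: a pause requested by pg_recovery_pause() is
     * noticed only after the next record has been fetched, which is why a
     * pause issued while waiting for a WAL file appears to have no effect
     * until the file arrives. Restartpoints are also held off here, since
     * they are triggered from the WAL stream.
     */
    for (;;)
    {
        XLogRecord *record = FetchNextRecordOrWaitForFile(); /* may block */

        if (record == NULL)
            break;                      /* end of recovery */

        while (RecoveryIsPaused())      /* hypothetical flag accessor */
            pg_usleep(100000L);         /* 100ms */

        ApplyRecord(record);
    }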
<para>
The functions shown in <xref linkend="functions-admin-dbsize"> calculate
the actual disk space usage of database objects.
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 8382576d3c..7661c97fa3 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -14,6 +14,7 @@
#include "postgres.h"
#include "access/gin.h"
+#include "access/xact.h"
#include "access/xlogutils.h"
#include "storage/bufmgr.h"
#include "utils/memutils.h"
@@ -438,6 +439,9 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
RestoreBkpBlocks(lsn, record, false);
topCtx = MemoryContextSwitchTo(opCtx);
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 4a20d905d4..3888bca945 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -14,6 +14,7 @@
#include "postgres.h"
#include "access/gist_private.h"
+#include "access/xact.h"
#include "access/xlogutils.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
@@ -396,6 +397,9 @@ gist_redo(XLogRecPtr lsn, XLogRecord *record)
uint8 info = record->xl_info & ~XLR_INFO_MASK;
MemoryContext oldCxt;
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
RestoreBkpBlocks(lsn, record, false);
oldCxt = MemoryContextSwitchTo(opCtx);
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 52115cf64e..f2b45a2e63 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3814,19 +3814,78 @@ heap_restrpos(HeapScanDesc scan)
}
/*
+ * Update the latestRemovedXid for the current VACUUM. This gets called
+ * only rarely, since we probably already removed rows earlier.
+ * See comments for vacuum_log_cleanup_info().
+ */
+void
+HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+ TransactionId *latestRemovedXid)
+{
+ TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
+ TransactionId xmax = HeapTupleHeaderGetXmax(tuple);
+ TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
+
+ if (tuple->t_infomask & HEAP_MOVED_OFF ||
+ tuple->t_infomask & HEAP_MOVED_IN)
+ {
+ if (TransactionIdPrecedes(*latestRemovedXid, xvac))
+ *latestRemovedXid = xvac;
+ }
+
+ if (TransactionIdPrecedes(*latestRemovedXid, xmax))
+ *latestRemovedXid = xmax;
+
+ if (TransactionIdPrecedes(*latestRemovedXid, xmin))
+ *latestRemovedXid = xmin;
+
+ Assert(TransactionIdIsValid(*latestRemovedXid));
+}
+
+/*
+ * Perform XLogInsert to register a heap cleanup info message. These
+ * messages are sent once per VACUUM and are required because
+ * of the phasing of removal operations during a lazy VACUUM.
+ * See comments for vacuum_log_cleanup_info().
+ */
+XLogRecPtr
+log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid)
+{
+ xl_heap_cleanup_info xlrec;
+ XLogRecPtr recptr;
+ XLogRecData rdata;
+
+ xlrec.node = rnode;
+ xlrec.latestRemovedXid = latestRemovedXid;
+
+ rdata.data = (char *) &xlrec;
+ rdata.len = SizeOfHeapCleanupInfo;
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_CLEANUP_INFO, &rdata);
+
+ return recptr;
+}
+
+/*
* Perform XLogInsert for a heap-clean operation. Caller must already
* have modified the buffer and marked it dirty.
*
* Note: prior to Postgres 8.3, the entries in the nowunused[] array were
* zero-based tuple indexes. Now they are one-based like other uses
* of OffsetNumber.
+ *
+ * For 8.4 we also include the latestRemovedXid which allows recovery
+ * processing to cancel long-running standby queries that would otherwise
+ * have their results changed if we applied these changes.
*/
XLogRecPtr
log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
- bool redirect_move)
+ TransactionId latestRemovedXid, bool redirect_move)
{
xl_heap_clean xlrec;
uint8 info;
@@ -3838,6 +3897,7 @@ log_heap_clean(Relation reln, Buffer buffer,
xlrec.node = reln->rd_node;
xlrec.block = BufferGetBlockNumber(buffer);
+ xlrec.latestRemovedXid = latestRemovedXid;
xlrec.nredirected = nredirected;
xlrec.ndead = ndead;
@@ -4109,6 +4169,46 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
}
/*
+ * Handles CLEANUP_INFO
+ */
+static void
+heap_xlog_cleanup_info(XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) XLogRecGetData(record);
+
+ if (InArchiveRecovery &&
+ RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+ LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+ {
+ VirtualTransactionId *old_snapshots;
+
+ /*
+ * Make sure the incoming transaction is emulated as running
+ * prior to allowing any changes that could affect the correctness
+ * of MVCC for standby queries.
+ *
+ * Note that we specifically exclude sessions with no current
+ * snapshot, i.e. idle-in-transaction sessions that are neither
+ * serializable nor have active cursors.
+ */
+ old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+ xlrec->node.dbNode,
+ InvalidTransactionId);
+
+ ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+ "heap cleanup info",
+ ERROR,
+ lsn);
+ }
+
+ /*
+ * The actual operation is a no-op: this record type exists to provide a
+ * means for conflict processing to occur before we begin index vacuum
+ * actions. See vacuumlazy.c.
+ */
+}
+
+/*
* Handles CLEAN and CLEAN_MOVE record types
*/
static void
@@ -4126,12 +4226,34 @@ heap_xlog_clean(XLogRecPtr lsn, XLogRecord *record, bool clean_move)
int nunused;
Size freespace;
+ if (InArchiveRecovery &&
+ RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+ LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+ {
+ VirtualTransactionId *old_snapshots;
+
+ /*
+ * See comments in heap_xlog_cleanup_info().
+ */
+ old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+ xlrec->node.dbNode,
+ InvalidTransactionId);
+
+ ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+ "heap cleanup",
+ ERROR,
+ lsn);
+ }
+
+ RestoreBkpBlocks(lsn, record, true);
+
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
- buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
if (!BufferIsValid(buffer))
return;
+ LockBufferForCleanup(buffer);
page = (Page) BufferGetPage(buffer);
if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4186,12 +4308,18 @@ heap_xlog_freeze(XLogRecPtr lsn, XLogRecord *record)
Buffer buffer;
Page page;
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+ RestoreBkpBlocks(lsn, record, false);
+
if (record->xl_info & XLR_BKP_BLOCK_1)
return;
- buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
if (!BufferIsValid(buffer))
return;
+ LockBufferForCleanup(buffer);
page = (Page) BufferGetPage(buffer);
if (XLByteLE(lsn, PageGetLSN(page)))
@@ -4777,6 +4905,9 @@ heap_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
RestoreBkpBlocks(lsn, record, false);
switch (info & XLOG_HEAP_OPMASK)
@@ -4818,17 +4949,17 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
switch (info & XLOG_HEAP_OPMASK)
{
case XLOG_HEAP2_FREEZE:
- RestoreBkpBlocks(lsn, record, false);
heap_xlog_freeze(lsn, record);
break;
case XLOG_HEAP2_CLEAN:
- RestoreBkpBlocks(lsn, record, true);
heap_xlog_clean(lsn, record, false);
break;
case XLOG_HEAP2_CLEAN_MOVE:
- RestoreBkpBlocks(lsn, record, true);
heap_xlog_clean(lsn, record, true);
break;
+ case XLOG_HEAP2_CLEANUP_INFO:
+ heap_xlog_cleanup_info(lsn, record);
+ break;
default:
elog(PANIC, "heap2_redo: unknown op code %u", info);
}
@@ -4958,17 +5089,26 @@ heap2_desc(StringInfo buf, uint8 xl_info, char *rec)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
- appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u",
+ appendStringInfo(buf, "clean: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->block);
+ xlrec->node.relNode, xlrec->block,
+ xlrec->latestRemovedXid);
}
else if (info == XLOG_HEAP2_CLEAN_MOVE)
{
xl_heap_clean *xlrec = (xl_heap_clean *) rec;
- appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u",
+ appendStringInfo(buf, "clean_move: rel %u/%u/%u; blk %u remxid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->block);
+ xlrec->node.relNode, xlrec->block,
+ xlrec->latestRemovedXid);
+ }
+ else if (info == XLOG_HEAP2_CLEANUP_INFO)
+ {
+ xl_heap_cleanup_info *xlrec = (xl_heap_cleanup_info *) rec;
+
+ appendStringInfo(buf, "cleanup info: remxid %u",
+ xlrec->latestRemovedXid);
}
else
appendStringInfo(buf, "UNKNOWN");
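
log_heap_cleanup_info() and heap_xlog_cleanup_info() above are driven once per VACUUM from vacuumlazy.c, which this patch changes but which is not shown in this excerpt. A sketch of the caller both comments reference, assuming lazy VACUUM accumulates latestRemovedXid in its per-relation stats struct:

    /*
     * Sketch only: emit the cleanup-info record before any index entries
     * are removed, so the standby resolves conflicts for the whole relation
     * up front rather than block by block.
     */
    static void
    vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
    {
        /* No WAL for temp relations, and no conflicts without archiving */
        if (rel->rd_istemp || !XLogArchivingActive())
            return;

        /* If nothing was removed, no standby query can conflict */
        if (!TransactionIdIsValid(vacrelstats->latestRemovedXid))
            return;

        (void) log_heap_cleanup_info(rel->rd_node,
                                     vacrelstats->latestRemovedXid);
    }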
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 2691666e39..8c8bbd8355 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -30,6 +30,7 @@
typedef struct
{
TransactionId new_prune_xid; /* new prune hint value for page */
+ TransactionId latestRemovedXid; /* latest xid to be removed by this prune */
int nredirected; /* numbers of entries in arrays below */
int ndead;
int nunused;
@@ -85,6 +86,14 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
return;
/*
+ * We can't write WAL in recovery mode, so there's no point trying to
+ * clean the page. The master will likely issue a cleaning WAL record
+ * soon anyway, so this is no particular loss.
+ */
+ if (IsRecoveryProcessingMode())
+ return;
+
+ /*
* We prune when a previous UPDATE failed to find enough space on the page
* for a new tuple version, or when free space falls below the relation's
* fill-factor target (but not less than 10%).
@@ -176,6 +185,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
* Also initialize the rest of our working state.
*/
prstate.new_prune_xid = InvalidTransactionId;
+ prstate.latestRemovedXid = InvalidTransactionId;
prstate.nredirected = prstate.ndead = prstate.nunused = 0;
memset(prstate.marked, 0, sizeof(prstate.marked));
@@ -258,7 +268,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
prstate.redirected, prstate.nredirected,
prstate.nowdead, prstate.ndead,
prstate.nowunused, prstate.nunused,
- redirect_move);
+ prstate.latestRemovedXid, redirect_move);
PageSetLSN(BufferGetPage(buffer), recptr);
PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
@@ -396,6 +406,8 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
== HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup))
{
heap_prune_record_unused(prstate, rootoffnum);
+ HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+ &prstate->latestRemovedXid);
ndeleted++;
}
@@ -521,7 +533,11 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
* find another DEAD tuple is a fairly unusual corner case.)
*/
if (tupdead)
+ {
latestdead = offnum;
+ HeapTupleHeaderAdvanceLatestRemovedXid(htup,
+ &prstate->latestRemovedXid);
+ }
else if (!recent_dead)
break;
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 88baa7c904..fb2b06aa88 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -89,8 +89,19 @@ RelationGetIndexScan(Relation indexRelation,
else
scan->keyData = NULL;
+ /*
+ * During recovery we ignore killed tuples and don't bother to kill them
+ * either. We do this because the xmin on the primary node could easily
+ * be later than the xmin on the standby node, so that what the primary
+ * thinks is killed is supposed to be visible on standby. So for correct
+ * MVCC for queries during recovery we must ignore these hints and check
+ * all tuples. Do *not* set ignore_killed_tuples to true when running
+ * in a transaction that was started during recovery. AMs can set it to
+ * false at any time. xactStartedInRecovery should not be touched by AMs.
+ */
scan->kill_prior_tuple = false;
- scan->ignore_killed_tuples = true; /* default setting */
+ scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
+ scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
scan->opaque = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 92eec92bab..09da208329 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -455,9 +455,12 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
/*
* If we scanned a whole HOT chain and found only dead tuples,
- * tell index AM to kill its entry for that TID.
+ * tell index AM to kill its entry for that TID. We do not do
+	 * this during recovery because it may violate MVCC to do so;
+	 * see comments in RelationGetIndexScan().
*/
- scan->kill_prior_tuple = scan->xs_hot_dead;
+ if (!scan->xactStartedInRecovery)
+ scan->kill_prior_tuple = scan->xs_hot_dead;
/*
* The AM's gettuple proc finds the next index entry matching the
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 81d56b3a6b..aee8f8fe24 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -401,6 +401,27 @@ of the WAL entry.) If the parent page becomes half-dead but is not
immediately deleted due to a subsequent crash, there is no loss of
consistency, and the empty page will be picked up by the next VACUUM.
+Scans during Recovery
+---------------------
+
+The btree index type can be safely used during recovery. During recovery
+we have at most one writer and potentially many readers. In that
+situation the locking requirements can be relaxed and we do not need
+double locking during block splits. Each WAL record makes changes to a
+single level of the btree using the correct locking sequence and so
+is safe for concurrent readers. Some readers may observe a block split
+in progress as they descend the tree, but they will simply move right
+onto the correct page.
+
+During recovery all index scans start with ignore_killed_tuples = false
+and we never set kill_prior_tuple. We do this because the oldest xmin
+on the standby server can be older than the oldest xmin on the master
+server, which means tuples can be marked as killed even when they are
+still visible on the standby. We don't WAL log tuple killed bits, but
+they can still appear in the standby because of full page writes. So
+we must always ignore them and that means it's not worth setting them
+either.
+
Other Things That Are Handy to Know
-----------------------------------
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 69a2ed3ec2..7b4ce9efda 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -1924,7 +1924,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
}
if (ndeletable > 0)
- _bt_delitems(rel, buffer, deletable, ndeletable);
+ _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
/*
* Note: if we didn't find any LP_DEAD items, then the page's
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 23026c2905..4632524eb2 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -652,7 +652,8 @@ _bt_page_recyclable(Page page)
*/
void
_bt_delitems(Relation rel, Buffer buf,
- OffsetNumber *itemnos, int nitems)
+ OffsetNumber *itemnos, int nitems, bool isVacuum,
+ BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
@@ -684,15 +685,37 @@ _bt_delitems(Relation rel, Buffer buf,
/* XLOG stuff */
if (!rel->rd_istemp)
{
- xl_btree_delete xlrec;
XLogRecPtr recptr;
XLogRecData rdata[2];
- xlrec.node = rel->rd_node;
- xlrec.block = BufferGetBlockNumber(buf);
+		/* We don't need both, but it simplifies the code to have both here */
+ xl_btree_delete xlrec_delete;
+ xl_btree_vacuum xlrec_vacuum;
+
+ if (isVacuum)
+ {
+ xlrec_vacuum.node = rel->rd_node;
+ xlrec_vacuum.block = BufferGetBlockNumber(buf);
+
+ xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+ rdata[0].data = (char *) &xlrec_vacuum;
+ rdata[0].len = SizeOfBtreeVacuum;
+ }
+ else
+ {
+ xlrec_delete.node = rel->rd_node;
+ xlrec_delete.block = BufferGetBlockNumber(buf);
+
+ /*
+			 * We would like to set an accurate latestRemovedXid, but there
+			 * is no easy way of obtaining a useful value, so for now we supply
+			 * InvalidTransactionId, which makes conflict processing treat all
+			 * standby queries as conflicting (see btree_redo).
+ */
+ xlrec_delete.latestRemovedXid = InvalidTransactionId;
+ rdata[0].data = (char *) &xlrec_delete;
+ rdata[0].len = SizeOfBtreeDelete;
+ }
- rdata[0].data = (char *) &xlrec;
- rdata[0].len = SizeOfBtreeDelete;
rdata[0].buffer = InvalidBuffer;
rdata[0].next = &(rdata[1]);
@@ -715,7 +738,10 @@ _bt_delitems(Relation rel, Buffer buf,
rdata[1].buffer_std = true;
rdata[1].next = NULL;
- recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
+ if (isVacuum)
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
+ else
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 59680cd056..b1a8a575de 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -58,7 +58,8 @@ typedef struct
IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
- BlockNumber lastUsedPage;
+ BlockNumber lastBlockVacuumed; /* last blkno reached by Vacuum scan */
+ BlockNumber lastUsedPage; /* blkno of last page that is in use */
BlockNumber totFreePages; /* true total # of free pages */
MemoryContext pagedelcontext;
} BTVacState;
@@ -626,6 +627,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
vstate.callback = callback;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
+ vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */
vstate.lastUsedPage = BTREE_METAPAGE;
vstate.totFreePages = 0;
@@ -855,7 +857,19 @@ restart:
*/
if (ndeletable > 0)
{
- _bt_delitems(rel, buf, deletable, ndeletable);
+ BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
+
+ _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
+
+ /*
+ * Keep track of the block number of the lastBlockVacuumed, so
+ * we can scan those blocks as well during WAL replay. This then
+ * provides concurrency protection and allows btrees to be used
+ * while in recovery.
+ */
+ if (lastBlockVacuumed > vstate->lastBlockVacuumed)
+ vstate->lastBlockVacuumed = lastBlockVacuumed;
+
stats->tuples_removed += ndeletable;
/* must recompute maxoff */
maxoff = PageGetMaxOffsetNumber(page);
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 517c4b90ce..02ff07c2ab 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -16,7 +16,10 @@
#include "access/nbtree.h"
#include "access/transam.h"
+#include "access/xact.h"
#include "storage/bufmgr.h"
+#include "storage/procarray.h"
+#include "utils/inval.h"
/*
* We must keep track of expected insertions due to page splits, and apply
@@ -459,6 +462,86 @@ btree_xlog_split(bool onleft, bool isroot,
}
static void
+btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
+{
+ xl_btree_vacuum *xlrec;
+ Buffer buffer;
+ Page page;
+ BTPageOpaque opaque;
+
+ if (record->xl_info & XLR_BKP_BLOCK_1)
+ return;
+
+ xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+
+ /*
+ * We need to ensure every block is unpinned between the
+ * lastBlockVacuumed and the current block, if there are any.
+ * This ensures that every block in the index is touched during
+ * VACUUM as required to ensure scans work correctly.
+ */
+ if ((xlrec->lastBlockVacuumed + 1) != xlrec->block)
+ {
+ BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+
+ for (; blkno < xlrec->block; blkno++)
+ {
+ /*
+ * XXXHS we don't actually need to read the block, we
+ * just need to confirm it is unpinned. If we had a special call
+ * into the buffer manager we could optimise this so that
+ * if the block is not in shared_buffers we confirm it as unpinned.
+ */
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+ if (BufferIsValid(buffer))
+ {
+ LockBufferForCleanup(buffer);
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+ }
+
+ /*
+ * We need to take a cleanup lock to apply these changes.
+ * See nbtree/README for details.
+ */
+ buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, xlrec->block, RBM_NORMAL);
+ if (!BufferIsValid(buffer))
+ return;
+ LockBufferForCleanup(buffer);
+ page = (Page) BufferGetPage(buffer);
+
+ if (XLByteLE(lsn, PageGetLSN(page)))
+ {
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+
+ if (record->xl_len > SizeOfBtreeVacuum)
+ {
+ OffsetNumber *unused;
+ OffsetNumber *unend;
+
+ unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeVacuum);
+ unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
+
+ PageIndexMultiDelete(page, unused, unend - unused);
+ }
+
+ /*
+ * Mark the page as not containing any LP_DEAD items --- see comments in
+ * _bt_delitems().
+ */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+ PageSetLSN(page, lsn);
+ PageSetTLI(page, ThisTimeLineID);
+ MarkBufferDirty(buffer);
+ UnlockReleaseBuffer(buffer);
+}
+
+static void
btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
{
xl_btree_delete *xlrec;
@@ -470,6 +553,11 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
return;
xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ /*
+ * We don't need to take a cleanup lock to apply these changes.
+ * See nbtree/README for details.
+ */
buffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
if (!BufferIsValid(buffer))
return;
@@ -714,6 +802,46 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
{
uint8 info = record->xl_info & ~XLR_INFO_MASK;
+ /*
+ * Btree delete records can conflict with standby queries. You might
+ * think that Vacuum records would conflict as well, but they don't
+	 * because XLOG_HEAP2_CLEANUP_INFO exists specifically to ensure that
+	 * we perform all conflict processing for the whole index, rather than
+	 * block by block.
+ */
+ if (InArchiveRecovery)
+ {
+ if (info == XLOG_BTREE_DELETE)
+ {
+ xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+ if (RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid) &&
+ LatestRemovedXidAdvances(xlrec->latestRemovedXid))
+ {
+ VirtualTransactionId *old_snapshots;
+
+ /*
+ * Make sure the incoming transaction is emulated as running
+ * prior to allowing any changes made by it to touch data.
+ *
+ * XXXHS: Currently we put everybody on death row, because
+ * currently _bt_delitems() supplies InvalidTransactionId. We
+ * should be able to do better than that with some thought.
+ */
+ old_snapshots = GetConflictingVirtualXIDs(xlrec->latestRemovedXid,
+ xlrec->node.dbNode,
+ InvalidOid);
+
+ ResolveRecoveryConflictWithVirtualXIDs(old_snapshots,
+ "btree delete",
+ ERROR,
+ lsn);
+ }
+ }
+ else
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+ }
+
RestoreBkpBlocks(lsn, record, false);
switch (info)
@@ -739,6 +867,9 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
case XLOG_BTREE_SPLIT_R_ROOT:
btree_xlog_split(false, true, lsn, record);
break;
+ case XLOG_BTREE_VACUUM:
+ btree_xlog_vacuum(lsn, record);
+ break;
case XLOG_BTREE_DELETE:
btree_xlog_delete(lsn, record);
break;
@@ -843,13 +974,24 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
xlrec->level, xlrec->firstright);
break;
}
+ case XLOG_BTREE_VACUUM:
+ {
+ xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
+
+ appendStringInfo(buf, "vacuum: rel %u/%u/%u; blk %u, lastBlockVacuumed %u",
+ xlrec->node.spcNode, xlrec->node.dbNode,
+ xlrec->node.relNode, xlrec->block,
+ xlrec->lastBlockVacuumed);
+ break;
+ }
case XLOG_BTREE_DELETE:
{
xl_btree_delete *xlrec = (xl_btree_delete *) rec;
- appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u",
+ appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->block);
+ xlrec->node.relNode, xlrec->block,
+ xlrec->latestRemovedXid);
break;
}
case XLOG_BTREE_DELETE_PAGE:
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index a88563e335..9e32a72d97 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -195,10 +195,11 @@ they first do something that requires one --- typically, insert/update/delete
a tuple, though there are a few other places that need an XID assigned.
If a subtransaction requires an XID, we always first assign one to its
parent. This maintains the invariant that child transactions have XIDs later
-than their parents, which is assumed in a number of places.
+than their parents, which is assumed in a number of places. In 8.4 onwards,
+some corner cases exist that require XID assignment to be WAL logged.
The subsidiary actions of obtaining a lock on the XID and entering it into
-pg_subtrans and PG_PROC are done at the time it is assigned.
+PG_PROC and, in some cases, pg_subtrans are done at the time it is assigned.
A transaction that has no XID still needs to be identified for various
purposes, notably holding locks. For this purpose we assign a "virtual
@@ -376,7 +377,9 @@ but since we allow arbitrary nesting of subtransactions, we can't fit all Xids
in shared memory, so we have to store them on disk. Note, however, that for
each transaction we keep a "cache" of Xids that are known to be part of the
transaction tree, so we can skip looking at pg_subtrans unless we know the
-cache has been overflowed. See storage/ipc/procarray.c for the gory details.
+cache has been overflowed. In 8.4 we skip updating pg_subtrans unless the
+cache has overflowed for that transaction, considerably reducing pg_subtrans
+activity. See storage/ipc/procarray.c for the gory details.
slru.c is the supporting mechanism for both pg_clog and pg_subtrans. It
implements the LRU policy for in-memory buffer pages. The high-level routines
@@ -649,3 +652,33 @@ fsync it down to disk without any sort of interlock, as soon as it finishes
the bulk update. However, all these paths are designed to write data that
no other transaction can see until after T1 commits. The situation is thus
not different from ordinary WAL-logged updates.
+
+Transaction Emulation during Recovery
+-------------------------------------
+
+During Recovery we replay transaction changes in the order they occurred.
+As part of this replay we emulate some transactional behaviour, so that
+read only backends can take MVCC snapshots. We do this by maintaining
+Recovery Procs, so that each transaction that has recorded WAL records for
+database writes will exist in the procarray until it commits. Further
+details are given in comments in procarray.c.
+
+Many actions write no WAL records at all, for example read only transactions.
+These have no effect on MVCC in recovery and we can pretend they never
+occurred at all. Subtransaction commit does not write a WAL record either
+and has very little effect, since lock waiters need to wait for the
+parent transaction to complete.
+
+Not all transactional behaviour is emulated, for example we do not insert
+a transaction entry into the lock table, nor do we maintain the transaction
+stack in memory. Clog entries are made normally. Multixact is not maintained
+because its purpose is to record tuple-level locks that an application has
+requested in order to block write locks. Since write locks cannot be obtained at all,
+there is never any conflict and so there is no reason to update multixact.
+Subtrans is maintained during recovery but the details of the transaction
+tree are ignored and all subtransactions reference the top-level TransactionId
+directly. Since commit is atomic this provides correct lock wait behaviour
+yet simplifies emulation of subtransactions considerably.
+
+Further details on locking mechanics in recovery are given in comments
+with the Lock rmgr code.
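
Because every subtransaction is mapped straight to its top-level TransactionId during recovery, a single pg_subtrans probe resolves the whole tree. An illustrative helper (hypothetical name; SubTransGetParent() is the existing accessor):

    /*
     * Sketch only: in recovery, pg_subtrans records the top-level xid as
     * the parent of every subtransaction, so one lookup suffices where
     * normal running might have to walk a chain of parents.
     */
    static TransactionId
    GetTopmostXidDuringRecovery(TransactionId xid)
    {
        TransactionId parent = SubTransGetParent(xid);

        return TransactionIdIsValid(parent) ? parent : xid;
    }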
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 5bd72154c5..46e05596cd 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -35,6 +35,7 @@
#include "access/clog.h"
#include "access/slru.h"
#include "access/transam.h"
+#include "access/xact.h"
#include "pg_trace.h"
#include "postmaster/bgwriter.h"
@@ -690,6 +691,9 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
/* Backup blocks are not used in clog records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
if (info == CLOG_ZEROPAGE)
{
int pageno;
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 881a588d69..f33c7fa91d 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1544,6 +1544,7 @@ CheckPointMultiXact(void)
* isn't valid (because StartupMultiXact hasn't been called yet) and so
* SimpleLruTruncate would get confused. It seems best not to risk
* removing any data during recovery anyway, so don't truncate.
+ * We are executing in the bgwriter, so we must access shared status.
*/
if (!IsRecoveryProcessingMode())
TruncateMultiXact();
@@ -1875,6 +1876,9 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
/* Backup blocks are not used in multixact records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
if (info == XLOG_MULTIXACT_ZERO_OFF_PAGE)
{
int pageno;
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 0273b0e153..252f4ee3f8 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -20,6 +20,7 @@
#include "commands/dbcommands.h"
#include "commands/sequence.h"
#include "commands/tablespace.h"
+#include "storage/sinval.h"
#include "storage/freespace.h"
@@ -32,7 +33,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
{"Tablespace", tblspc_redo, tblspc_desc, NULL, NULL, NULL},
{"MultiXact", multixact_redo, multixact_desc, NULL, NULL, NULL},
{"Reserved 7", NULL, NULL, NULL, NULL, NULL},
- {"Reserved 8", NULL, NULL, NULL, NULL, NULL},
+ {"Relation", relation_redo, relation_desc, NULL, NULL, NULL},
{"Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL},
{"Heap", heap_redo, heap_desc, NULL, NULL, NULL},
{"Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint},
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 68e38696fb..f337e18b0e 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -598,7 +598,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
* commands to set the commit status of transactions whose bits are in
* already-truncated segments of the commit log (see notes in
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
- * where the file doesn't exist, and return zeroes instead.
+	 * where the file doesn't exist, and return zeroes instead. We also
+	 * return a zeroed page when a seek or read fails during recovery.
*/
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
@@ -619,6 +620,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
{
+ if (InRecovery)
+ {
+ ereport(LOG,
+ (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+ path)));
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ return true;
+ }
slru_errcause = SLRU_SEEK_FAILED;
slru_errno = errno;
close(fd);
@@ -628,6 +637,14 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
errno = 0;
if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
{
+ if (InRecovery)
+ {
+ ereport(LOG,
+ (errmsg("file \"%s\" doesn't exist, reading as zeroes",
+ path)));
+ MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ return true;
+ }
slru_errcause = SLRU_READ_FAILED;
slru_errno = errno;
close(fd);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index eaad23182a..fe57e61024 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -31,6 +31,7 @@
#include "access/slru.h"
#include "access/subtrans.h"
#include "access/transam.h"
+#include "miscadmin.h"
#include "pg_trace.h"
#include "utils/snapmgr.h"
@@ -223,36 +224,19 @@ ZeroSUBTRANSPage(int pageno)
/*
* This must be called ONCE during postmaster or standalone-backend startup,
* after StartupXLOG has initialized ShmemVariableCache->nextXid.
- *
- * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
- * if there are none.
- *
- * Note that this is not atomic and is not yet safe to perform while other
- * processes might access subtrans.
*/
void
StartupSUBTRANS(TransactionId oldestActiveXID)
{
- int startPage;
- int endPage;
+ TransactionId xid = ShmemVariableCache->nextXid;
+ int pageno = TransactionIdToPage(xid);
- /*
- * Since we don't expect pg_subtrans to be valid across crashes, we
- * initialize the currently-active page(s) to zeroes during startup.
- * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
- * the new page without regard to whatever was previously on disk.
- */
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
- startPage = TransactionIdToPage(oldestActiveXID);
- endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
-
- while (startPage != endPage)
- {
- (void) ZeroSUBTRANSPage(startPage);
- startPage++;
- }
- (void) ZeroSUBTRANSPage(startPage);
+ /*
+ * Initialize our idea of the latest page number.
+ */
+ SubTransCtl->shared->latest_page_number = pageno;
LWLockRelease(SubtransControlLock);
}
@@ -305,16 +289,55 @@ void
ExtendSUBTRANS(TransactionId newestXact)
{
int pageno;
+ static int last_pageno = 0;
- /*
- * No work except at first XID of a page. But beware: just after
- * wraparound, the first XID of page zero is FirstNormalTransactionId.
- */
- if (TransactionIdToEntry(newestXact) != 0 &&
- !TransactionIdEquals(newestXact, FirstNormalTransactionId))
- return;
+ Assert(TransactionIdIsNormal(newestXact));
- pageno = TransactionIdToPage(newestXact);
+ if (!InRecovery)
+ {
+ /*
+ * No work except at first XID of a page. But beware: just after
+ * wraparound, the first XID of page zero is FirstNormalTransactionId.
+ */
+ if (TransactionIdToEntry(newestXact) != 0 &&
+ !TransactionIdEquals(newestXact, FirstNormalTransactionId))
+ return;
+
+ pageno = TransactionIdToPage(newestXact);
+ }
+ else
+ {
+ int32 diff;
+
+ /*
+		 * In recovery we keep track of the last page we extended, so
+		 * we can compare that against incoming XIDs. This will only
+		 * ever be run by the startup process, so we keep it in a static
+		 * variable rather than protecting it with the SubtransControlLock.
+ */
+ pageno = TransactionIdToPage(newestXact);
+
+ /*
+ * Fast path return for common case
+ */
+ if (pageno == last_pageno)
+ return;
+
+ /*
+ * If pageno logically precedes last_pageno then we do nothing.
+ * We need to be careful at wraparound here too, so we do a
+ * modulo-2^31 comparison, exactly as we do in TransactionIdPrecedes()
+ */
+ diff = (int32) (pageno - last_pageno);
+ if (diff < 0)
+ return;
+
+ elog(trace_recovery(DEBUG1),
+ "extend subtrans xid %u page %d last_page %d",
+ newestXact, pageno, last_pageno);
+
+ last_pageno = pageno;
+ }
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
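
The modulo-2^31 comparison mentioned in the comment above relies on wrapped unsigned subtraction reinterpreted as signed. As a self-contained illustration:

    #include <stdint.h>
    #include <stdbool.h>

    /*
     * Illustration of the idiom used above (and in TransactionIdPrecedes()):
     * page1 logically precedes page2 iff the wrapped difference, viewed as
     * a signed 32-bit value, is negative.
     */
    static bool
    page_precedes(uint32_t page1, uint32_t page2)
    {
        int32_t diff = (int32_t) (page1 - page2);

        return diff < 0;
    }

    /* page_precedes(0xFFFFFFF0u, 5) is true: 0xFFFFFFF0 precedes 5 across
     * the wraparound boundary, even though it is numerically larger. */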
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index 2a1eab4d16..6fb2d3f729 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -35,9 +35,6 @@ static TransactionId cachedFetchXid = InvalidTransactionId;
static XidStatus cachedFetchXidStatus;
static XLogRecPtr cachedCommitLSN;
-/* Handy constant for an invalid xlog recptr */
-static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
-
/* Local functions */
static XidStatus TransactionLogFetch(TransactionId transactionId);
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index eb3f34183f..e5d6a4265a 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -359,7 +359,7 @@ MarkAsPrepared(GlobalTransaction gxact)
* Put it into the global ProcArray so TransactionIdIsInProgress considers
* the XID as still running.
*/
- ProcArrayAdd(&gxact->proc);
+ ProcArrayAdd(&gxact->proc, true);
}
/*
@@ -1198,7 +1198,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
hdr->nsubxacts, children,
hdr->nabortrels, abortrels);
- ProcArrayRemove(&gxact->proc, latestXid);
+ ProcArrayRemove(&gxact->proc, latestXid, 0, NULL);
/*
* In case we fail while running the callbacks, mark the gxact invalid so
@@ -1719,6 +1719,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
/* Emit the XLOG commit record */
xlrec.xid = xid;
xlrec.crec.xact_time = GetCurrentTimestamp();
+ xlrec.crec.xinfo = 0;
xlrec.crec.nrels = nrels;
xlrec.crec.nsubxacts = nchildren;
rdata[0].data = (char *) (&xlrec);
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 16a75346e8..4c1550508c 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -277,6 +277,16 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
curXid = ShmemVariableCache->nextXid;
LWLockRelease(XidGenLock);
+ /*
+ * If we are in recovery then we are just replaying what has happened on
+ * the master. If we do need to trigger an autovacuum then it will happen
+ * on the master and changes will be fed through to the standby.
+	 * So we have nothing to do here but be patient. We may be called during
+	 * recovery by the Startup process when updating db flat files.
+ */
+ if (InRecovery)
+ return;
+
/* Log the info */
ereport(DEBUG1,
(errmsg("transaction ID wrap limit is %u, limited by database \"%s\"",
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index d0ed3c0318..ceb150c85d 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -40,6 +40,7 @@
#include "storage/fd.h"
#include "storage/lmgr.h"
#include "storage/procarray.h"
+#include "storage/sinval.h"
#include "storage/sinvaladt.h"
#include "storage/smgr.h"
#include "utils/combocid.h"
@@ -141,6 +142,7 @@ typedef struct TransactionStateData
Oid prevUser; /* previous CurrentUserId setting */
bool prevSecDefCxt; /* previous SecurityDefinerContext setting */
bool prevXactReadOnly; /* entry-time xact r/o state */
+ bool startedInRecovery; /* did we start in recovery? */
struct TransactionStateData *parent; /* back link to parent */
} TransactionStateData;
@@ -169,6 +171,7 @@ static TransactionStateData TopTransactionStateData = {
InvalidOid, /* previous CurrentUserId setting */
false, /* previous SecurityDefinerContext setting */
false, /* entry-time xact r/o state */
+ false, /* startedInRecovery */
NULL /* link to parent state block */
};
@@ -212,6 +215,17 @@ static bool forceSyncCommit = false;
static MemoryContext TransactionAbortContext = NULL;
/*
+ * Bookkeeping for tracking emulated transactions in Recovery Procs.
+ */
+static TransactionId latestObservedXid = InvalidTransactionId;
+static bool RunningXactIsValid;
+
+/*
+ * Local state to optimise recovery conflict resolution
+ */
+static TransactionId latestRemovedXid = InvalidTransactionId;
+
+/*
* List of add-on start- and end-of-xact callbacks
*/
typedef struct XactCallbackItem
@@ -237,7 +251,7 @@ static SubXactCallbackItem *SubXact_callbacks = NULL;
/* local function prototypes */
-static void AssignTransactionId(TransactionState s);
+static void AssignTransactionId(TransactionState s, int recursion_level);
static void AbortTransaction(void);
static void AtAbort_Memory(void);
static void AtCleanup_Memory(void);
@@ -331,7 +345,7 @@ TransactionId
GetTopTransactionId(void)
{
if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
- AssignTransactionId(&TopTransactionStateData);
+ AssignTransactionId(&TopTransactionStateData, 0);
return TopTransactionStateData.transactionId;
}
@@ -361,7 +375,7 @@ GetCurrentTransactionId(void)
TransactionState s = CurrentTransactionState;
if (!TransactionIdIsValid(s->transactionId))
- AssignTransactionId(s);
+ AssignTransactionId(s, 0);
return s->transactionId;
}
@@ -389,7 +403,7 @@ GetCurrentTransactionIdIfAny(void)
* following its parent's.
*/
static void
-AssignTransactionId(TransactionState s)
+AssignTransactionId(TransactionState s, int recursion_level)
{
bool isSubXact = (s->parent != NULL);
ResourceOwner currentOwner;
@@ -406,7 +420,7 @@ AssignTransactionId(TransactionState s)
* than its parent.
*/
if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
- AssignTransactionId(s->parent);
+ AssignTransactionId(s->parent, recursion_level + 1);
/*
* Generate a new Xid and record it in PG_PROC and pg_subtrans.
@@ -418,7 +432,14 @@ AssignTransactionId(TransactionState s)
*/
s->transactionId = GetNewTransactionId(isSubXact);
- if (isSubXact)
+ /*
+ * If we have overflowed the subxid cache then we must mark subtrans
+ * with the parent xid. Prior to 8.4 we marked subtrans for each
+ * subtransaction, though that is no longer necessary because the
+ * way snapshots are searched in XidInMVCCSnapshot() has changed to
+ * allow searching of both subxid cache and subtrans, not either/or.
+ */
+ if (isSubXact && MyProc->subxids.overflowed)
SubTransSetParent(s->transactionId, s->parent->transactionId);
/*
@@ -440,8 +461,61 @@ AssignTransactionId(TransactionState s)
}
PG_END_TRY();
CurrentResourceOwner = currentOwner;
-}
+ /*
+	 * The recovery environment needs to know when a transaction first starts
+	 * making changes to the database. We could issue an assignment WAL
+	 * record for every transaction and subtransaction but that would be
+	 * a large performance hit. However, each WAL record is marked with
+	 * both its xid and its top-level xid. So we only need to issue an
+	 * assignment record when we are assigning multiple xids recursively,
+	 * except when we are on the very first subtransaction of any
+	 * transaction - since that already has xid and topxid on it.
+ */
+ if (recursion_level > 1 || (recursion_level == 1 && isSubXact))
+ {
+ XLogRecData rdata;
+ xl_xact_assignment xlrec;
+
+ xlrec.xassign = s->transactionId;
+
+ if (isSubXact)
+ xlrec.xtop = s->parent->transactionId;
+ else
+ xlrec.xtop = InvalidTransactionId;
+
+ elog(trace_recovery(DEBUG2),
+ "AssignTransactionId xid %u xtop %u nest %d recursion %d hasParent %s",
+ xlrec.xassign,
+ xlrec.xtop,
+ GetCurrentTransactionNestLevel(),
+ recursion_level,
+ isSubXact ? "t" : "f");
+
+ START_CRIT_SECTION();
+
+ rdata.data = (char *) (&xlrec);
+ rdata.len = sizeof(xl_xact_assignment);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ /*
+ * These WAL records look like no other. We are assigning a
+ * TransactionId to upper levels of the transaction stack. The
+		 * transaction level we are assigning for is *not* the *current*
+ * transaction - we haven't even assigned the xid for the current
+ * transaction yet, so the xl_xid of this WAL record will be
+ * InvalidTransactionId, even though we are in a transaction.
+ * Got that?
+ *
+ * So we stuff the newly assigned xid into the body of the WAL
+ * record and let RecordKnownAssignedTransactionIds() work it out.
+ */
+ (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT, &rdata);
+
+ END_CRIT_SECTION();
+ }
+}
/*
* GetCurrentSubTransactionId
@@ -600,6 +674,16 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
return false;
}
+/*
+ * TransactionStartedDuringRecovery, used during index scans
+ */
+bool
+TransactionStartedDuringRecovery(void)
+{
+ TransactionState s = CurrentTransactionState;
+
+ return s->startedInRecovery;
+}
/*
* CommandCounterIncrement
@@ -827,11 +911,15 @@ RecordTransactionCommit(void)
bool haveNonTemp;
int nchildren;
TransactionId *children;
+ int nmsgs;
+ SharedInvalidationMessage *invalidationMessages = NULL;
+ bool RelcacheInitFileInval;
/* Get data needed for commit record */
nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
nchildren = xactGetCommittedChildren(&children);
-
+ nmsgs = xactGetCommittedInvalidationMessages(&invalidationMessages,
+ &RelcacheInitFileInval);
/*
* If we haven't been assigned an XID yet, we neither can, nor do we want
* to write a COMMIT record.
@@ -865,7 +953,7 @@ RecordTransactionCommit(void)
/*
* Begin commit critical section and insert the commit XLOG record.
*/
- XLogRecData rdata[3];
+ XLogRecData rdata[4];
int lastrdata = 0;
xl_xact_commit xlrec;
@@ -873,6 +961,19 @@ RecordTransactionCommit(void)
BufmgrCommit();
/*
+ * Set flags required for recovery processing of commits.
+ * Nothing here is so critical that it needs to be included
+ * within the critical section that follows.
+ */
+ xlrec.xinfo = 0;
+ if (AtEOXact_Database_FlatFile_Update_Needed())
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;
+ if (AtEOXact_Auth_FlatFile_Update_Needed())
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;
+ if (RelcacheInitFileInval)
+ xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+
+ /*
* Mark ourselves as within our "commit critical section". This
* forces any concurrent checkpoint to wait until we've updated
* pg_clog. Without this, it is possible for the checkpoint to set
@@ -896,6 +997,8 @@ RecordTransactionCommit(void)
xlrec.xact_time = xactStopTimestamp;
xlrec.nrels = nrels;
xlrec.nsubxacts = nchildren;
+ xlrec.nmsgs = nmsgs;
+
rdata[0].data = (char *) (&xlrec);
rdata[0].len = MinSizeOfXactCommit;
rdata[0].buffer = InvalidBuffer;
@@ -917,6 +1020,15 @@ RecordTransactionCommit(void)
rdata[2].buffer = InvalidBuffer;
lastrdata = 2;
}
+ /* dump shared cache invalidation messages */
+ if (nmsgs > 0)
+ {
+ rdata[lastrdata].next = &(rdata[3]);
+ rdata[3].data = (char *) invalidationMessages;
+ rdata[3].len = nmsgs * sizeof(SharedInvalidationMessage);
+ rdata[3].buffer = InvalidBuffer;
+ lastrdata = 3;
+ }
rdata[lastrdata].next = NULL;
(void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
@@ -1528,6 +1640,7 @@ StartTransaction(void)
s->childXids = NULL;
s->nChildXids = 0;
s->maxChildXids = 0;
+ s->startedInRecovery = IsRecoveryProcessingMode();
GetUserIdAndContext(&s->prevUser, &s->prevSecDefCxt);
/* SecurityDefinerContext should never be set outside a transaction */
Assert(!s->prevSecDefCxt);
@@ -4217,31 +4330,438 @@ xactGetCommittedChildren(TransactionId **ptr)
}
/*
+ * Record an enhanced snapshot of running transactions into WAL.
+ */
+void
+LogCurrentRunningXacts(void)
+{
+ RunningTransactions CurrRunningXacts = GetRunningTransactionData();
+ xl_xact_running_xacts xlrec;
+ XLogRecData rdata[3];
+ int lastrdata = 0;
+ XLogRecPtr recptr;
+
+ xlrec.xcnt = CurrRunningXacts->xcnt;
+ xlrec.subxcnt = CurrRunningXacts->subxcnt;
+ xlrec.latestRunningXid = CurrRunningXacts->latestRunningXid;
+ xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
+ xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+ /* Header */
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = MinSizeOfXactRunningXacts;
+ rdata[0].buffer = InvalidBuffer;
+
+ /* array of RunningXact */
+ if (xlrec.xcnt > 0)
+ {
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) CurrRunningXacts->xrun;
+ rdata[1].len = xlrec.xcnt * sizeof(RunningXact);
+ rdata[1].buffer = InvalidBuffer;
+ lastrdata = 1;
+ }
+
+ /* array of subtransaction TransactionIds */
+ if (xlrec.subxcnt > 0)
+ {
+ rdata[lastrdata].next = &(rdata[2]);
+ rdata[2].data = (char *) CurrRunningXacts->subxip;
+ rdata[2].len = xlrec.subxcnt * sizeof(TransactionId);
+ rdata[2].buffer = InvalidBuffer;
+ lastrdata = 2;
+ }
+
+ rdata[lastrdata].next = NULL;
+
+ START_CRIT_SECTION();
+
+ recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_RUNNING_XACTS, rdata);
+
+ END_CRIT_SECTION();
+
+ elog(trace_recovery(DEBUG2), "captured snapshot of running xacts %X/%X", recptr.xlogid, recptr.xrecoff);
+}
+
+/*
+ * Is the data available to allow valid snapshots?
+ */
+bool
+IsRunningXactDataValid(void)
+{
+ return RunningXactIsValid;
+}
+
+void
+SetRunningXactData(bool mode)
+{
+ RunningXactIsValid = mode;
+}
+
+/*
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need to make lock table
+ * inserts appear to come from a transaction. We could create and delete
+ * lock table entries for each transaction but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+ VirtualTransactionId vxid;
+
+ /*
+ * Initialise shared invalidation management for Startup process,
+ * being careful to register ourselves as a sendOnly process so
+ * we don't need to read messages, nor will we get signalled
+ * when the queue starts filling up.
+ */
+ SharedInvalBackendInit(true);
+
+ /*
+ * Additional initialisation tasks. Most of this was performed
+ * during initial stages of startup.
+ */
+ ProcArrayInitRecoveryEnvironment();
+
+ /*
+ * Lock a virtual transaction id for Startup process.
+ *
+ * We need to do GetNextLocalTransactionId() because
+ * SharedInvalBackendInit() leaves localTransactionid invalid and
+ * SharedInvalBackendInit() leaves localTransactionId invalid and
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ /*
+ * Now that the database is consistent we can create a valid copy of
+ * the flat files required for connection and authentication. This
+ * may already have been executed at appropriate commit points, but
+ * we cannot trust that those executions were correct, so force it
+ * again now just to be safe.
+ */
+ BuildFlatFiles(false);
+}
+
+/*
+ * During recovery we maintain ProcArray with incoming xids when we first
+ * observe them in use. Uses local variables, so should only be called
+ * by Startup process.
+ *
+ * We record all xids that we know have been assigned. That includes
+ * all the xids on the WAL record, plus all unobserved xids that
+ * we can deduce have been assigned. We can deduce the existence of
+ * unobserved xids because we know xids are in sequence, with no gaps.
+ */
+bool
+RecordKnownAssignedTransactionIds(XLogRecPtr lsn, TransactionId top_xid, TransactionId child_xid)
+{
+ TransactionId xid;
+ PGPROC *proc;
+ bool unobserved = false;
+ bool mark_subtrans = false;
+
+ /*
+ * Skip processing if the current snapshot is invalid. If you're
+ * thinking of removing this, think again. We must have a valid
+ * initial state before we try to modify it.
+ */
+ if (!IsRunningXactDataValid())
+ return false;
+
+ if (child_xid == top_xid)
+ child_xid = InvalidTransactionId;
+
+ /*
+ * VACUUM records are always sent with InvalidTransactionId, so
+ * invoke conflict processing if we see a record like this.
+ */
+ if (!TransactionIdIsValid(top_xid))
+ return true;
+
+ /*
+ * Identify the recovery proc that holds replay info for this xid.
+ *
+ * XXXHS This gets called for every WAL record (with XID). I think we'll
+ * need a faster version of BackendXidGetProc, using a hash table or
+ * something. FWIW, the hash table wouldn't need to be in shared memory,
+ * because the startup process is the only one doing this.
+ */
+ proc = BackendXidGetProc(top_xid);
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ if (proc == NULL)
+ {
+ proc = CreateRecoveryProcessForTransactionId(top_xid);
+
+ if (proc == NULL)
+ {
+ LWLockRelease(ProcArrayLock);
+ SetRunningXactData(false);
+ return false;
+ }
+
+ unobserved = true;
+ }
+
+ /*
+ * Use volatile pointer to prevent code rearrangement; other backends
+ * could be examining the subxid info concurrently, and we don't want
+ * them to see an invalid intermediate state, such as incrementing
+ * nxids before filling the array entry. Note we are assuming that
+ * TransactionId and int fetch/store are atomic, but that's OK since
+ * we're holding ProcArrayLock exclusively.
+ */
+ {
+ volatile PGPROC *myproc = proc;
+
+ myproc->lsn = lsn;
+
+ if (TransactionIdIsValid(child_xid))
+ {
+ int nxids = myproc->subxids.nxids;
+
+ if (nxids < PGPROC_MAX_CACHED_SUBXIDS)
+ {
+ /*
+ * Just remember when reading this logic that by definition
+ * we have Assert(TransactionIdPrecedes(top_xid, child_xid))
+ */
+ if (nxids == 0 || TransactionIdPrecedes(myproc->subxids.xids[nxids - 1], child_xid))
+ {
+ myproc->subxids.xids[nxids] = child_xid;
+ myproc->subxids.nxids = nxids + 1;
+ }
+ }
+ else
+ {
+ myproc->subxids.overflowed = true;
+ mark_subtrans = true;
+ }
+ }
+ }
+
+ /*
+ * When a newly observed xid arrives, it is frequently the case
+ * that it is *not* the next xid in sequence. When this occurs, we
+ * must treat the intervening xids as running also. So we maintain
+ * a special list of these UnobservedXids, so that snapshots can
+ * see the missing xids as in-progress.
+ *
+ * We maintain both recovery Procs *and* UnobservedXids because we
+ * need them both. Recovery procs allow us to store top-level xids
+ * and subtransactions separately, otherwise we wouldn't know
+ * when to overflow the subxid cache. UnobservedXids allow us to
+ * make sense of the out-of-order arrival of xids.
+ *
+ * Some examples:
+ * 1) latestObservedXid = 647
+ * next xid observed in WAL = 651 (a top-level transaction)
+ * so we add 648, 649, 650 to UnobservedXids
+ * and add 651 as a recovery proc
+ *
+ * 2) latestObservedXid = 769
+ * next xid observed in WAL = 771 (a subtransaction)
+ * so we add 770 to UnobservedXids
+ * and add 771 into the subxid cache of its top-level xid
+ *
+ * 3) latestObservedXid = 769
+ * next xid observed in WAL = 810 (a subtransaction)
+ * 810's parent had not yet recorded WAL = 807
+ * so we add 770 thru 809 inclusive to UnobservedXids
+ * then remove 807
+ *
+ * 4) latestObservedXid = 769
+ * next xid observed in WAL = 771 (a subtransaction)
+ * 771's parent had not yet recorded WAL = 770
+ * so do nothing
+ *
+ * 5) latestObservedXid = 7747
+ * next xid observed in WAL = 7748 (a subtransaction)
+ * 7748's parent had not yet recorded WAL = 7742
+ * so we add 7748 and remove 7742
+ */
+ for (xid = top_xid; TransactionIdIsValid(xid); xid = child_xid)
+ {
+ TransactionId next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+
+ if (next_expected_xid == xid)
+ {
+ Assert(!XidInUnobservedTransactions(xid));
+ latestObservedXid = xid;
+ }
+ else if (TransactionIdPrecedes(next_expected_xid, xid))
+ {
+ UnobservedTransactionsAddXids(next_expected_xid, xid);
+ latestObservedXid = xid;
+ }
+ else if (unobserved)
+ UnobservedTransactionsRemoveXid(xid, true);
+
+ if (xid == child_xid)
+ break;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ elog(trace_recovery(DEBUG4),
+ "record known xact top_xid %u child_xid %u %slatestObservedXid %u",
+ top_xid, child_xid,
+ (unobserved ? "unobserved " : " "),
+ latestObservedXid);
+
+ /*
+ * Now we've updated the proc, we can update subtrans, if appropriate.
+ * We must do this step last to avoid race conditions. See comments
+ * and code for AssignTransactionId().
+ *
+ * Notice that we update pg_subtrans with the top-level xid, rather
+ * than the parent xid. This is a difference between normal
+ * processing and recovery, yet is still correct in all cases. The
+ * reason is that subtransaction commit is not marked in clog until
+ * commit processing, so all aborted subtransactions have already been
+ * clearly marked in clog. As a result we are able to refer directly
+ * to the top-level transaction's state rather than skipping through
+ * all the intermediate states in the subtransaction tree.
+ */
+ if (mark_subtrans)
+ {
+ elog(trace_recovery(DEBUG2),
+ "subtrans setting topxid %u for xid %u", top_xid, child_xid);
+ ExtendSUBTRANS(child_xid);
+ SubTransSetParent(child_xid, top_xid);
+ }
+
+ return true;
+}
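Editorial sketch of the gap rule illustrated by the examples above, using the usual transam.h primitives: every xid from the next expected value up to, but not including, the newly observed one is recorded as unobserved. The storage call is a hypothetical placeholder for the patch's UnobservedTransactionsAddXids machinery.

    static void
    note_unobserved_gap(TransactionId next_expected, TransactionId seen)
    {
        TransactionId x;

        /* add every xid in [next_expected, seen) to the unobserved set */
        for (x = next_expected; TransactionIdPrecedes(x, seen); TransactionIdAdvance(x))
            remember_unobserved(x);   /* hypothetical placeholder */
    }

Feeding example 1 through this rule, next_expected = 648 and seen = 651 yields exactly 648, 649 and 650.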
+
+/*
+ * LatestRemovedXidAdvances - returns true if latestRemovedXid is moved
+ * forwards by the latest provided value
+ */
+bool
+LatestRemovedXidAdvances(TransactionId latestXid)
+{
+ /*
+ * Don't bother checking for conflicts for cleanup records earlier than
+ * we have already tested for.
+ */
+ if (TransactionIdIsValid(latestRemovedXid) &&
+ TransactionIdPrecedesOrEquals(latestXid, latestRemovedXid))
+ return false;
+
+ /*
+ * Remember how far we've cleaned to avoid checks in the future.
+ */
+ latestRemovedXid = latestXid;
+
+ return true;
+}
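Hedged usage sketch, not taken from the patch: a redo routine for a cleanup record could use the return value to skip conflict processing it has already performed. resolve_recovery_conflicts() is a placeholder name, not an API defined by this patch.

    if (LatestRemovedXidAdvances(latest_removed_xid_from_record))
        resolve_recovery_conflicts(latest_removed_xid_from_record);  /* placeholder */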
+
+/*
* XLOG support routines
*/
+/*
+ * Before 8.4 this was a fairly short function, but now it performs many
+ * actions for which the order of execution is critical.
+ */
static void
-xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
+xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid, bool preparedXact)
{
TransactionId *sub_xids;
TransactionId max_xid;
+ PGPROC *proc;
int i;
- /* Mark the transaction committed in pg_clog */
- sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
- TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
-
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /*
+ * Find the highest xid mentioned in the record.
+ */
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
+
+ /* Mark the transaction committed in pg_clog */
+ TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
+
+ if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+ {
+ /*
+ * We must mark clog before we update the ProcArray. Only update
+ * if we have already initialised the state and we have previously
+ * added an xid to the proc. We need no lock to check xid since it
+ * is controlled by Startup process. It's possible for xids to
+ * appear that haven't been seen before. We don't need to check
+ * UnobservedXids because in the normal case this will already have
+ * happened, but there are cases where they might sneak through.
+ * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+ */
+ if (IsRunningXactDataValid() && !preparedXact)
+ {
+ ProcArrayRemove(proc, InvalidTransactionId, xlrec->nsubxacts, sub_xids);
+ FreeRecoveryProcess(proc);
+ }
+
+ /*
+ * If requested, update the flat files for DB and Auth Files by
+ * reading the catalog tables. Needs to be the first action taken
+ * after marking transaction complete to minimise race conditions.
+ * This is the opposite way round to the original actions, which
+ * update the files and then mark committed, so there is a race
+ * condition in both places.
+ */
+ if (XactCompletionUpdateDBFile(xlrec) || XactCompletionUpdateAuthFile(xlrec))
+ {
+ if (XactCompletionUpdateAuthFile(xlrec))
+ BuildFlatFiles(false);
+ else
+ BuildFlatFiles(true);
+ }
+
+ /*
+ * Send any cache invalidations attached to the commit. We must
+ * send invalidations first and release locks afterwards, the same
+ * order as occurs in RecordTransactionCommit.
+ */
+ if (xlrec->nmsgs > 0)
+ {
+ int offset = OffsetSharedInvalInXactCommit();
+ SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+ (((char *) xlrec) + offset);
+
+ SendSharedInvalidMessages(msgs, xlrec->nmsgs);
+ }
+
+ /*
+ * Release locks, if any.
+ */
+ RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
+ }
+
+ /* Make sure nextXid is beyond any XID mentioned in the record */
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
+ ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
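To summarise the critical ordering in xact_redo_commit() (an editorial recap of the code above, not new patch behaviour):

    1) TransactionIdCommitTree() marks the whole xid tree committed in pg_clog;
    2) ProcArrayRemove()/FreeRecoveryProcess() retire the recovery proc;
    3) BuildFlatFiles() rebuilds the db/auth flat files when flagged;
    4) SendSharedInvalidMessages() delivers the attached invalidations;
    5) RelationReleaseRecoveryLockTree() releases recovery locks last;
    6) nextXid and latestCompletedXid are advanced.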
@@ -4263,28 +4783,65 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
}
}
+/*
+ * Be careful with the order of execution, as with xact_redo_commit().
+ * The two functions are similar but differ in key places.
+ */
static void
-xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
+xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid, bool preparedXact)
{
+ PGPROC *proc = NULL;
TransactionId *sub_xids;
TransactionId max_xid;
int i;
- /* Mark the transaction aborted in pg_clog */
- sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
- TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
-
/* Make sure nextXid is beyond any XID mentioned in the record */
max_xid = xid;
+ sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+
+ /*
+ * Find the highest xid mentioned in the record.
+ */
for (i = 0; i < xlrec->nsubxacts; i++)
{
if (TransactionIdPrecedes(max_xid, sub_xids[i]))
max_xid = sub_xids[i];
}
+
+ /* Mark the transaction aborted in pg_clog */
+ TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
+
+ if (InArchiveRecovery && (proc = BackendXidGetProc(xid)) != NULL)
+ {
+ /*
+ * We must mark clog before we update the ProcArray. Only update
+ * if we have already initialised the state and we have previously
+ * added an xid to the proc. We need no lock to check xid since it
+ * is controlled by Startup process. It's possible for xids to
+ * appear that haven't been seen before. We don't need to check
+ * UnobservedXids because in the normal case this will already have
+ * happened, but there are cases where they might sneak through.
+ * Leave these for the periodic cleanup by XACT_RUNNING_XACT records.
+ */
+ if (IsRunningXactDataValid() &&
+ TransactionIdIsValid(proc->xid) && !preparedXact)
+ {
+ ProcArrayRemove(proc, InvalidTransactionId, xlrec->nsubxacts, sub_xids);
+ FreeRecoveryProcess(proc);
+ }
+
+ /*
+ * Release locks, if any. There are no invalidations to send.
+ */
+ RelationReleaseRecoveryLockTree(xid, xlrec->nsubxacts, sub_xids);
+ }
+
+ /* Make sure nextXid is beyond any XID mentioned in the record */
if (TransactionIdFollowsOrEquals(max_xid,
ShmemVariableCache->nextXid))
{
ShmemVariableCache->nextXid = max_xid;
+ ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdAdvance(ShmemVariableCache->nextXid);
}
@@ -4314,17 +4871,63 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
/* Backup blocks are not used in xact records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ if (info == XLOG_XACT_ASSIGNMENT)
+ {
+ xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record);
+
+ if (InArchiveRecovery)
+ {
+ /*
+ * It's an assignment record, so we need to extract data from
+ * the body of the record, rather than take header values. This
+ * is because an assignment record can be issued when
+ * GetCurrentTransactionIdIfAny() returns InvalidTransactionId.
+ */
+ (void) RecordKnownAssignedTransactionIds(lsn, xlrec->xtop,
+ xlrec->xassign);
+ }
+
+ return;
+ }
+ else if (info == XLOG_XACT_RUNNING_XACTS)
+ {
+ xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) XLogRecGetData(record);
+
+ /*
+ * If RunningXact data is complete then apply it
+ */
+ if (InArchiveRecovery && TransactionIdIsValid(xlrec->latestRunningXid))
+ {
+ if (TransactionIdPrecedes(latestObservedXid, xlrec->latestRunningXid))
+ {
+ latestObservedXid = xlrec->latestRunningXid;
+ ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+ }
+ ProcArrayUpdateRecoveryTransactions(lsn, xlrec);
+ }
+
+ return;
+ }
+
+ if (InArchiveRecovery)
+ {
+ /*
+ * No conflict resolution is required for transaction completion records
+ */
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+ }
+
if (info == XLOG_XACT_COMMIT)
{
xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
- xact_redo_commit(xlrec, record->xl_xid);
+ xact_redo_commit(xlrec, record->xl_xid, false);
}
else if (info == XLOG_XACT_ABORT)
{
xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
- xact_redo_abort(xlrec, record->xl_xid);
+ xact_redo_abort(xlrec, record->xl_xid, false);
}
else if (info == XLOG_XACT_PREPARE)
{
@@ -4336,14 +4939,14 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
{
xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
- xact_redo_commit(&xlrec->crec, xlrec->xid);
+ xact_redo_commit(&xlrec->crec, xlrec->xid, true);
RemoveTwoPhaseFile(xlrec->xid, false);
}
else if (info == XLOG_XACT_ABORT_PREPARED)
{
xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record);
- xact_redo_abort(&xlrec->arec, xlrec->xid);
+ xact_redo_abort(&xlrec->arec, xlrec->xid, true);
RemoveTwoPhaseFile(xlrec->xid, false);
}
else
@@ -4355,10 +4958,19 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
{
int i;
+ if (XactCompletionUpdateDBFile(xlrec))
+ appendStringInfo(buf, "; update db file");
+
+ if (XactCompletionUpdateAuthFile(xlrec))
+ appendStringInfo(buf, "; update auth file");
+
+ if (XactCompletionRelcacheInitFileInval(xlrec))
+ appendStringInfo(buf, "; relcache init file inval");
+
appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
if (xlrec->nrels > 0)
{
- appendStringInfo(buf, "; rels:");
+ appendStringInfo(buf, "; %d rels:", xlrec->nrels);
for (i = 0; i < xlrec->nrels; i++)
{
char *path = relpath(xlrec->xnodes[i], MAIN_FORKNUM);
@@ -4369,12 +4981,34 @@ xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
if (xlrec->nsubxacts > 0)
{
TransactionId *xacts = (TransactionId *)
- &xlrec->xnodes[xlrec->nrels];
-
- appendStringInfo(buf, "; subxacts:");
+ &xlrec->xnodes[xlrec->nrels];
+ appendStringInfo(buf, "; %d subxacts:", xlrec->nsubxacts);
for (i = 0; i < xlrec->nsubxacts; i++)
appendStringInfo(buf, " %u", xacts[i]);
}
+ if (xlrec->nmsgs > 0)
+ {
+ /*
+ * The invalidation messages are the third variable length array
+ * from the start of the record. The record header has everything
+ * we need to calculate where that starts.
+ */
+ int offset = OffsetSharedInvalInXactCommit();
+ SharedInvalidationMessage *msgs = (SharedInvalidationMessage *)
+ (((char *) xlrec) + offset);
+ appendStringInfo(buf, "; %d inval msgs:", xlrec->nmsgs);
+ for (i = 0; i < xlrec->nmsgs; i++)
+ {
+ SharedInvalidationMessage *msg = msgs + i;
+
+ if (msg->id >= 0)
+ appendStringInfo(buf, "catcache id%d ", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ appendStringInfo(buf, "relcache ");
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ appendStringInfo(buf, "smgr ");
+ }
+ }
}
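The "third variable length array" layout noted above can be made explicit. This is an editorial sketch of the assumed record layout; the patch's actual OffsetSharedInvalInXactCommit() is defined elsewhere and may differ.

    /*
     * Assumed layout: fixed header | xnodes[nrels] | subxacts[nsubxacts] |
     * invalidation msgs[nmsgs]
     */
    static int
    offset_of_inval_msgs(xl_xact_commit *xlrec)
    {
        return MinSizeOfXactCommit
            + xlrec->nrels * sizeof(RelFileNode)
            + xlrec->nsubxacts * sizeof(TransactionId);
    }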
static void
@@ -4404,6 +5038,43 @@ xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
}
}
+static void
+xact_desc_running_xacts(StringInfo buf, xl_xact_running_xacts *xlrec)
+{
+ int xid_index,
+ subxid_index;
+ TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+ appendStringInfo(buf, "nxids %u nsubxids %u latestRunningXid %u",
+ xlrec->xcnt,
+ xlrec->subxcnt,
+ xlrec->latestRunningXid);
+
+ appendStringInfo(buf, " oldestRunningXid %u latestCompletedXid %u",
+ xlrec->oldestRunningXid,
+ xlrec->latestCompletedXid);
+
+ for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+ {
+ RunningXact *rxact = (RunningXact *) xlrec->xrun;
+
+ appendStringInfo(buf, "; xid %u", rxact[xid_index].xid);
+
+ if (rxact[xid_index].nsubxids > 0)
+ {
+ appendStringInfo(buf, " nsubxids %u offset %d ovflow? %s",
+ rxact[xid_index].nsubxids,
+ rxact[xid_index].subx_offset,
+ (rxact[xid_index].overflowed ? "t" : "f"));
+
+ appendStringInfo(buf, "; subxacts: ");
+ for (subxid_index = 0; subxid_index < rxact[xid_index].nsubxids; subxid_index++)
+ appendStringInfo(buf, " %u",
+ subxip[subxid_index + rxact[xid_index].subx_offset]);
+ }
+ }
+}
+
void
xact_desc(StringInfo buf, uint8 xl_info, char *rec)
{
@@ -4441,6 +5112,21 @@ xact_desc(StringInfo buf, uint8 xl_info, char *rec)
appendStringInfo(buf, "abort %u: ", xlrec->xid);
xact_desc_abort(buf, &xlrec->arec);
}
+ else if (info == XLOG_XACT_ASSIGNMENT)
+ {
+ xl_xact_assignment *xlrec = (xl_xact_assignment *) rec;
+
+ /* ignore the main xid, it may be Invalid and misleading */
+ appendStringInfo(buf, "assignment: xassign %u xtop %u",
+ xlrec->xassign, xlrec->xtop);
+ }
+ else if (info == XLOG_XACT_RUNNING_XACTS)
+ {
+ xl_xact_running_xacts *xlrec = (xl_xact_running_xacts *) rec;
+
+ appendStringInfo(buf, "running xacts: ");
+ xact_desc_running_xacts(buf, xlrec);
+ }
else
appendStringInfo(buf, "UNKNOWN");
}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7e480e2fb2..fcf5657a23 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -25,6 +25,7 @@
#include "access/clog.h"
#include "access/multixact.h"
+#include "access/nbtree.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
@@ -44,6 +45,7 @@
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "storage/procarray.h"
+#include "storage/sinval.h"
#include "storage/smgr.h"
#include "storage/spin.h"
#include "utils/builtins.h"
@@ -51,6 +53,7 @@
#include "utils/ps_status.h"
#include "pg_trace.h"
+#define WAL_DEBUG
/* File path names (all relative to $PGDATA) */
#define BACKUP_LABEL_FILE "backup_label"
@@ -58,6 +61,8 @@
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
+/* copied from tcopprot.h rather than including the whole file */
+extern int PostAuthDelay;
/* User-settable parameters */
int CheckPointSegments = 3;
@@ -70,7 +75,9 @@ bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD;
#ifdef WAL_DEBUG
-bool XLOG_DEBUG = false;
+bool XLOG_DEBUG_FLUSH = false;
+bool XLOG_DEBUG_BGFLUSH = false;
+bool XLOG_DEBUG_REDO = true;
#endif
/*
@@ -124,33 +131,51 @@ TimeLineID ThisTimeLineID = 0;
bool InRecovery = false;
/* Are we recovering using offline XLOG archives? */
-static bool InArchiveRecovery = false;
+bool InArchiveRecovery = false;
+
+static XLogRecPtr LastRec;
/* Local copy of shared RecoveryProcessingMode state */
static bool LocalRecoveryProcessingMode = true;
static bool knownProcessingMode = false;
+/* is the database proven consistent yet? */
+bool reachedSafeStartPoint = false;
+
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;
/* options taken from recovery.conf */
static char *recoveryRestoreCommand = NULL;
-static bool recoveryTarget = false;
static bool recoveryTargetExact = false;
static bool recoveryTargetInclusive = true;
static bool recoveryLogRestartpoints = false;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
+static int recoveryTargetAdvance = 0;
+
+/* recovery target modes */
+#define RECOVERY_TARGET_NONE 0
+#define RECOVERY_TARGET_PAUSE_ALL 1
+#define RECOVERY_TARGET_PAUSE_XID 2
+#define RECOVERY_TARGET_PAUSE_TIME 3
+#define RECOVERY_TARGET_ADVANCE 4
+#define RECOVERY_TARGET_STOP_IMMEDIATE 5
+#define RECOVERY_TARGET_STOP_XID 6
+#define RECOVERY_TARGET_STOP_TIME 7
+static int recoveryTargetMode = RECOVERY_TARGET_NONE;
+
+#define DEFAULT_MAX_STANDBY_DELAY 0
+int maxStandbyDelay = DEFAULT_MAX_STANDBY_DELAY;
+
static TimestampTz recoveryLastXTime = 0;
+static TransactionId recoveryLastXid = InvalidTransactionId;
/* if recoveryStopsHere returns true, it saves actual stop xid/time here */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static bool recoveryStopAfter;
-/* is the database proven consistent yet? */
-bool reachedSafeStartPoint = false;
-
/*
* During normal operation, the only timeline we care about is ThisTimeLineID.
* During recovery, however, things are more complicated. To simplify life
@@ -272,7 +297,7 @@ static XLogRecPtr RedoRecPtr;
* the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
* prove the databases are in a consistent state. Changing from PM_RECOVERY
* to PM_RUN happens whenever recovery ends, which could be forced upon us
- * externally or it can occur becasue of damage or termination of the WAL
+ * externally or it can occur because of damage or termination of the WAL
* sequence.
*----------
*/
@@ -371,6 +396,20 @@ typedef struct XLogCtlData
bool SharedRecoveryProcessingMode;
slock_t mode_lck;
+ /*
+ * recovery target control information
+ *
+ * Protected by info_lck
+ */
+ int recoveryTargetMode;
+ TransactionId recoveryTargetXid;
+ TimestampTz recoveryTargetTime;
+ int recoveryTargetAdvance;
+
+ TimestampTz recoveryLastXTime;
+ TransactionId recoveryLastXid;
+ XLogRecPtr recoveryLastRecPtr;
+
char InfoLockPadding[XLOGCTL_BUFFER_SPACING];
slock_t info_lck; /* locks shared variables shown above */
@@ -545,11 +584,14 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
bool updrqst;
bool doPageWrites;
bool isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
- bool isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+ bool isRecoveryEnd = (rmid == RM_XLOG_ID &&
+ (info == XLOG_RECOVERY_END ||
+ info == XLOG_CHECKPOINT_ONLINE));
/* cross-check on whether we should be here or not */
if (IsRecoveryProcessingMode() && !isRecoveryEnd)
- elog(FATAL, "cannot make new WAL entries during recovery");
+ elog(FATAL, "cannot make new WAL entries during recovery "
+ "(RMgrId = %d info = %d)", rmid, info);
/* info's high bits are reserved for use by me */
if (info & XLR_INFO_MASK)
@@ -888,6 +930,7 @@ begin:;
record->xl_len = len; /* doesn't include backup blocks */
record->xl_info = info;
record->xl_rmid = rmid;
+ record->xl_topxid = GetTopTransactionIdIfAny();
/* Now we can finish computing the record's CRC */
COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
@@ -895,25 +938,6 @@ begin:;
FIN_CRC32(rdata_crc);
record->xl_crc = rdata_crc;
-#ifdef WAL_DEBUG
- if (XLOG_DEBUG)
- {
- StringInfoData buf;
-
- initStringInfo(&buf);
- appendStringInfo(&buf, "INSERT @ %X/%X: ",
- RecPtr.xlogid, RecPtr.xrecoff);
- xlog_outrec(&buf, record);
- if (rdata->data != NULL)
- {
- appendStringInfo(&buf, " - ");
- RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
- }
- elog(LOG, "%s", buf.data);
- pfree(buf.data);
- }
-#endif
-
/* Record begin of record in appropriate places */
ProcLastRecPtr = RecPtr;
Insert->PrevRecord = RecPtr;
@@ -1804,7 +1828,7 @@ XLogFlush(XLogRecPtr record)
return;
#ifdef WAL_DEBUG
- if (XLOG_DEBUG)
+ if (XLOG_DEBUG_FLUSH)
elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
record.xlogid, record.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
@@ -1954,7 +1978,7 @@ XLogBackgroundFlush(void)
return;
#ifdef WAL_DEBUG
- if (XLOG_DEBUG)
+ if (XLOG_DEBUG_BGFLUSH)
elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
@@ -3027,6 +3051,9 @@ RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
char *blk;
int i;
+ if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
+ return;
+
blk = (char *) XLogRecGetData(record) + record->xl_len;
for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
{
@@ -4456,6 +4483,7 @@ BootStrapXLOG(void)
record->xl_prev.xlogid = 0;
record->xl_prev.xrecoff = 0;
record->xl_xid = InvalidTransactionId;
+ record->xl_topxid = InvalidTransactionId;
record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
record->xl_len = sizeof(checkPoint);
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
@@ -4639,7 +4667,7 @@ readRecoveryCommandFile(void)
ereport(LOG,
(errmsg("recovery_target_xid = %u",
recoveryTargetXid)));
- recoveryTarget = true;
+ recoveryTargetMode = RECOVERY_TARGET_STOP_XID;
recoveryTargetExact = true;
}
else if (strcmp(tok1, "recovery_target_time") == 0)
@@ -4650,7 +4678,7 @@ readRecoveryCommandFile(void)
*/
if (recoveryTargetExact)
continue;
- recoveryTarget = true;
+ recoveryTargetMode = RECOVERY_TARGET_STOP_TIME;
recoveryTargetExact = false;
/*
@@ -4683,12 +4711,32 @@ readRecoveryCommandFile(void)
* does nothing if a recovery_target is not also set
*/
if (!parse_bool(tok2, &recoveryLogRestartpoints))
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+ ereport(LOG,
+ (errmsg("log_restartpoints = %s", tok2)));
+ }
+ else if (strcmp(tok1, "max_standby_delay") == 0)
+ {
+ long delay;
+
+ errno = 0;
+ delay = strtol(tok2, NULL, 0);
+ if (errno == EINVAL || errno == ERANGE)
+ ereport(FATAL,
+ (errmsg("max_standby_delay is not a valid number: \"%s\"",
+ tok2)));
+ /*
+ * 2E6 seconds is about 23 days. Allows us to measure delay in
+ * milliseconds without overflowing an int.
+ */
+ if (delay > (long) INT_MAX || delay < -1)
+ ereport(FATAL,
+ (errmsg("max_standby_delay must be between -1 (wait forever) and 2 000 000 secs")));
+ maxStandbyDelay = (int) delay;
+
ereport(LOG,
- (errmsg("log_restartpoints = %s", tok2)));
- }
+ (errmsg("max_standby_delay = %d",
+ maxStandbyDelay)));
+ }
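Editorial note on units: GetLatestReplicationDelay() below reports milliseconds, so a recovery.conf line such as max_standby_delay = 30000 would read as roughly 30 seconds under that interpretation, while -1 waits forever. The error message above speaks in seconds, so treat the exact unit as an assumption rather than a settled interface.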
else
ereport(FATAL,
(errmsg("unrecognized recovery parameter \"%s\"",
@@ -4836,8 +4884,8 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
}
/*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG at or after the current record.
+ * For archive recovery, this function decides whether we want to
+ * pause or stop applying the XLOG at or after the current record.
*
* Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
* *includeThis is set TRUE if we should apply this record before stopping.
@@ -4850,72 +4898,275 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
static bool
recoveryStopsHere(XLogRecord *record, bool *includeThis)
{
- bool stopsHere;
- uint8 record_info;
- TimestampTz recordXtime;
-
+ bool stopsHere = false;
+ bool pauseHere = false;
+ static bool paused = false;
+ uint8 record_info = 0; /* valid iff (is_xact_completion_record) */
+ TimestampTz recordXtime = 0;
+ bool is_xact_completion_record = false;
+
/* We only consider stopping at COMMIT or ABORT records */
- if (record->xl_rmid != RM_XACT_ID)
- return false;
- record_info = record->xl_info & ~XLR_INFO_MASK;
- if (record_info == XLOG_XACT_COMMIT)
+ if (record->xl_rmid == RM_XACT_ID)
{
- xl_xact_commit *recordXactCommitData;
+ record_info = record->xl_info & ~XLR_INFO_MASK;
+ if (record_info == XLOG_XACT_COMMIT)
+ {
+ xl_xact_commit *recordXactCommitData;
- recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
- recordXtime = recordXactCommitData->xact_time;
- }
- else if (record_info == XLOG_XACT_ABORT)
- {
- xl_xact_abort *recordXactAbortData;
+ recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
+ recordXtime = recordXactCommitData->xact_time;
+ is_xact_completion_record = true;
+ }
+ else if (record_info == XLOG_XACT_ABORT)
+ {
+ xl_xact_abort *recordXactAbortData;
- recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
- recordXtime = recordXactAbortData->xact_time;
- }
- else
- return false;
+ recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
+ recordXtime = recordXactAbortData->xact_time;
+ is_xact_completion_record = true;
+ }
- /* Do we have a PITR target at all? */
- if (!recoveryTarget)
- {
- recoveryLastXTime = recordXtime;
- return false;
+ /* Remember the most recent COMMIT/ABORT time for logging purposes */
+ if (is_xact_completion_record)
+ {
+ recoveryLastXTime = recordXtime;
+ recoveryLastXid = record->xl_xid;
+ }
}
- if (recoveryTargetExact)
+ do
{
+ int prevRecoveryTargetMode = recoveryTargetMode;
+
+ CHECK_FOR_INTERRUPTS();
+
/*
- * there can be only one transaction end record with this exact
- * transactionid
- *
- * when testing for an xid, we MUST test for equality only, since
- * transactions are numbered in the order they start, not the order
- * they complete. A higher numbered xid will complete before you about
- * 50% of the time...
+ * Let's see if user has updated our recoveryTargetMode.
*/
- stopsHere = (record->xl_xid == recoveryTargetXid);
- if (stopsHere)
- *includeThis = recoveryTargetInclusive;
- }
- else
- {
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ recoveryTargetMode = xlogctl->recoveryTargetMode;
+ if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+ {
+ recoveryTargetXid = xlogctl->recoveryTargetXid;
+ recoveryTargetTime = xlogctl->recoveryTargetTime;
+
+ /* Don't reset counter while we're advancing */
+ if (recoveryTargetAdvance <= 0)
+ {
+ recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+ xlogctl->recoveryTargetAdvance = 0;
+ }
+ }
+ if (is_xact_completion_record)
+ {
+ xlogctl->recoveryLastXTime = recordXtime;
+ xlogctl->recoveryLastXid = record->xl_xid;
+ }
+ xlogctl->recoveryLastRecPtr = LastRec;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ /*
+ * If we're paused and the mode has just changed, reset so the new
+ * settings can apply and perhaps allow us to continue.
+ */
+ if (paused && prevRecoveryTargetMode != recoveryTargetMode)
+ paused = false;
+
+ /* Decide how to act on any pause target */
+ switch (recoveryTargetMode)
+ {
+ case RECOVERY_TARGET_NONE:
+ /*
+ * If we aren't paused and we're not looking to stop,
+ * just exit out quickly and get on with recovery.
+ */
+ if (paused)
+ {
+ ereport(LOG,
+ (errmsg("recovery restarting after pause")));
+ set_ps_display("recovery continues", false);
+ paused = false;
+ }
+ return false;
+
+ case RECOVERY_TARGET_PAUSE_ALL:
+ pauseHere = true;
+ break;
+
+ case RECOVERY_TARGET_ADVANCE:
+ if (paused)
+ {
+ if (recoveryTargetAdvance-- > 0)
+ {
+ elog(LOG, "recovery advancing 1 record");
+ return false;
+ }
+ else
+ break;
+ }
+
+ if (recoveryTargetAdvance-- <= 0)
+ pauseHere = true;
+ break;
+
+ case RECOVERY_TARGET_STOP_IMMEDIATE:
+ case RECOVERY_TARGET_STOP_XID:
+ case RECOVERY_TARGET_STOP_TIME:
+ paused = false;
+ break;
+
+ case RECOVERY_TARGET_PAUSE_XID:
+ /*
+ * there can be only one transaction end record with this exact
+ * transactionid
+ *
+ * when testing for an xid, we MUST test for equality only, since
+ * transactions are numbered in the order they start, not the order
+ * they complete. A higher numbered xid will complete before you about
+ * 50% of the time...
+ */
+ if (is_xact_completion_record)
+ pauseHere = (record->xl_xid == recoveryTargetXid);
+ break;
+
+ case RECOVERY_TARGET_PAUSE_TIME:
+ /*
+ * there can be many transactions that share the same commit time, so
+ * we pause after the last one, if we are inclusive, or pause at the
+ * first one if we are exclusive
+ */
+ if (is_xact_completion_record)
+ {
+ if (recoveryTargetInclusive)
+ pauseHere = (recoveryLastXTime > recoveryTargetTime);
+ else
+ pauseHere = (recoveryLastXTime >= recoveryTargetTime);
+ }
+ break;
+
+ default:
+ ereport(WARNING,
+ (errmsg("unknown recovery mode %d, continuing recovery",
+ recoveryTargetMode)));
+ return false;
+ }
+
/*
- * there can be many transactions that share the same commit time, so
- * we stop after the last one, if we are inclusive, or stop at the
- * first one if we are exclusive
+ * If we just entered pause, issue log messages
*/
- if (recoveryTargetInclusive)
- stopsHere = (recordXtime > recoveryTargetTime);
- else
- stopsHere = (recordXtime >= recoveryTargetTime);
- if (stopsHere)
- *includeThis = false;
+ if (pauseHere && !paused)
+ {
+ if (is_xact_completion_record)
+ {
+ if (record_info == XLOG_XACT_COMMIT)
+ ereport(LOG,
+ (errmsg("recovery pausing before commit of transaction %u, log time %s",
+ record->xl_xid,
+ timestamptz_to_str(recoveryLastXTime))));
+ else
+ ereport(LOG,
+ (errmsg("recovery pausing before abort of transaction %u, log time %s",
+ record->xl_xid,
+ timestamptz_to_str(recoveryLastXTime))));
+ }
+ else
+ ereport(LOG,
+ (errmsg("recovery pausing; last recovered transaction %u, "
+ "last recovered xact timestamp %s",
+ recoveryLastXid,
+ timestamptz_to_str(recoveryLastXTime))));
+
+ set_ps_display("recovery paused", false);
+
+ paused = true;
+ }
+
+ /*
+ * Pause for a while before rechecking mode at top of loop.
+ */
+ if (paused)
+ {
+ recoveryTargetAdvance = 0;
+
+ /*
+ * While we wait, record in shared memory that we are simply paused
+ */
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->recoveryTargetMode = RECOVERY_TARGET_PAUSE_ALL;
+ xlogctl->recoveryTargetAdvance = 0;
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ pg_usleep(200000L);
+ }
+
+ /*
+ * We leave the loop at the bottom only if our recovery mode is
+ * set (or has been recently reset) to one of the stop options.
+ */
+ } while (paused);
+
+ /*
+ * Decide how to act if stop target mode set. We run this separately from
+ * pause to allow user to reset their stop target while paused.
+ */
+ switch (recoveryTargetMode)
+ {
+ case RECOVERY_TARGET_STOP_IMMEDIATE:
+ ereport(LOG,
+ (errmsg("recovery stopping immediately due to user request")));
+ return true;
+
+ case RECOVERY_TARGET_STOP_XID:
+ /*
+ * there can be only one transaction end record with this exact
+ * transactionid
+ *
+ * when testing for an xid, we MUST test for equality only, since
+ * transactions are numbered in the order they start, not the order
+ * they complete. A higher numbered xid will complete before you about
+ * 50% of the time...
+ */
+ if (is_xact_completion_record)
+ {
+ stopsHere = (record->xl_xid == recoveryTargetXid);
+ if (stopsHere)
+ *includeThis = recoveryTargetInclusive;
+ }
+ break;
+
+ case RECOVERY_TARGET_STOP_TIME:
+ /*
+ * there can be many transactions that share the same commit time, so
+ * we stop after the last one, if we are inclusive, or stop at the
+ * first one if we are exclusive
+ */
+ if (is_xact_completion_record)
+ {
+ if (recoveryTargetInclusive)
+ stopsHere = (recoveryLastXTime > recoveryTargetTime);
+ else
+ stopsHere = (recoveryLastXTime >= recoveryTargetTime);
+ if (stopsHere)
+ *includeThis = false;
+ }
+ break;
}
if (stopsHere)
{
+ Assert(is_xact_completion_record);
recoveryStopXid = record->xl_xid;
- recoveryStopTime = recordXtime;
+ recoveryStopTime = recoveryLastXTime;
recoveryStopAfter = *includeThis;
if (record_info == XLOG_XACT_COMMIT)
@@ -4944,14 +5195,289 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
recoveryStopXid,
timestamptz_to_str(recoveryStopTime))));
}
+ }
- if (recoveryStopAfter)
- recoveryLastXTime = recordXtime;
+ return stopsHere;
+}
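Worked trace of the pause loop above (editorial): suppose recovery is paused and the user sets RECOVERY_TARGET_ADVANCE with a count of 2.

    record N   : paused, counter 2 -> "recovery advancing 1 record", replayed
    record N+1 : paused, counter 1 -> "recovery advancing 1 record", replayed
    record N+2 : counter exhausted -> falls through and re-enters pause;
                 the shared mode is reset to RECOVERY_TARGET_PAUSE_ALL and
                 the loop sleeps in 200 ms intervals until the mode changes.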
+
+/*
+ * Utility function used by various user functions to set the recovery
+ * target mode. This allows user control over the progress of recovery.
+ */
+static void
+SetRecoveryTargetMode(int mode, TransactionId xid, TimestampTz ts, int advance)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to control recovery")));
+
+ if (!IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is not in progress"),
+ errhint("WAL control functions can only be executed during recovery.")));
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+ xlogctl->recoveryTargetMode = mode;
+
+ if (mode == RECOVERY_TARGET_STOP_XID ||
+ mode == RECOVERY_TARGET_PAUSE_XID)
+ xlogctl->recoveryTargetXid = xid;
+ else if (mode == RECOVERY_TARGET_STOP_TIME ||
+ mode == RECOVERY_TARGET_PAUSE_TIME)
+ xlogctl->recoveryTargetTime = ts;
+ else if (mode == RECOVERY_TARGET_ADVANCE)
+ xlogctl->recoveryTargetAdvance = advance;
+
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ return;
+}
+
+/*
+ * Forces recovery to continue by clearing any pause or stop target.
+ * Returns void.
+ */
+Datum
+pg_recovery_continue(PG_FUNCTION_ARGS)
+{
+ SetRecoveryTargetMode(RECOVERY_TARGET_NONE, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Pause recovery immediately. Stays paused until asked to play again.
+ * Returns void.
+ */
+Datum
+pg_recovery_pause(PG_FUNCTION_ARGS)
+{
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_ALL, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Pause recovery at stated xid, if ever seen. Once paused, stays paused
+ * until asked to play again.
+ */
+Datum
+pg_recovery_pause_xid(PG_FUNCTION_ARGS)
+{
+ int xidi = PG_GETARG_INT32(0);
+ TransactionId xid = (TransactionId) xidi;
+
+ if (xid < FirstNormalTransactionId)
+ elog(ERROR, "cannot specify special values for transaction id");
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_XID, xid, 0, 0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Pause recovery at stated timestamp, if ever reached. Once paused, stays paused
+ * until asked to play again.
+ */
+Datum
+pg_recovery_pause_time(PG_FUNCTION_ARGS)
+{
+ TimestampTz ts = PG_GETARG_TIMESTAMPTZ(0);
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_PAUSE_TIME, InvalidTransactionId, ts, 0);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * If paused, advance N records.
+ */
+Datum
+pg_recovery_advance(PG_FUNCTION_ARGS)
+{
+ int adv = PG_GETARG_INT32(0);
+
+ if (adv < 1)
+ elog(ERROR, "recovery advance must be greater than or equal to 1");
+
+ SetRecoveryTargetMode(RECOVERY_TARGET_ADVANCE, InvalidTransactionId, 0, adv);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Forces recovery to stop now if paused, or at end of next record if playing.
+ */
+Datum
+pg_recovery_stop(PG_FUNCTION_ARGS)
+{
+ SetRecoveryTargetMode(RECOVERY_TARGET_STOP_IMMEDIATE, InvalidTransactionId, 0, 0);
+
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_current_recovery_target(PG_FUNCTION_ARGS)
+{
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+
+ recoveryTargetMode = xlogctl->recoveryTargetMode;
+ if (recoveryTargetMode != RECOVERY_TARGET_NONE)
+ {
+ recoveryTargetXid = xlogctl->recoveryTargetXid;
+ recoveryTargetTime = xlogctl->recoveryTargetTime;
+ recoveryTargetAdvance = xlogctl->recoveryTargetAdvance;
+ }
+
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ switch (recoveryTargetMode)
+ {
+ case RECOVERY_TARGET_NONE:
+ appendStringInfo(&buf, "No recovery target has been set");
+ break;
+ case RECOVERY_TARGET_PAUSE_ALL:
+ appendStringInfo(&buf, "Recovery paused");
+ break;
+ case RECOVERY_TARGET_PAUSE_XID:
+ appendStringInfo(&buf, "Recovery will pause before completion of transaction %u", recoveryTargetXid);
+ break;
+ case RECOVERY_TARGET_PAUSE_TIME:
+ appendStringInfo(&buf, "Recovery will pause after transaction completion timestamp %s",
+ timestamptz_to_str(recoveryTargetTime));
+ break;
+ case RECOVERY_TARGET_ADVANCE:
+ appendStringInfo(&buf, "Recovery will advance");
+ break;
+ case RECOVERY_TARGET_STOP_IMMEDIATE:
+ appendStringInfo(&buf, "Recovery will stop immediately");
+ break;
+ case RECOVERY_TARGET_STOP_XID:
+ appendStringInfo(&buf, "Recovery will stop after commit of transaction %u", recoveryTargetXid);
+ break;
+ case RECOVERY_TARGET_STOP_TIME:
+ appendStringInfo(&buf, "Recovery will stop after transaction completion timestamp %s",
+ timestamptz_to_str(recoveryTargetTime));
+ break;
+ }
+
+ PG_RETURN_TEXT_P(cstring_to_text(buf.data));
+}
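These controls are meant to compose: for example, a superuser might run SELECT pg_recovery_pause(), inspect the standby, step forward with SELECT pg_recovery_advance(10), confirm the state with SELECT pg_current_recovery_target(), and finally SELECT pg_recovery_continue() to resume normal replay. (Editorial usage illustration based on the definitions above.)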
+
+/*
+ * Returns true if recovery is still in progress, a global state.
+ */
+Datum
+pg_is_in_recovery(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BOOL(IsRecoveryProcessingMode());
+}
+
+/*
+ * Returns timestamp of last completed transaction
+ */
+Datum
+pg_last_recovered_xact_timestamp(PG_FUNCTION_ARGS)
+{
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+
+ recoveryLastXTime = xlogctl->recoveryLastXTime;
+
+ SpinLockRelease(&xlogctl->info_lck);
}
+
+ PG_RETURN_TIMESTAMPTZ(recoveryLastXTime);
+}
+
+/*
+ * Returns xid of last completed transaction
+ */
+Datum
+pg_last_recovered_xid(PG_FUNCTION_ARGS)
+{
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+
+ recoveryLastXid = xlogctl->recoveryLastXid;
+
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ PG_RETURN_INT32(recoveryLastXid);
+}
+
+/*
+ * Returns xlog location of last recovered WAL record.
+ */
+Datum
+pg_last_recovered_xlog_location(PG_FUNCTION_ARGS)
+{
+ char location[MAXFNAMELEN];
+
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
+
+ SpinLockAcquire(&xlogctl->info_lck);
+
+ LastRec = xlogctl->recoveryLastRecPtr;
+
+ SpinLockRelease(&xlogctl->info_lck);
+ }
+
+ snprintf(location, sizeof(location), "%X/%X",
+ LastRec.xlogid, LastRec.xrecoff);
+ PG_RETURN_TEXT_P(cstring_to_text(location));
+}
+
+/*
+ * Returns delay in milliseconds, or -1 if delay too large
+ */
+int
+GetLatestReplicationDelay(void)
+{
+ long delay_secs;
+ int delay_usecs;
+ int delay;
+ TimestampTz currTz = GetCurrentTimestamp();
+
+ TimestampDifference(recoveryLastXTime, currTz,
+ &delay_secs, &delay_usecs);
+
+ /*
+ * If delay is very large we probably aren't looking at
+ * a replication situation at all, just a recovery from backup.
+ * So return a special value instead.
+ */
+ if (delay_secs > (long)(INT_MAX / 1000))
+ delay = -1;
else
- recoveryLastXTime = recordXtime;
+ delay = (int)(delay_secs * 1000) + (delay_usecs / 1000);
- return stopsHere;
+ return delay;
}
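Worked example (editorial): with delay_secs = 75 and delay_usecs = 250000 the function returns 75 * 1000 + 250 = 75250 ms. The guard trips once delay_secs exceeds INT_MAX / 1000, roughly 2.1 million seconds or about 24 days, at which point -1 signals that this looks like a restore from backup rather than replication.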
/*
@@ -4967,7 +5493,6 @@ StartupXLOG(void)
bool performedRecovery = false;
bool haveBackupLabel = false;
XLogRecPtr RecPtr,
- LastRec,
checkPointLoc,
minRecoveryLoc,
EndOfLog;
@@ -5043,6 +5568,16 @@ StartupXLOG(void)
*/
readRecoveryCommandFile();
+ /*
+ * PostAuthDelay is a debugging aid for investigating problems in startup
+ * and/or recovery: it can be set in postgresql.conf to allow time to
+ * attach to the newly-forked backend with a debugger. It can also be set
+ * using the postmaster -W switch, which can be specified using the -o
+ * option of pg_ctl, e.g. pg_ctl -D data -o "-W 30"
+ */
+ if (PostAuthDelay > 0)
+ pg_usleep(PostAuthDelay * 1000000L);
+
/* Now we can determine the list of expected TLIs */
expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
@@ -5264,21 +5799,29 @@ StartupXLOG(void)
do
{
#ifdef WAL_DEBUG
- if (XLOG_DEBUG)
+ if (XLOG_DEBUG_REDO)
{
- StringInfoData buf;
-
- initStringInfo(&buf);
- appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
- ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
- EndRecPtr.xlogid, EndRecPtr.xrecoff);
- xlog_outrec(&buf, record);
- appendStringInfo(&buf, " - ");
- RmgrTable[record->xl_rmid].rm_desc(&buf,
- record->xl_info,
- XLogRecGetData(record));
- elog(LOG, "%s", buf.data);
- pfree(buf.data);
+ int loglevel = DEBUG3;
+
+ if (rmid == RM_XACT_ID)
+ loglevel = DEBUG2;
+
+ if (loglevel >= trace_recovery_messages)
+ {
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+ ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+ EndRecPtr.xlogid, EndRecPtr.xrecoff);
+ xlog_outrec(&buf, record);
+ appendStringInfo(&buf, " - ");
+ RmgrTable[record->xl_rmid].rm_desc(&buf,
+ record->xl_info,
+ XLogRecGetData(record));
+ elog(LOG, "%s", buf.data);
+ pfree(buf.data);
+ }
}
#endif
@@ -5309,32 +5852,41 @@ StartupXLOG(void)
RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
+ Assert(NumLWLocksHeldByMe() == 0);
+
/* Pop the error context stack */
error_context_stack = errcontext.previous;
LastRec = ReadRecPtr;
/*
- * Have we reached our safe starting point? If so, we can
- * signal Postmaster to enter consistent recovery mode.
- *
- * There are two point in the log we must pass. The first is
- * the minRecoveryPoint, which is the LSN at the time the
- * base backup was taken that we are about to rollfoward from.
- * If recovery has ever crashed or was stopped there is
- * another point also: minSafeStartPoint, which we know the
- * latest LSN that recovery could have reached prior to crash.
- */
- if (!reachedSafeStartPoint &&
- XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) &&
- XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ * Can we signal Postmaster to enter consistent recovery mode?
+ *
+ * There are two points in the log that we must pass. The first
+ * is minRecoveryPoint, which is the LSN at the time the
+ * base backup was taken that we are about to rollforward from.
+ * If recovery has ever crashed or been stopped there is
+ * another point as well: minSafeStartPoint, which is the
+ * latest LSN that recovery could have reached prior to the crash.
+ *
+ * We must also have assembled sufficient information about
+ * transaction state to allow valid snapshots to be taken.
+ * In some circumstances that may change, but we only call
+ * this once, not each time we re-enable snapshots.
+ */
+ if (!reachedSafeStartPoint &&
+ IsRunningXactDataValid() &&
+ XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) &&
+ XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
{
- reachedSafeStartPoint = true;
+ reachedSafeStartPoint = true; /* so we only do this once */
if (InArchiveRecovery)
{
ereport(LOG,
- (errmsg("consistent recovery state reached at %X/%X",
- EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ (errmsg("database has now reached consistent state at %X/%X",
+ EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ InitRecoveryTransactionEnvironment();
+ StartCleanupDelayStats();
if (IsUnderPostmaster)
SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
}
@@ -5377,14 +5929,14 @@ StartupXLOG(void)
* Complain if we did not roll forward far enough to render the backup
* dump consistent and start safely.
*/
- if (InRecovery && !reachedSafeStartPoint)
+ if (InArchiveRecovery && !reachedSafeStartPoint)
{
if (reachedStopPoint) /* stopped because of stop request */
ereport(FATAL,
(errmsg("requested recovery stop point is before end time of backup dump")));
else /* ran off end of WAL */
ereport(FATAL,
- (errmsg("WAL ends before end time of backup dump")));
+ (errmsg("end of WAL reached before end time of backup dump")));
}
/*
@@ -5515,6 +6067,10 @@ StartupXLOG(void)
ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
+ /* Shut down the recovery environment. These calls must be made in this order */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
/* Start up the commit log and related stuff, too */
StartupCLOG();
StartupSUBTRANS(oldestActiveXID);
@@ -5561,19 +6117,29 @@ StartupXLOG(void)
redo = GetRedoLocationForCheckpoint();
/*
- * Tell the bgwriter
- */
- SetRedoLocationForArchiveCheckpoint(redo);
-
- /*
- * Okay, we can come up now. Allow others to write WAL.
+ * Set up information for the bgwriter, but if it is not active
+ * for whatever reason, perform the checkpoint ourselves.
*/
- XLogCtl->SharedRecoveryProcessingMode = false;
+ if (SetRedoLocationForArchiveCheckpoint(redo))
+ {
+ /*
+ * Okay, we can come up now. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
- /*
- * Now request checkpoint
- */
- RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ /*
+ * Now request checkpoint from bgwriter.
+ */
+ RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ }
+ else
+ {
+ /*
+ * Startup process performs the checkpoint, but defers
+ * the change in processing mode until afterwards.
+ */
+ CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ }
}
else
{
@@ -5585,16 +6151,15 @@ StartupXLOG(void)
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
-
- /*
- * Okay, we're officially UP.
- */
- XLogCtl->SharedRecoveryProcessingMode = false;
}
+ /*
+ * Okay, we can come up now. Allow others to write WAL.
+ */
+ XLogCtl->SharedRecoveryProcessingMode = false;
+
/* start the archive_timeout timer running */
XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
-
}
/*
@@ -5615,6 +6180,9 @@ IsRecoveryProcessingMode(void)
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = XLogCtl;
+ if (xlogctl == NULL)
+ return false;
+
SpinLockAcquire(&xlogctl->mode_lck);
LocalRecoveryProcessingMode = XLogCtl->SharedRecoveryProcessingMode;
SpinLockRelease(&xlogctl->mode_lck);
@@ -5882,7 +6450,7 @@ LogCheckpointStart(int flags)
{
if (flags & CHECKPOINT_RESTARTPOINT)
elog(LOG, "restartpoint starting:%s",
- (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
+ (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
else
elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
@@ -6020,51 +6588,51 @@ CreateCheckPoint(int flags)
checkPoint.ThisTimeLineID = ThisTimeLineID;
checkPoint.time = (pg_time_t) time(NULL);
- if (leavingArchiveRecovery)
- checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
- else
- {
- /*
- * We must hold WALInsertLock while examining insert state to determine
- * the checkpoint REDO pointer.
- */
- LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
+ /*
+ * We must hold WALInsertLock while examining insert state to determine
+ * the checkpoint REDO pointer.
+ */
+ LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
- /*
- * If this isn't a shutdown or forced checkpoint, and we have not inserted
- * any XLOG records since the start of the last checkpoint, skip the
- * checkpoint. The idea here is to avoid inserting duplicate checkpoints
- * when the system is idle. That wastes log space, and more importantly it
- * exposes us to possible loss of both current and previous checkpoint
- * records if the machine crashes just as we're writing the update.
- * (Perhaps it'd make even more sense to checkpoint only when the previous
- * checkpoint record is in a different xlog page?)
- *
- * We have to make two tests to determine that nothing has happened since
- * the start of the last checkpoint: current insertion point must match
- * the end of the last checkpoint record, and its redo pointer must point
- * to itself.
- */
- if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+ /*
+ * If this isn't a shutdown or forced checkpoint, and we have not inserted
+ * any XLOG records since the start of the last checkpoint, skip the
+ * checkpoint. The idea here is to avoid inserting duplicate checkpoints
+ * when the system is idle. That wastes log space, and more importantly it
+ * exposes us to possible loss of both current and previous checkpoint
+ * records if the machine crashes just as we're writing the update.
+ * (Perhaps it'd make even more sense to checkpoint only when the previous
+ * checkpoint record is in a different xlog page?)
+ *
+ * We have to make two tests to determine that nothing has happened since
+ * the start of the last checkpoint: current insertion point must match
+ * the end of the last checkpoint record, and its redo pointer must point
+ * to itself.
+ */
+ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_FORCE)) == 0)
+ {
+ XLogRecPtr curInsert;
+
+ INSERT_RECPTR(curInsert, Insert, Insert->curridx);
+ if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
+ curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
+ MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
+ ControlFile->checkPoint.xlogid ==
+ ControlFile->checkPointCopy.redo.xlogid &&
+ ControlFile->checkPoint.xrecoff ==
+ ControlFile->checkPointCopy.redo.xrecoff)
{
- XLogRecPtr curInsert;
-
- INSERT_RECPTR(curInsert, Insert, Insert->curridx);
- if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
- curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
- MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
- ControlFile->checkPoint.xlogid ==
- ControlFile->checkPointCopy.redo.xlogid &&
- ControlFile->checkPoint.xrecoff ==
- ControlFile->checkPointCopy.redo.xrecoff)
- {
- LWLockRelease(WALInsertLock);
- LWLockRelease(CheckpointLock);
- END_CRIT_SECTION();
- return;
- }
+ LWLockRelease(WALInsertLock);
+ LWLockRelease(CheckpointLock);
+ END_CRIT_SECTION();
+ return;
}
+ }
+ if (leavingArchiveRecovery)
+ checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
+ else
+ {
/*
* Compute new REDO record ptr = location of next XLOG record.
*
@@ -6074,15 +6642,15 @@ CreateCheckPoint(int flags)
* checkpoint, even though physically before it. Got that?
*/
checkPoint.redo = GetRedoLocationForCheckpoint();
-
- /*
- * Now we can release WAL insert lock, allowing other xacts to proceed
- * while we are flushing disk buffers.
- */
- LWLockRelease(WALInsertLock);
}
/*
+ * Now we can release WAL insert lock, allowing other xacts to proceed
+ * while we are flushing disk buffers.
+ */
+ LWLockRelease(WALInsertLock);
+
+ /*
* If enabled, log checkpoint start. We postpone this until now so as not
* to log anything if we decided to skip the checkpoint.
*/
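The skip test hoisted out of the else-branch above compares WAL positions: a checkpoint is redundant exactly when the current insert position sits just past the last checkpoint record and that record's redo pointer points at itself. A rough stand-alone restatement, using a flat 64-bit position instead of the real (xlogid, xrecoff) pair to sidestep segmented WAL arithmetic:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t WalPos;            /* flat stand-in for XLogRecPtr */

    typedef struct ControlData
    {
        WalPos  checkPoint;             /* start of last checkpoint record */
        WalPos  checkPointRedo;         /* redo pointer stored in that record */
    } ControlData;

    static bool
    checkpoint_would_be_redundant(WalPos curInsert, const ControlData *ctl,
                                  WalPos checkpointRecSize)
    {
        /* Nothing inserted since the last checkpoint record was written. */
        return curInsert == ctl->checkPoint + checkpointRecSize &&
               ctl->checkPoint == ctl->checkPointRedo;
    }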
@@ -6199,18 +6767,15 @@ CreateCheckPoint(int flags)
* that this is executed by bgwriter after the death of Startup process.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
if (shutdown)
ControlFile->state = DB_SHUTDOWNED;
else
ControlFile->state = DB_IN_PRODUCTION;
-
ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ProcLastRecPtr;
ControlFile->checkPointCopy = checkPoint;
ControlFile->time = (pg_time_t) time(NULL);
UpdateControlFile();
-
LWLockRelease(ControlFileLock);
if (leavingArchiveRecovery)
@@ -6223,9 +6788,9 @@ CreateCheckPoint(int flags)
unlink(RECOVERY_COMMAND_DONE);
if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
ereport(ERROR,
- (errcode_for_file_access(),
+ (errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m",
- RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+ RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
}
/* Update shared-memory copy of checkpoint XID/epoch */
@@ -6271,10 +6836,10 @@ CreateCheckPoint(int flags)
* Truncate pg_subtrans if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_subtrans entry older than that (see Asserts
- * in subtrans.c). During recovery, though, we mustn't do this because
- * StartupSUBTRANS hasn't been called yet.
+ * in subtrans.c).
*/
- TruncateSUBTRANS(GetOldestXmin(true, false));
+ if (!shutdown)
+ TruncateSUBTRANS(GetOldestXmin(true, false));
/* All real work is done, but log before releasing lock. */
if (log_checkpoints)
@@ -6286,8 +6851,19 @@ CreateCheckPoint(int flags)
CheckpointStats.ckpt_segs_recycled);
LWLockRelease(CheckpointLock);
-}
+ /*
+ * Take a snapshot of running transactions and write this to WAL.
+ * This allows us to reconstruct the state of running transactions
+ * during archive recovery, if required.
+ *
+	 * If we are shutting down, or the Startup process is completing
+	 * crash recovery, we don't need to write running xact data.
+ */
+ if (!shutdown && !IsRecoveryProcessingMode())
+ LogCurrentRunningXacts();
+}
+
/*
* GetRedoLocationForCheckpoint()
*
@@ -6298,15 +6874,15 @@ static XLogRecPtr
GetRedoLocationForCheckpoint()
{
XLogCtlInsert *Insert = &XLogCtl->Insert;
- uint32 freespace;
- XLogRecPtr redo;
+ uint32 freespace;
+ XLogRecPtr redo;
freespace = INSERT_FREESPACE(Insert);
if (freespace < SizeOfXLogRecord)
{
- (void) AdvanceXLInsertBuffer(false);
- /* OK to ignore update return flag, since we will do flush anyway */
- freespace = INSERT_FREESPACE(Insert);
+ (void) AdvanceXLInsertBuffer(false);
+ /* OK to ignore update return flag, since we will do flush anyway */
+ freespace = INSERT_FREESPACE(Insert);
}
INSERT_RECPTR(redo, Insert, Insert->curridx);
@@ -6322,12 +6898,12 @@ GetRedoLocationForCheckpoint()
* their buffer changes are not included in the checkpoint.
*/
{
- /* use volatile pointer to prevent code rearrangement */
- volatile XLogCtlData *xlogctl = XLogCtl;
+ /* use volatile pointer to prevent code rearrangement */
+ volatile XLogCtlData *xlogctl = XLogCtl;
- SpinLockAcquire(&xlogctl->info_lck);
- RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
- SpinLockRelease(&xlogctl->info_lck);
+ SpinLockAcquire(&xlogctl->info_lck);
+ RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+ SpinLockRelease(&xlogctl->info_lck);
}
return redo;
@@ -6389,7 +6965,7 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
if (!(RmgrTable[rmid].rm_safe_restartpoint()))
{
- elog(DEBUG2, "RM %d not safe to record restart point at %X/%X",
+ elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
rmid,
checkPoint->redo.xlogid,
checkPoint->redo.xrecoff);
@@ -6401,30 +6977,30 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
}
/*
- * As of 8.4, RestartPoints are always created by the bgwriter
- * once we have reachedSafeStartPoint. We use bgwriter's shared memory
- * area wherever we call it from, to keep better code structure.
- */
+ * As of 8.4, RestartPoints are always created by the bgwriter
+ * once we have reachedSafeStartPoint. We use bgwriter's shared memory
+ * area wherever we call it from, to keep better code structure.
+ */
void
CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
{
- if (recoveryLogRestartpoints)
+ if (recoveryLogRestartpoints || log_checkpoints)
{
- /*
+ /*
* Prepare to accumulate statistics.
- */
+ */
MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
}
-
- /*
+
+ /*
* Acquire CheckpointLock to ensure only one restartpoint happens at a time.
* We rely on this lock to ensure that the startup process doesn't exit
* Recovery while we are half way through a restartpoint.
- */
+ */
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
@@ -6433,11 +7009,11 @@ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int
* Update pg_control, using current time
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
- ControlFile->prevCheckPoint = ControlFile->checkPoint;
+ ControlFile->prevCheckPoint = ControlFile->checkPoint;
ControlFile->checkPoint = ReadPtr;
ControlFile->checkPointCopy = *restartPoint;
- ControlFile->time = (pg_time_t) time(NULL);
- UpdateControlFile();
+ ControlFile->time = (pg_time_t) time(NULL);
+ UpdateControlFile();
LWLockRelease(ControlFileLock);
/*
@@ -6447,21 +7023,23 @@ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int
*/
/* All real work is done, but log before releasing lock. */
- if (recoveryLogRestartpoints)
+ if (recoveryLogRestartpoints || log_checkpoints)
LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
-
+
ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
- (errmsg("recovery restart point at %X/%X",
+ (errmsg("recovery restart point at %X/%X",
restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
- if (recoveryLastXTime)
- ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+ ReportCleanupDelayStats();
+
+ if (recoveryLastXTime)
+ ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
(errmsg("last completed transaction was at log time %s",
timestamptz_to_str(recoveryLastXTime))));
LWLockRelease(CheckpointLock);
}
-
+
/*
* Write a NEXTOID log record
*/
@@ -6554,7 +7132,7 @@ exitRecovery(void)
else
{
RequestRestartPointCompletion();
- ereport(LOG,
+ ereport(trace_recovery(DEBUG1),
(errmsg("startup process waiting for restartpoint to complete")));
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
LWLockRelease(CheckpointLock);
@@ -6594,6 +7172,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
{
Oid nextOid;
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
if (ShmemVariableCache->nextOid < nextOid)
{
@@ -6613,11 +7194,15 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
MultiXactSetNextMXact(checkPoint.nextMulti,
checkPoint.nextMultiOffset);
+ /* We know nothing was running on the master at this point */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
- /*
+ /*
* TLI no longer changes at shutdown checkpoint, since as of 8.4,
* shutdown checkpoints only occur at shutdown. Much less confusing.
*/
@@ -6630,6 +7215,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+ /* We know nothing was running on the master at this point */
+ ProcArrayClearRecoveryTransactions();
+ RelationClearRecoveryLocks();
+
/*
* TLI may change when recovery ends, but it shouldn't decrease.
*
@@ -6640,17 +7229,17 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
* new timelineID which is recorded using this record type.
*/
if (tli != ThisTimeLineID)
- {
+ {
if (tli < ThisTimeLineID ||
- !list_member_int(expectedTLIs,
+ !list_member_int(expectedTLIs,
(int) tli))
- ereport(PANIC,
+ ereport(PANIC,
(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
tli, ThisTimeLineID)));
- /* Following WAL records should be run with new TLI */
+ /* Following WAL records should be run with new TLI */
ThisTimeLineID = tli;
- }
- }
+ }
+ }
else if (info == XLOG_CHECKPOINT_ONLINE)
{
CheckPoint checkPoint;
@@ -6740,6 +7329,10 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
record->xl_prev.xlogid, record->xl_prev.xrecoff,
record->xl_xid);
+ appendStringInfo(buf, "; pxid %u len %u",
+ record->xl_topxid,
+ record->xl_len);
+
for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
{
if (record->xl_info & XLR_SET_BKP_BLOCK(i))
@@ -6895,6 +7488,12 @@ pg_start_backup(PG_FUNCTION_ARGS)
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to run a backup")));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
if (!XLogArchivingActive())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7067,6 +7666,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to run a backup"))));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
if (!XLogArchivingActive())
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -7228,6 +7833,12 @@ pg_switch_xlog(PG_FUNCTION_ARGS)
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be superuser to switch transaction log files"))));
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
switchpoint = RequestXLogSwitch();
/*
@@ -7250,6 +7861,12 @@ pg_current_xlog_location(PG_FUNCTION_ARGS)
{
char location[MAXFNAMELEN];
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
/* Make sure we have an up-to-date local LogwrtResult */
{
/* use volatile pointer to prevent code rearrangement */
@@ -7277,6 +7894,12 @@ pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
XLogRecPtr current_recptr;
char location[MAXFNAMELEN];
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("recovery is in progress"),
+ errhint("WAL control functions cannot be executed during recovery.")));
+
/*
* Get the current end-of-WAL position ... shared lock is sufficient
*/
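Each WAL control function above repeats the same refusal when recovery is in progress. A hedged sketch of factoring that into one guard; this helper is not part of the patch, and ereport() is approximated with fprintf/exit so the fragment stands alone:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    extern bool is_recovery_processing_mode(void);  /* see earlier sketch */

    /* Refuse to run a WAL control function during recovery. */
    static void
    prevent_wal_control_during_recovery(void)
    {
        if (is_recovery_processing_mode())
        {
            fprintf(stderr,
                    "ERROR:  recovery is in progress\n"
                    "HINT:  WAL control functions cannot be executed during recovery.\n");
            exit(EXIT_FAILURE);
        }
    }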
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 309fa469ad..cfbd9d3c46 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -404,6 +404,9 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
/* Backup blocks are not used in smgr records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
if (info == XLOG_SMGR_CREATE)
{
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 7e065762a8..54786e8a71 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -26,6 +26,7 @@
#include "access/genam.h"
#include "access/heapam.h"
+#include "access/transam.h"
#include "access/xact.h"
#include "access/xlogutils.h"
#include "catalog/catalog.h"
@@ -52,6 +53,7 @@
#include "utils/flatfiles.h"
#include "utils/fmgroids.h"
#include "utils/guc.h"
+#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/pg_locale.h"
#include "utils/snapmgr.h"
@@ -1954,6 +1956,14 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
+ if (InArchiveRecovery)
+ {
+ /*
+ * No conflict resolution is required for a create database record
+ */
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+ }
+
/*
* Our theory for replaying a CREATE is to forcibly drop the target
* subdirectory if present, then re-copy the source data. This may be
@@ -1987,6 +1997,28 @@ dbase_redo(XLogRecPtr lsn, XLogRecord *record)
dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
+ if (InArchiveRecovery &&
+ RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid))
+ {
+ VirtualTransactionId *database_users;
+
+ /*
+ * Find all users connected to this database and ask them
+ * politely to kill themselves before processing the
+ * drop database record, after the usual grace period.
+ * We don't wait for commit because drop database is
+ * non-transactional.
+ */
+ database_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+ xlrec->db_id,
+ InvalidTransactionId);
+
+ ResolveRecoveryConflictWithVirtualXIDs(database_users,
+ "drop database",
+ FATAL,
+ InvalidXLogRecPtr);
+ }
+
/* Drop pages for this database that are in the shared buffer cache */
DropDatabaseBuffers(xlrec->db_id);
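ResolveRecoveryConflictWithVirtualXIDs itself is not shown in this hunk, so the following is only a guess at its core shape: walk the terminator-ended array returned by GetConflictingVirtualXIDs and cancel each conflicting backend in turn. All names are illustrative:

    #include <stdint.h>

    typedef struct VirtualTransactionId
    {
        int32_t  backendId;             /* <= 0 marks the terminator */
        uint32_t localTransactionId;
    } VirtualTransactionId;

    /* Assumed to signal the backend, e.g. with FATAL for drop database. */
    extern void cancel_conflicting_backend(VirtualTransactionId vxid,
                                           const char *reason);

    static void
    resolve_recovery_conflicts(VirtualTransactionId *conflicts,
                               const char *reason)
    {
        /* The array ends with an invalid VirtualTransactionId. */
        for (; conflicts->backendId > 0; conflicts++)
            cancel_conflicting_backend(*conflicts, reason);
    }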
diff --git a/src/backend/commands/discard.c b/src/backend/commands/discard.c
index 348e6e033f..9623a6bd77 100644
--- a/src/backend/commands/discard.c
+++ b/src/backend/commands/discard.c
@@ -65,7 +65,8 @@ DiscardAll(bool isTopLevel)
ResetAllOptions();
DropAllPreparedStatements();
PortalHashTableDeleteAll();
- Async_UnlistenAll();
+ if (!IsRecoveryProcessingMode())
+ Async_UnlistenAll();
LockReleaseAll(USER_LOCKMETHOD, true);
ResetPlanCache();
ResetTempTableNamespace();
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 9f7cbc8dbd..2ac9806a0e 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -648,7 +648,7 @@ DefineIndex(RangeVar *heapRelation,
* Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
* check for that.
*/
- old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, false,
+ old_snapshots = GetCurrentVirtualXIDs(snapshot->xmax, MyDatabaseId,
PROC_IS_AUTOVACUUM | PROC_IN_VACUUM);
while (VirtualTransactionIdIsValid(*old_snapshots))
diff --git a/src/backend/commands/lockcmds.c b/src/backend/commands/lockcmds.c
index e32b184852..fe1e518694 100644
--- a/src/backend/commands/lockcmds.c
+++ b/src/backend/commands/lockcmds.c
@@ -48,6 +48,16 @@ LockTableCommand(LockStmt *lockstmt)
reloid = RangeVarGetRelid(relation, false);
+ /*
+ * During recovery we only accept these variations:
+ *
+	 * LOCK TABLE foo -- parser translates as AccessExclusiveLock request
+ * LOCK TABLE foo IN AccessShareLock MODE
+ * LOCK TABLE foo IN AccessExclusiveLock MODE
+ */
+ if (!(lockstmt->mode == AccessShareLock || lockstmt->mode == AccessExclusiveLock))
+ PreventCommandDuringRecovery();
+
if (recurse)
children_and_self = find_all_inheritors(reloid);
else
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index 46d7683377..134b7fb139 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -457,6 +457,8 @@ nextval_internal(Oid relid)
rescnt = 0;
bool logit = false;
+ PreventCommandDuringRecovery();
+
/* open and AccessShareLock sequence */
init_sequence(relid, &elm, &seqrel);
@@ -1342,6 +1344,11 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record)
/* Backup blocks are not used in seq records */
Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+ RestoreBkpBlocks(lsn, record, false);
+
if (info != XLOG_SEQ_LOG)
elog(PANIC, "seq_redo: unknown op code %u", info);
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 75f772f0e4..ad5581ab06 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -51,6 +51,7 @@
#include "access/heapam.h"
#include "access/sysattr.h"
#include "access/xact.h"
+#include "access/transam.h"
#include "catalog/catalog.h"
#include "catalog/dependency.h"
#include "catalog/indexing.h"
@@ -60,10 +61,12 @@
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
+#include "storage/procarray.h"
#include "utils/acl.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/guc.h"
+#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/rel.h"
@@ -1285,6 +1288,15 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
char *location = xlrec->ts_path;
char *linkloc;
+ if (InArchiveRecovery)
+ {
+ /*
+		 * No conflict resolution is required for a create tablespace record
+ */
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid,
+ record->xl_xid);
+ }
+
/*
* Attempt to coerce target directory to safe permissions. If this
* fails, it doesn't exist or has the wrong owner.
@@ -1316,12 +1328,71 @@ tblspc_redo(XLogRecPtr lsn, XLogRecord *record)
else if (info == XLOG_TBLSPC_DROP)
{
xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
+ bool process_conflicts = false;
+ /*
+ * Process recovery transaction information
+ */
+ if (InArchiveRecovery)
+ process_conflicts = RecordKnownAssignedTransactionIds(lsn,
+ record->xl_topxid,
+ record->xl_xid);
+ /*
+ * If we issued a WAL record for a drop tablespace it is
+ * because there were no files in it at all. That means that
+ * no permanent objects can exist in it at this point.
+ *
+	 * It is possible for standby users to be using this tablespace
+	 * as a location for their temporary files, so if we fail to
+	 * remove all files we do conflict processing (if currently
+	 * enabled) and then try again.
+ */
if (!remove_tablespace_directories(xlrec->ts_id, true))
- ereport(ERROR,
+ {
+ if (process_conflicts)
+ {
+ VirtualTransactionId *temp_file_users;
+
+ /*
+			 * Standby users may be currently using this tablespace
+			 * for their temporary files. We only care about current
+			 * users because the temp_tablespaces parameter will just ignore
+ * tablespaces that no longer exist.
+ *
+ * We can work out the pids of currently active backends using
+ * this tablespace by examining the temp filenames in the
+ * directory. We then convert the pids into VirtualXIDs before
+ * attempting to cancel them.
+ *
+			 * We don't wait for commit because drop tablespace is
+ * non-transactional.
+ *
+ * XXXHS: that's the theory, but right now we choose to nuke the
+			 * entire site from orbit, since it's the only way to be sure,
+ * after the usual grace period.
+ */
+ temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+ InvalidOid,
+ InvalidOid);
+
+ ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
+ "drop tablespace",
+ ERROR,
+ InvalidXLogRecPtr);
+ }
+
+ /*
+			 * If we did recovery conflict processing then the backends
+			 * that wrote temp files should have cleaned up and exited
+			 * by now, so let's recheck before we throw an error.
+			 * If !process_conflicts then this will just fail again.
+ */
+ if (!remove_tablespace_directories(xlrec->ts_id, true))
+ ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("tablespace %u is not empty",
- xlrec->ts_id)));
+ xlrec->ts_id)));
+ }
}
else
elog(PANIC, "tblspc_redo: unknown op code %u", info);
diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c
index 783de0246a..664a135390 100644
--- a/src/backend/commands/user.c
+++ b/src/backend/commands/user.c
@@ -1491,3 +1491,4 @@ DelRoleMems(const char *rolename, Oid roleid,
*/
heap_close(pg_authmem_rel, NoLock);
}
+
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 9b46c858f0..1599506375 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -141,6 +141,7 @@ typedef struct VRelStats
/* vtlinks array for tuple chain following - sorted by new_tid */
int num_vtlinks;
VTupleLink vtlinks;
+ TransactionId latestRemovedXid;
} VRelStats;
/*----------------------------------------------------------------------
@@ -224,7 +225,7 @@ static void scan_heap(VRelStats *vacrelstats, Relation onerel,
static void repair_frag(VRelStats *vacrelstats, Relation onerel,
VacPageList vacuum_pages, VacPageList fraged_pages,
int nindexes, Relation *Irel);
-static void move_chain_tuple(Relation rel,
+static void move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd);
@@ -237,7 +238,7 @@ static void update_hint_bits(Relation rel, VacPageList fraged_pages,
int num_moved);
static void vacuum_heap(VRelStats *vacrelstats, Relation onerel,
VacPageList vacpagelist);
-static void vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage);
+static void vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage);
static void vacuum_index(VacPageList vacpagelist, Relation indrel,
double num_tuples, int keep_tuples);
static void scan_index(Relation indrel, double num_tuples);
@@ -1271,6 +1272,7 @@ full_vacuum_rel(Relation onerel, VacuumStmt *vacstmt)
vacrelstats->rel_tuples = 0;
vacrelstats->rel_indexed_tuples = 0;
vacrelstats->hasindex = false;
+ vacrelstats->latestRemovedXid = InvalidTransactionId;
/* scan the heap */
vacuum_pages.num_pages = fraged_pages.num_pages = 0;
@@ -1674,6 +1676,9 @@ scan_heap(VRelStats *vacrelstats, Relation onerel,
{
ItemId lpp;
+ HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+ &vacrelstats->latestRemovedXid);
+
/*
* Here we are building a temporary copy of the page with dead
* tuples removed. Below we will apply
@@ -1987,7 +1992,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
/* there are dead tuples on this page - clean them */
Assert(!isempty);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- vacuum_page(onerel, buf, last_vacuum_page);
+ vacuum_page(vacrelstats, onerel, buf, last_vacuum_page);
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
else
@@ -2476,7 +2481,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid);
tuple_len = tuple.t_len = ItemIdGetLength(Citemid);
- move_chain_tuple(onerel, Cbuf, Cpage, &tuple,
+ move_chain_tuple(vacrelstats, onerel, Cbuf, Cpage, &tuple,
dst_buffer, dst_page, destvacpage,
&ec, &Ctid, vtmove[ti].cleanVpd);
@@ -2562,7 +2567,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
dst_page = BufferGetPage(dst_buffer);
/* if this page was not used before - clean it */
if (!PageIsEmpty(dst_page) && dst_vacpage->offsets_used == 0)
- vacuum_page(onerel, dst_buffer, dst_vacpage);
+ vacuum_page(vacrelstats, onerel, dst_buffer, dst_vacpage);
}
else
LockBuffer(dst_buffer, BUFFER_LOCK_EXCLUSIVE);
@@ -2739,7 +2744,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
page = BufferGetPage(buf);
if (!PageIsEmpty(page))
- vacuum_page(onerel, buf, *curpage);
+ vacuum_page(vacrelstats, onerel, buf, *curpage);
UnlockReleaseBuffer(buf);
}
}
@@ -2875,7 +2880,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
recptr = log_heap_clean(onerel, buf,
NULL, 0, NULL, 0,
unused, uncnt,
- false);
+ vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
@@ -2925,7 +2930,7 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
* already too long and almost unreadable.
*/
static void
-move_chain_tuple(Relation rel,
+move_chain_tuple(VRelStats *vacrelstats, Relation rel,
Buffer old_buf, Page old_page, HeapTuple old_tup,
Buffer dst_buf, Page dst_page, VacPage dst_vacpage,
ExecContext ec, ItemPointer ctid, bool cleanVpd)
@@ -2981,7 +2986,7 @@ move_chain_tuple(Relation rel,
int sv_offsets_used = dst_vacpage->offsets_used;
dst_vacpage->offsets_used = 0;
- vacuum_page(rel, dst_buf, dst_vacpage);
+ vacuum_page(vacrelstats, rel, dst_buf, dst_vacpage);
dst_vacpage->offsets_used = sv_offsets_used;
}
@@ -3305,7 +3310,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
buf = ReadBufferExtended(onerel, MAIN_FORKNUM, (*vacpage)->blkno,
RBM_NORMAL, vac_strategy);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
- vacuum_page(onerel, buf, *vacpage);
+ vacuum_page(vacrelstats, onerel, buf, *vacpage);
UnlockReleaseBuffer(buf);
}
}
@@ -3335,7 +3340,7 @@ vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages)
* Caller must hold pin and lock on buffer.
*/
static void
-vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
+vacuum_page(VRelStats *vacrelstats, Relation onerel, Buffer buffer, VacPage vacpage)
{
Page page = BufferGetPage(buffer);
int i;
@@ -3364,7 +3369,7 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
vacpage->offsets, vacpage->offsets_free,
- false);
+ vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 59c02e2083..e2bedf3bd1 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -97,6 +97,7 @@ typedef struct LVRelStats
ItemPointer dead_tuples; /* array of ItemPointerData */
int num_index_scans;
bool scanned_all; /* have we scanned all pages (this far)? */
+ TransactionId latestRemovedXid;
} LVRelStats;
@@ -246,6 +247,36 @@ lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
*scanned_all = vacrelstats->scanned_all;
}
+/*
+ * For Hot Standby we need to know the highest transaction id that will
+ * be removed by any change. VACUUM proceeds in a number of passes so
+ * we need to consider how each pass operates. The first pass runs
+ * heap_page_prune(), which can issue XLOG_HEAP2_CLEAN records as it
+ * progresses - these will have a latestRemovedXid on each record.
+ * In many cases this removes all of the tuples to be removed.
+ * Then we look at tuples to be removed, but do not actually remove them
+ * until phase three. However, index records for those rows are removed
+ * in phase two and index blocks do not have MVCC information attached.
+ * So before we can allow removal of *any* index tuples we need to issue
+ * a WAL record indicating what the latestRemovedXid will be at the end
+ * of phase three. This then allows Hot Standby queries to block at the
+ * correct place, i.e. before phase two, rather than during phase three
+ * as we issue more XLOG_HEAP2_CLEAN records. If memory constraints force
+ * us to run phases two and three multiple times, we must issue multiple
+ * log records as well.
+ */
+static void
+vacuum_log_cleanup_info(Relation rel, LVRelStats *vacrelstats)
+{
+ /*
+	 * No need to log changes for temp tables, since they do not
+	 * contain data visible on the standby server.
+ */
+ if (rel->rd_istemp)
+ return;
+
+ (void) log_heap_cleanup_info(rel->rd_node, vacrelstats->latestRemovedXid);
+}
/*
* lazy_scan_heap() -- scan an open heap relation
@@ -296,6 +327,7 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
nblocks = RelationGetNumberOfBlocks(onerel);
vacrelstats->rel_pages = nblocks;
vacrelstats->nonempty_pages = 0;
+ vacrelstats->latestRemovedXid = InvalidTransactionId;
lazy_space_alloc(vacrelstats, nblocks);
@@ -354,6 +386,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
if ((vacrelstats->max_dead_tuples - vacrelstats->num_dead_tuples) < MaxHeapTuplesPerPage &&
vacrelstats->num_dead_tuples > 0)
{
+ /* Log cleanup info before we touch indexes */
+ vacuum_log_cleanup_info(onerel, vacrelstats);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
@@ -593,6 +628,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
if (tupgone)
{
lazy_record_dead_tuple(vacrelstats, &(tuple.t_self));
+ HeapTupleHeaderAdvanceLatestRemovedXid(tuple.t_data,
+ &vacrelstats->latestRemovedXid);
tups_vacuumed += 1;
}
else
@@ -703,6 +740,9 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
/* XXX put a threshold on min number of tuples here? */
if (vacrelstats->num_dead_tuples > 0)
{
+ /* Log cleanup info before we touch indexes */
+ vacuum_log_cleanup_info(onerel, vacrelstats);
+
/* Remove index entries */
for (i = 0; i < nindexes; i++)
lazy_vacuum_index(Irel[i],
@@ -847,7 +887,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
recptr = log_heap_clean(onerel, buffer,
NULL, 0, NULL, 0,
unused, uncnt,
- false);
+ vacrelstats->latestRemovedXid, false);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
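HeapTupleHeaderAdvanceLatestRemovedXid is called in both VACUUM paths above but its body is not part of this diff. A plausible reading is that it keeps a running maximum of the xids being removed, roughly as below; real TransactionId comparison is wraparound-aware (TransactionIdFollows), and plain > is used here only to keep the sketch self-contained:

    #include <stdint.h>

    typedef uint32_t TransactionId;
    #define InvalidTransactionId ((TransactionId) 0)

    static void
    advance_latest_removed_xid(TransactionId removedXid,
                               TransactionId *latestRemovedXid)
    {
        /* Remember the highest xid removed so far. */
        if (removedXid != InvalidTransactionId &&
            (*latestRemovedXid == InvalidTransactionId ||
             removedXid > *latestRemovedXid))
            *latestRemovedXid = removedXid;
    }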
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 3163fd3c1b..fb479ffe18 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -210,6 +210,12 @@ BackgroundWriterMain(void)
BgWriterShmem->bgwriter_pid = MyProcPid;
am_bg_writer = true;
+ BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+ if (BgWriterRecoveryMode)
+ elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
+ BgWriterShmem->bgwriter_pid);
+
/*
* If possible, make this process a group leader, so that the postmaster
* can signal any child processes too. (bgwriter probably never has any
@@ -364,12 +370,6 @@ BackgroundWriterMain(void)
*/
PG_SETMASK(&UnBlockSig);
- BgWriterRecoveryMode = IsRecoveryProcessingMode();
-
- if (BgWriterRecoveryMode)
- elog(DEBUG1, "bgwriter starting during recovery, pid = %u",
- BgWriterShmem->bgwriter_pid);
-
/*
* Loop forever
*/
@@ -382,101 +382,101 @@ BackgroundWriterMain(void)
if (!PostmasterIsAlive(true))
exit(1);
- /*
- * Process any requests or signals received recently.
- */
- AbsorbFsyncRequests();
-
if (got_SIGHUP)
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
- if (BgWriterRecoveryMode)
- {
- if (shutdown_requested)
- {
- /*
- * From here on, elog(ERROR) should end with exit(1), not send
- * control back to the sigsetjmp block above
- */
- ExitOnAnyError = true;
- /* Normal exit from the bgwriter is here */
- proc_exit(0); /* done */
- }
-
- if (!IsRecoveryProcessingMode())
- {
- elog(DEBUG2, "bgwriter changing from recovery to normal mode");
-
- InitXLOGAccess();
- BgWriterRecoveryMode = false;
-
- /*
- * Start time-driven events from now
- */
- last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
-
- /*
- * Notice that we do *not* act on a checkpoint_requested
- * state at this point. We have changed mode, so we wish to
- * perform a checkpoint not a restartpoint.
- */
- continue;
- }
-
- if (checkpoint_requested)
- {
- XLogRecPtr ReadPtr;
- CheckPoint restartPoint;
-
- checkpoint_requested = false;
-
- /*
- * Initialize bgwriter-private variables used during checkpoint.
- */
- ckpt_active = true;
- ckpt_start_time = (pg_time_t) time(NULL);
- ckpt_cached_elapsed = 0;
-
- /*
- * Get the requested values from shared memory that the
- * Startup process has put there for us.
- */
- SpinLockAcquire(&BgWriterShmem->ckpt_lck);
- ReadPtr = BgWriterShmem->ReadPtr;
- memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
- SpinLockRelease(&BgWriterShmem->ckpt_lck);
-
- /* Use smoothed writes, until interrupted if ever */
- CreateRestartPoint(ReadPtr, &restartPoint, 0);
-
- /*
- * After any checkpoint, close all smgr files. This is so we
- * won't hang onto smgr references to deleted files indefinitely.
- */
- smgrcloseall();
-
- ckpt_active = false;
- checkpoint_requested = false;
- }
- else
- {
- /* Clean buffers dirtied by recovery */
- BgBufferSync();
-
- /* Nap for the configured time. */
- BgWriterNap();
- }
- }
+ if (BgWriterRecoveryMode)
+ {
+ if (shutdown_requested)
+ {
+ /*
+ * From here on, elog(ERROR) should end with exit(1), not send
+ * control back to the sigsetjmp block above
+ */
+ ExitOnAnyError = true;
+ /* Normal exit from the bgwriter is here */
+ proc_exit(0); /* done */
+ }
+
+ if (!IsRecoveryProcessingMode())
+ {
+ elog(DEBUG2, "bgwriter changing from recovery to normal mode");
+
+ InitXLOGAccess();
+ BgWriterRecoveryMode = false;
+
+ /*
+ * Start time-driven events from now
+ */
+ last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
+
+ /*
+ * Notice that we do *not* act on a checkpoint_requested
+ * state at this point. We have changed mode, so we wish to
+ * perform a checkpoint not a restartpoint.
+ */
+ continue;
+ }
+
+ if (checkpoint_requested)
+ {
+ XLogRecPtr ReadPtr;
+ CheckPoint restartPoint;
+
+ checkpoint_requested = false;
+
+ /*
+ * Initialize bgwriter-private variables used during checkpoint.
+ */
+ ckpt_active = true;
+ ckpt_start_time = (pg_time_t) time(NULL);
+ ckpt_cached_elapsed = 0;
+
+ /*
+ * Get the requested values from shared memory that the
+ * Startup process has put there for us.
+ */
+ SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ ReadPtr = BgWriterShmem->ReadPtr;
+ memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
+ SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ /* Use smoothed writes, until interrupted if ever */
+ CreateRestartPoint(ReadPtr, &restartPoint, 0);
+
+ /*
+ * After any checkpoint, close all smgr files. This is so we
+ * won't hang onto smgr references to deleted files indefinitely.
+ */
+ smgrcloseall();
+
+ ckpt_active = false;
+ checkpoint_requested = false;
+ }
+ else
+ {
+ /* Clean buffers dirtied by recovery */
+ BgBufferSync();
+
+ /* Nap for the configured time. */
+ BgWriterNap();
+ }
+ }
else /* Normal processing */
- {
+ {
bool do_checkpoint = false;
int flags = 0;
pg_time_t now;
int elapsed_secs;
+ /*
+ * Process any requests or signals received recently.
+ */
+ AbsorbFsyncRequests();
+
if (checkpoint_requested)
{
checkpoint_requested = false;
@@ -1122,14 +1122,6 @@ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bo
if (BgWriterShmem->bgwriter_pid == 0)
elog(LOG, "could not request restartpoint because bgwriter not running");
-#ifdef NOT_USED
- elog(LOG, "tli = %u nextXidEpoch = %u nextXid = %u nextOid = %u",
- restartPoint->ThisTimeLineID,
- restartPoint->nextXidEpoch,
- restartPoint->nextXid,
- restartPoint->nextOid);
-#endif
-
SpinLockAcquire(&BgWriterShmem->ckpt_lck);
BgWriterShmem->ReadPtr = ReadPtr;
memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
@@ -1164,12 +1156,22 @@ GetRedoLocationForArchiveCheckpoint(void)
return redo;
}
-void
+/*
+ * Store the information needed for a checkpoint at the end of recovery.
+ * Returns true if bgwriter can perform checkpoint, or false if bgwriter
+ * not active or otherwise unable to comply.
+ */
+bool
SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
{
SpinLockAcquire(&BgWriterShmem->ckpt_lck);
BgWriterShmem->ReadPtr = redo;
SpinLockRelease(&BgWriterShmem->ckpt_lck);
+
+ if (BgWriterShmem->bgwriter_pid == 0 || !IsPostmasterEnvironment)
+ return false;
+
+ return true;
}
/*
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5cb84be4b8..9c026313c3 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -230,8 +230,10 @@ static bool FatalError = false; /* T if recovering from backend crash */
* We use a simple state machine to control startup, shutdown, and
* crash recovery (which is rather like shutdown followed by startup).
*
- * Normal child backends can only be launched when we are in PM_RUN state.
- * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
+ * Normal child backends can only be launched when we are in PM_RUN or
+ * PM_RECOVERY state. Any transaction started in PM_RECOVERY state will
+ * be read-only for the whole of its life. (We also allow launch of normal
+ * child backends in PM_WAIT_BACKUP state, but only for superusers.)
* In other states we handle connection requests by launching "dead_end"
* child processes, which will simply send the client an error message and
* quit. (We track these in the BackendList so that we can know when they
@@ -1656,11 +1658,6 @@ retry1:
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
errmsg("the database system is shutting down")));
break;
- case CAC_RECOVERY:
- ereport(FATAL,
- (errcode(ERRCODE_CANNOT_CONNECT_NOW),
- errmsg("the database system is in recovery mode")));
- break;
case CAC_TOOMANY:
ereport(FATAL,
(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
@@ -1669,6 +1666,7 @@ retry1:
case CAC_WAITBACKUP:
/* OK for now, will check in InitPostgres */
break;
+ case CAC_RECOVERY:
case CAC_OK:
break;
}
@@ -1987,10 +1985,11 @@ pmdie(SIGNAL_ARGS)
ereport(LOG,
(errmsg("received smart shutdown request")));
- if (pmState == PM_RUN)
+ if (pmState == PM_RUN || pmState == PM_RECOVERY)
{
/* autovacuum workers are told to shut down immediately */
- SignalAutovacWorkers(SIGTERM);
+ if (pmState == PM_RUN)
+ SignalAutovacWorkers(SIGTERM);
/* and the autovac launcher too */
if (AutoVacPID != 0)
signal_child(AutoVacPID, SIGTERM);
@@ -2024,7 +2023,7 @@ pmdie(SIGNAL_ARGS)
if (StartupPID != 0)
signal_child(StartupPID, SIGTERM);
- if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
+ if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_WAIT_BACKUP)
{
ereport(LOG,
(errmsg("aborting any active transactions")));
@@ -2120,8 +2119,11 @@ reaper(SIGNAL_ARGS)
*/
if (pid == StartupPID)
{
+ bool leavingRecovery = (pmState == PM_RECOVERY);
+
StartupPID = 0;
- Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
+ Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
+ pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS);
/* FATAL exit of startup is treated as catastrophic */
if (!EXIT_STATUS_0(exitstatus))
@@ -2129,7 +2131,7 @@ reaper(SIGNAL_ARGS)
LogChildExit(LOG, _("startup process"),
pid, exitstatus);
ereport(LOG,
- (errmsg("aborting startup due to startup process failure")));
+ (errmsg("aborting startup due to startup process failure")));
ExitPostmaster(1);
}
@@ -2166,7 +2168,7 @@ reaper(SIGNAL_ARGS)
* already running.
*/
if (BgWriterPID == 0)
- BgWriterPID = StartBackgroundWriter();
+ BgWriterPID = StartBackgroundWriter();
/*
* Likewise, start other special children as needed. In a restart
@@ -2182,8 +2184,12 @@ reaper(SIGNAL_ARGS)
PgStatPID = pgstat_start();
/* at this point we are really open for business */
- ereport(LOG,
- (errmsg("database system is ready to accept connections")));
+ if (leavingRecovery)
+ ereport(LOG,
+ (errmsg("database can now be accessed with read and write transactions")));
+ else
+ ereport(LOG,
+ (errmsg("database system is ready to accept connections")));
continue;
}
@@ -2903,7 +2909,8 @@ BackendStartup(Port *port)
bn->pid = pid;
bn->cancel_key = MyCancelKey;
bn->is_autovacuum = false;
- bn->dead_end = (port->canAcceptConnections != CAC_OK &&
+ bn->dead_end = (!(port->canAcceptConnections == CAC_RECOVERY ||
+ port->canAcceptConnections == CAC_OK) &&
port->canAcceptConnections != CAC_WAITBACKUP);
DLAddHead(BackendList, DLNewElem(bn));
#ifdef EXEC_BACKEND
@@ -3854,8 +3861,6 @@ sigusr1_handler(SIGNAL_ARGS)
if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
{
- Assert(pmState == PM_STARTUP);
-
/*
* Go to shutdown mode if a shutdown request was pending.
*/
@@ -3864,11 +3869,12 @@ sigusr1_handler(SIGNAL_ARGS)
pmState = PM_WAIT_BACKENDS;
/* PostmasterStateMachine logic does the rest */
}
- else
+ else if (pmState == PM_STARTUP)
{
/*
* Startup process has entered recovery
*/
+
pmState = PM_RECOVERY;
/*
@@ -3891,9 +3897,11 @@ sigusr1_handler(SIGNAL_ARGS)
Assert(PgStatPID == 0);
PgStatPID = pgstat_start();
- /* XXX at this point we could accept read-only connections */
- ereport(DEBUG1,
- (errmsg("database system is in consistent recovery mode")));
+ /* We can now accept read-only connections */
+ ereport(LOG,
+ (errmsg("database system is ready to accept connections")));
+ ereport(LOG,
+ (errmsg("database can now be accessed with read only transactions")));
}
}
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index a7b81e37a7..7fb1621c88 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -270,10 +270,11 @@ We might miss a hint-bit update or two but that isn't a problem, for the same
reasons mentioned under buffer access rules.
As of 8.4, background writer starts during recovery mode when there is
-some form of potentially extended recovery to perform. It performs an
-identical service to normal processing, except that checkpoints it
-writes are technically restartpoints. Flushing outstanding WAL for dirty
-buffers is also skipped, though there shouldn't ever be new WAL entries
-at that time in any case. We could choose to start background writer
-immediately but we hold off until we can prove the database is in a
-consistent state so that postmaster has a single, clean state change.
+some form of potentially extended recovery to perform. We clean
+dirty blocks and perform restartpoints when requested by the startup
+process. Most other bgwriter functions are skipped, such as flushing
+outstanding WAL for dirty buffers since no new WAL has been written.
+We could choose to start background writer immediately but we wait until we
+can prove the database is in a consistent state. This allows the postmaster
+to have a single, clean state change between the initial stages of recovery
+and the main recovery mode.
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index bd053d503d..4108e2578b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -33,6 +33,7 @@
#include <sys/file.h>
#include <unistd.h>
+#include "access/xlogdefs.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
#include "pg_trace.h"
@@ -78,7 +79,13 @@ static bool IsForInput;
/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;
+static long CleanupWaitSecs = 0;
+static int CleanupWaitUSecs = 0;
+static bool CleanupWaitStats = false;
+/* local state for recovery conflict processing */
+static bool BufferRecoveryConflictPending = false;
+static XLogRecPtr BufferRecoveryConflictLSN;
static Buffer ReadBuffer_common(SMgrRelation reln, bool isLocalBuf,
ForkNumber forkNum, BlockNumber blockNum,
@@ -100,7 +107,8 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
bool *foundPtr);
static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
-
+static void BufferProcessRecoveryConflictsIfAny(volatile BufferDesc *bufHdr);
+
/*
* PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -306,6 +314,8 @@ ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
/* Just need to update stats before we exit */
*hit = true;
+ BufferProcessRecoveryConflictsIfAny(bufHdr);
+
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
@@ -419,6 +429,8 @@ ReadBuffer_common(SMgrRelation smgr, bool isLocalBuf, ForkNumber forkNum,
blockNum,
relpath(smgr->smgr_rnode, forkNum))));
}
+
+ BufferProcessRecoveryConflictsIfAny(bufHdr);
}
}
@@ -1580,6 +1592,38 @@ SyncOneBuffer(int buf_id, bool skip_recently_used)
return result | BUF_WRITTEN;
}
+static void
+BufferProcessRecoveryConflictsIfAny(volatile BufferDesc *bufHdr)
+{
+ if (!BufferRecoveryConflictPending)
+ return;
+ else
+ {
+ XLogRecPtr bufLSN = BufferGetLSN(bufHdr);
+
+ if (XLByteLE(bufLSN, BufferRecoveryConflictLSN))
+ ereport(ERROR,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling statement due to recent buffer changes during recovery")));
+ }
+}
+
+bool
+SetBufferRecoveryConflictLSN(XLogRecPtr conflict_LSN)
+{
+ if (XLogRecPtrIsValid(conflict_LSN))
+ {
+ BufferRecoveryConflictPending = true;
+ BufferRecoveryConflictLSN = conflict_LSN;
+ }
+ else
+ {
+ BufferRecoveryConflictPending = false;
+ BufferRecoveryConflictLSN = InvalidXLogRecPtr;
+ }
+
+ return BufferRecoveryConflictPending;
+}
/*
* Return a palloc'd string containing buffer usage statistics.
@@ -2365,6 +2409,53 @@ ConditionalLockBuffer(Buffer buffer)
}
/*
+ * On standby servers only the Startup process applies Cleanup. As a result
+ * a single buffer pin can be enough to effectively halt recovery for short
+ * periods. We need special instrumentation to monitor this so we can judge
+ * whether additional measures are required to control the negative effects.
+ */
+void
+StartCleanupDelayStats(void)
+{
+ CleanupWaitSecs = 0;
+ CleanupWaitUSecs = 0;
+ CleanupWaitStats = true;
+}
+
+void
+EndCleanupDelayStats(void)
+{
+ CleanupWaitStats = false;
+}
+
+/*
+ * Called by the Startup process whenever we request a restartpoint
+ */
+void
+ReportCleanupDelayStats(void)
+{
+ elog(trace_recovery(DEBUG2), "cleanup wait total=%ld.%03d s",
+ CleanupWaitSecs, CleanupWaitUSecs / 1000);
+}
+
+static void
+CleanupDelayStats(TimestampTz start_ts, TimestampTz end_ts)
+{
+ long wait_secs;
+ int wait_usecs;
+
+ TimestampDifference(start_ts, end_ts, &wait_secs, &wait_usecs);
+
+	CleanupWaitSecs += wait_secs;
+	CleanupWaitUSecs += wait_usecs;
+ if (CleanupWaitUSecs > 999999)
+ {
+ CleanupWaitSecs += 1;
+ CleanupWaitUSecs -= 1000000;
+ }
+}
+
+/*
* LockBufferForCleanup - lock a buffer in preparation for deleting items
*
* Items may be deleted from a disk page only when the caller (a) holds an
@@ -2407,6 +2498,8 @@ LockBufferForCleanup(Buffer buffer)
for (;;)
{
+ TimestampTz start_ts = 0;
+
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
LockBufHdr(bufHdr);
@@ -2429,9 +2522,14 @@ LockBufferForCleanup(Buffer buffer)
PinCountWaitBuf = bufHdr;
UnlockBufHdr(bufHdr);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ if (CleanupWaitStats)
+ start_ts = GetCurrentTimestamp();
/* Wait to be signaled by UnpinBuffer() */
ProcWaitForSignal();
PinCountWaitBuf = NULL;
+ if (CleanupWaitStats)
+ CleanupDelayStats(start_ts, GetCurrentTimestamp());
+
/* Loop back and try again */
}
}
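Putting the two pieces added above together: SetBufferRecoveryConflictLSN arms the check, and BufferProcessRecoveryConflictsIfAny compares each accessed page's LSN against it. A compressed sketch of that flow, with a flat 64-bit stand-in for XLogRecPtr:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t WalPos;                 /* stand-in for XLogRecPtr */
    #define INVALID_WAL_POS ((WalPos) 0)

    static bool   conflictPending = false;
    static WalPos conflictLSN = INVALID_WAL_POS;

    /* Arm (valid LSN) or disarm (invalid LSN) the conflict check. */
    static bool
    set_buffer_conflict_lsn(WalPos lsn)
    {
        conflictPending = (lsn != INVALID_WAL_POS);
        conflictLSN = lsn;
        return conflictPending;
    }

    /* Checked on each buffer access; true means "cancel this statement". */
    static bool
    buffer_access_conflicts(WalPos pageLSN)
    {
        return conflictPending && pageLSN <= conflictLSN;
    }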
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 06f8ad8f4a..92150c10b1 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -17,6 +17,32 @@
* as are the myProcLocks lists. They can be distinguished from regular
* backend PGPROCs at need by checking for pid == 0.
*
+ * The process array now also includes PGPROC structures representing
+ * transactions being recovered. The xid and subxids fields of these are valid,
+ * though few other fields are. They can be distinguished from regular backend
+ * PGPROCs by checking for pid == 0. The proc array also has a
+ * secondary array of UnobservedXids representing transactions that are
+ * known to be running on the master but for which we do not yet have
+ * a recovery proc. We infer the existence of UnobservedXids by watching
+ * the sequence of arriving xids. This is very important because if we leave
+ * those xids out of the snapshot then they will appear to be already complete.
+ * Later, when they have actually completed, this could lead to confusion as to
+ * whether those xids are visible or not, blowing a huge hole in MVCC.
+ * We need 'em.
+ *
+ * Although we have max_connections procs during recovery, they will only
+ * be used when the master is running a write transaction. Read only
+ * transactions never show up in WAL at all and it is valid to ignore them.
+ * So we would only ever use all max_connections procs if we were running
+ * a write transaction on every session at once. As a result, we may be
+ * able to continue running normally even if max_connections is set lower
+ * on the standby than on the master.
+ *
+ * It is theoretically possible for a FATAL error to explode before writing
+ * an abort record. This would then tie up a recovery proc until the next
+ * WAL record containing a valid list of running xids arrives. This is
+ * relatively unlikely, so it is considered a minor and acceptable flaw
+ * in the emulation of transactions during recovery.
*
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
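The comment above says UnobservedXids are inferred "by watching the sequence of arriving xids", but the hunks here do not show that inference, so the following is only an illustrative guess: any xid between the next one we expected and the one that just arrived must belong to a transaction we have not yet observed in WAL. Wraparound-aware xid arithmetic is deliberately omitted:

    #include <stdint.h>

    typedef uint32_t TransactionId;

    /* Assumed helper that files an xid in the UnobservedXids array. */
    extern void unobserved_insert(TransactionId xid);

    static void
    note_arriving_xid(TransactionId arrived, TransactionId *nextExpected)
    {
        TransactionId xid;

        /* Every xid in the gap is running but unseen so far. */
        for (xid = *nextExpected; xid < arrived; xid++)
            unobserved_insert(xid);

        if (arrived >= *nextExpected)
            *nextExpected = arrived + 1;
    }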
@@ -33,24 +59,36 @@
#include "access/subtrans.h"
#include "access/transam.h"
-#include "access/xact.h"
+#include "access/xlog.h"
#include "access/twophase.h"
#include "miscadmin.h"
+#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/snapmgr.h"
+static RunningXactsData CurrentRunningXactsData;
+
+void ProcArrayDisplay(int trace_level);
+
/* Our shared memory area */
typedef struct ProcArrayStruct
{
int numProcs; /* number of valid procs entries */
- int maxProcs; /* allocated size of procs array */
+ int maxProcs; /* allocated size of total procs array */
+
+ int numUnobservedXids; /* number of valid unobserved xids */
+ int maxUnobservedXids; /* allocated size of unobserved array */
+
+ bool allowStandbySnapshots; /* can queries take snapshots? */
/*
* We declare procs[] as 1 entry because C wants a fixed-size array, but
* actually it is maxProcs entries long.
*/
PGPROC *procs[1]; /* VARIABLE LENGTH ARRAY */
+
+ /* ARRAY OF UNOBSERVED TRANSACTION XIDs FOLLOWS */
} ProcArrayStruct;
static ProcArrayStruct *procArray;
@@ -100,8 +138,18 @@ ProcArrayShmemSize(void)
Size size;
size = offsetof(ProcArrayStruct, procs);
- size = add_size(size, mul_size(sizeof(PGPROC *),
- add_size(MaxBackends, max_prepared_xacts)));
+
+ /* Normal processing */
+ /* MyProc slots */
+ size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
+ size = add_size(size, mul_size(sizeof(PGPROC *), max_prepared_xacts));
+
+ /* Recovery processing */
+
+ /* Recovery Procs */
+ size = add_size(size, mul_size(sizeof(PGPROC *), MaxBackends));
+ /* UnobservedXids */
+ size = add_size(size, mul_size(sizeof(TransactionId), 2 * MaxBackends));
return size;
}
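Restated as plain arithmetic (names illustrative), the sizing above reserves one proc pointer per backend and per prepared transaction as before, plus one recovery proc pointer per backend and two unobserved-xid slots per backend:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t TransactionId;
    typedef struct PGPROC PGPROC;      /* opaque; only pointers are sized */

    static size_t
    proc_array_shmem_size(size_t header_size, int max_backends,
                          int max_prepared_xacts)
    {
        size_t size = header_size;

        size += sizeof(PGPROC *) * (size_t) max_backends;          /* backends */
        size += sizeof(PGPROC *) * (size_t) max_prepared_xacts;    /* 2PC slots */
        size += sizeof(PGPROC *) * (size_t) max_backends;          /* recovery procs */
        size += sizeof(TransactionId) * 2 * (size_t) max_backends; /* unobserved xids */

        return size;
    }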
@@ -123,8 +171,27 @@ CreateSharedProcArray(void)
/*
* We're the first - initialize.
*/
+ /* Normal processing */
procArray->numProcs = 0;
procArray->maxProcs = MaxBackends + max_prepared_xacts;
+
+ /* Recovery processing */
+ procArray->maxProcs += MaxBackends;
+
+ procArray->allowStandbySnapshots = false;
+
+ /*
+ * The max number of UnobservedXids is theoretically unbounded
+ * because of a very slim chance of FATAL errors that fail to
+ * write abort records. However, in normal running each
+ * session will have at most 2 xids assigned without having
+ * written a WAL record, so we set a reasonable limit accordingly.
+	 * UnobservedXids typically has length 0 or 1, though it can be
+ * longer if there is high contention for data blocks.
+ * If you change this, also change ProcArrayShmemSize()
+ */
+ procArray->maxUnobservedXids = 2 * MaxBackends;
+ procArray->numUnobservedXids = 0;
}
}
@@ -132,11 +199,12 @@ CreateSharedProcArray(void)
* Add the specified PGPROC to the shared array.
*/
void
-ProcArrayAdd(PGPROC *proc)
+ProcArrayAdd(PGPROC *proc, bool need_lock)
{
ProcArrayStruct *arrayP = procArray;
- LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ if (need_lock)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
if (arrayP->numProcs >= arrayP->maxProcs)
{
@@ -154,13 +222,15 @@ ProcArrayAdd(PGPROC *proc)
arrayP->procs[arrayP->numProcs] = proc;
arrayP->numProcs++;
- LWLockRelease(ProcArrayLock);
+ if (need_lock)
+ LWLockRelease(ProcArrayLock);
}
/*
* Remove the specified PGPROC from the shared array.
*
- * When latestXid is a valid XID, we are removing a live 2PC gxact from the
+ * When latestXid is a valid XID, we are removing either an emulated
+ * recovery transaction or a live 2PC gxact from the
* array, and thus causing it to appear as "not running" anymore. In this
* case we must advance latestCompletedXid. (This is essentially the same
* as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
@@ -168,7 +238,8 @@ ProcArrayAdd(PGPROC *proc)
* twophase.c depends on the latter.)
*/
void
-ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+ProcArrayRemove(PGPROC *proc, TransactionId latestXid,
+ int nsubxids, TransactionId *subxids)
{
ProcArrayStruct *arrayP = procArray;
int index;
@@ -181,6 +252,15 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ /*
+ * Remove any UnobservedXids remaining
+ */
+ if (IsRecoveryProcessingMode())
+ {
+ for (index = 0; index < nsubxids; index++)
+ UnobservedTransactionsRemoveXid(subxids[index], false);
+ }
+
if (TransactionIdIsValid(latestXid))
{
Assert(TransactionIdIsValid(proc->xid));
@@ -193,7 +273,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
else
{
/* Shouldn't be trying to remove a live transaction here */
- Assert(!TransactionIdIsValid(proc->xid));
+ Assert(IsRecoveryProcessingMode() || !TransactionIdIsValid(proc->xid));
}
for (index = 0; index < arrayP->numProcs; index++)
@@ -213,6 +293,15 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
elog(LOG, "failed to find proc %p in ProcArray", proc);
}
+/*
+ * Initialisation when we switch into PM_RECOVERY mode.
+ * Expected caller is InitRecoveryTransactionEnvironment()
+ */
+void
+ProcArrayInitRecoveryEnvironment(void)
+{
+ PublishStartupProcessInformation();
+}
/*
* ProcArrayEndTransaction -- mark a transaction as no longer running
@@ -301,6 +390,7 @@ ProcArrayClearTransaction(PGPROC *proc)
proc->xid = InvalidTransactionId;
proc->lxid = InvalidLocalTransactionId;
proc->xmin = InvalidTransactionId;
+ proc->lsn = InvalidXLogRecPtr;
/* redundant, but just in case */
proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
@@ -311,6 +401,309 @@ ProcArrayClearTransaction(PGPROC *proc)
proc->subxids.overflowed = false;
}
+/*
+ * ProcArrayClearRecoveryTransactions
+ *
+ * Called during recovery when we see a Shutdown checkpoint or EndRecovery
+ * record, or at the end of recovery processing.
+ */
+void
+ProcArrayClearRecoveryTransactions(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Reset recovery procs, which is any proc that has a valid xid.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = arrayP->procs[index];
+
+ if (TransactionIdIsValid(proc->xid) && proc->pid == 0)
+ {
+ arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
+ arrayP->numProcs--;
+ index--; /* recheck the entry we just swapped in */
+ }
+ }
+
+ /*
+ * Clear the UnobservedXids also
+ */
+ UnobservedTransactionsClearXids();
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/* debug support functions for recovery processing */
+bool
+XidInRecoveryProcs(TransactionId xid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ if (!TransactionIdIsValid(xid))
+ return false;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *RecoveryProc = arrayP->procs[index];
+
+ if (RecoveryProc->xid == xid)
+ return true;
+ }
+ return false;
+}
+
+void
+ProcArrayDisplay(int trace_level)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *RecoveryProc = arrayP->procs[index];
+
+ if (TransactionIdIsValid(RecoveryProc->xid))
+ elog(trace_level,
+ "proc %d proc->xid %d proc->lsn %X/%X", index, RecoveryProc->xid,
+ RecoveryProc->lsn.xlogid, RecoveryProc->lsn.xrecoff);
+ }
+
+ UnobservedTransactionsDisplay(trace_level);
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayUpdateRecoveryTransactions -- initialise the proc array in recovery
+ *
+ * Use the data about running transactions on the master either to create
+ * the initial state of the recovery procs, or to maintain the correctness
+ * of their state afterwards. In a sense this is almost the opposite of
+ * GetSnapshotData(), since we are updating the proc array based upon the
+ * snapshot. The cross-check matters because we know it is possible for
+ * transactions that end in FATAL errors to skip writing abort records.
+ *
+ * Only used during recovery. Notice the signature is very similar to a
+ * _redo function.
+ */
+void
+ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn, xl_xact_running_xacts *xlrec)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int xid_index; /* main loop */
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Scan the proc array for stale recovery PGPROC entries, and
+ * remove them. This shouldn't happen, except when a FATAL error
+ * caused the abort record to be skipped, but we don't want to stop
+ * recovery because of it. Be careful not to confuse super-fresh
+ * entries with stale ones, because of the race conditions noted
+ * below. We remove stale entries first to free up their procs.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *p = arrayP->procs[index];
+
+ if (TransactionIdPrecedes(p->xid, xlrec->oldestRunningXid) && p->pid == 0)
+ {
+ elog(LOG, "removing stale proc array entry for transaction %d", p->xid);
+
+ arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
+ arrayP->numProcs--;
+ FreeRecoveryProcess(p);
+ }
+ }
+
+ /*
+ * Left prune the UnobservedXids array up to latestRunningXid.
+ * This is correct because, at the time we take this snapshot, all
+ * transactions prior to latestRunningXid that have completed are
+ * already marked in WAL, or else they are explicitly present here.
+ *
+ * We can't clear the array completely because race conditions allow
+ * things to slip through sometimes.
+ */
+ UnobservedTransactionsPruneXids(xlrec->latestRunningXid);
+
+ /*
+ * Scan through the incoming array of RunningXacts and update the
+ * proc array entries so that they match as much as possible.
+ */
+ for (xid_index = 0; xid_index < xlrec->xcnt; xid_index++)
+ {
+ RunningXact *rxact = (RunningXact *) xlrec->xrun;
+ PGPROC *proc = NULL;
+ TransactionId xid = rxact[xid_index].xid;
+ bool unobserved = false;
+
+ /*
+ * Look up the incoming xids in the existing proc array.
+ *
+ * XXXHS: This gives O(N^2) behaviour. We could sort the list of
+ * procs first to improve performance if both lists are long.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *p = arrayP->procs[index];
+
+ if (p->xid == xid)
+ {
+ proc = p;
+ break;
+ }
+ }
+
+ /*
+ * Create procs for any missing xids, with warning if appropriate.
+ * We do this differently from RecordKnownAssignedTransactionIds()
+ * because here we have a better and possibly full knowledge of
+ * subtransactions.
+ */
+ if (proc == NULL)
+ {
+ unobserved = XidInUnobservedTransactions(xid);
+
+ if (!procArray->allowStandbySnapshots || unobserved ||
+ (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid)))
+ {
+ proc = CreateRecoveryProcessForTransactionId(xid);
+
+ if (proc == NULL)
+ {
+ /*
+ * If we've run out of recovery procs then don't bother
+ * to process any further. No more snapshots for a while.
+ */
+ ProcArrayClearRecoveryTransactions();
+ LWLockRelease(ProcArrayLock);
+ return;
+ }
+
+ if (unobserved)
+ UnobservedTransactionsRemoveXid(xid, true);
+ }
+ else
+ {
+ /*
+ * It's possible for a commit or abort to have arrived in WAL
+ * between us doing GetRunningTransactionData() and grabbing
+ * the WALInsertLock. Issue a debug message, but that's all.
+ */
+ elog(DEBUG2, "proc array entry was missing for transaction %u", xid);
+ continue;
+ }
+ }
+
+ /*
+ * If our state information is later for this proc, then
+ * overwrite it. It's possible for a commit, and possibly
+ * a new transaction record, to have arrived in WAL between
+ * us doing GetRunningTransactionData() and grabbing the
+ * WALInsertLock, so we mustn't assume we always know best.
+ */
+ if (XLByteLT(proc->lsn, lsn))
+ {
+ TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
+
+ proc->lsn = lsn;
+ /* proc->pid stays 0 for recovery procs */
+
+ proc->subxids.nxids = rxact[xid_index].nsubxids;
+ proc->subxids.overflowed = rxact[xid_index].overflowed;
+
+ memcpy(proc->subxids.xids, subxip,
+ rxact[xid_index].nsubxids * sizeof(TransactionId));
+
+ /* Remove subtransactions from UnobservedXids also */
+ if (unobserved)
+ {
+ for (index = 0; index < rxact[xid_index].nsubxids; index++)
+ UnobservedTransactionsRemoveXid(subxip[index + rxact[xid_index].subx_offset], false);
+ }
+ }
+
+ elog(trace_recovery(DEBUG5),
+ "running xact proc->lsn %X/%X lsn %X/%X proc->xid %d xid %d",
+ proc->lsn.xlogid, proc->lsn.xrecoff,
+ lsn.xlogid, lsn.xrecoff, proc->xid, rxact[xid_index].xid);
+ }
+
+ /* Advance global latestCompletedXid while holding the lock */
+ if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+ xlrec->latestCompletedXid))
+ ShmemVariableCache->latestCompletedXid = xlrec->latestCompletedXid;
+
+ /*
+ * If we fully applied the RunningXact data then we can (re)open
+ * for business.
+ */
+ procArray->allowStandbySnapshots = true;
+ SetRunningXactData(true);
+
+ LWLockRelease(ProcArrayLock);
+
+ ProcArrayDisplay(trace_recovery(DEBUG5));
+}
+
+/*
+ * CreateRecoveryProcessForTransactionId
+ *
+ * Create recovery process and add it to proc array, or throw a warning.
+ *
+ * Must be called with ProcArrayLock held, stays held at exit
+ */
+PGPROC *
+CreateRecoveryProcessForTransactionId(TransactionId xid)
+{
+ PGPROC *proc = NULL;
+
+ proc = InitRecoveryProcess(xid);
+
+ /*
+ * Was there a recovery proc free? If not, punt. It might be possible
+ * to wedge stuff into UnobservedXids, but the code to do this would
+ * be complex and difficult to test.
+ */
+ if (proc == NULL)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("insufficient recovery procs - standby snapshots disabled"),
+ errdetail("Recovery will continue but standby queries will"
+ " consistently fail until either more resources are"
+ " allocated or the transaction load reduces"
+ " on the master server - not this standby server."),
+ errhint("Increase the \"max_connections\" parameter"
+ " and restart the server.")));
+
+ /*
+ * We now set allowStandbySnapshots = false and will refuse further
+ * snapshots until a RunningXact WAL record arrives whose data fits
+ * entirely within our recovery procs. That may be a very long time:
+ * minutes, hours, even days, but the important thing is that
+ * recovery continues.
+ */
+ procArray->allowStandbySnapshots = false;
+ SetRunningXactData(false);
+
+ return NULL;
+ }
+
+ ProcArrayAdd(proc, false);
+
+ return proc;
+}
/*
* TransactionIdIsInProgress -- is given transaction running in some backend
@@ -589,6 +982,9 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
TransactionId result;
int index;
+ /* Cannot look for individual databases during recovery */
+ Assert(allDbs || !IsRecoveryProcessingMode());
+
LWLockAcquire(ProcArrayLock, LW_SHARED);
/*
@@ -655,7 +1051,7 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
* but since PGPROC has only a limited cache area for subxact XIDs, full
* information may not be available. If we find any overflowed subxid arrays,
* we have to mark the snapshot's subxid data as overflowed, and extra work
- * will need to be done to determine what's running (see XidInMVCCSnapshot()
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
* in tqual.c).
*
* We also update the following backend-global variables:
@@ -680,6 +1076,7 @@ GetSnapshotData(Snapshot snapshot)
int index;
int count = 0;
int subcount = 0;
+ bool suboverflowed = false;
Assert(snapshot != NULL);
@@ -707,7 +1104,7 @@ GetSnapshotData(Snapshot snapshot)
errmsg("out of memory")));
Assert(snapshot->subxip == NULL);
snapshot->subxip = (TransactionId *)
- malloc(arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+ malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
if (snapshot->subxip == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -720,6 +1117,16 @@ GetSnapshotData(Snapshot snapshot)
*/
LWLockAcquire(ProcArrayLock, LW_SHARED);
+ if (IsRecoveryProcessingMode() && !arrayP->allowStandbySnapshots)
+ {
+ LWLockRelease(ProcArrayLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling statement because standby snapshots are currently disabled"),
+ errdetail("Valid MVCC snapshot cannot be taken at this time."),
+ errhint("Contact your administrator if this error recurs frequently")));
+ }
+
/* xmax is always latestCompletedXid + 1 */
xmax = ShmemVariableCache->latestCompletedXid;
Assert(TransactionIdIsNormal(xmax));
@@ -771,11 +1178,11 @@ GetSnapshotData(Snapshot snapshot)
}
/*
- * Save subtransaction XIDs if possible (if we've already overflowed,
- * there's no point). Note that the subxact XIDs must be later than
- * their parent, so no need to check them against xmin. We could
- * filter against xmax, but it seems better not to do that much work
- * while holding the ProcArrayLock.
+ * Save subtransaction XIDs, whether or not we have overflowed.
+ * Note that the subxact XIDs must be later than their parent, so no
+ * need to check them against xmin. We could filter against xmax,
+ * but it seems better not to do that much work while holding the
+ * ProcArrayLock.
*
* The other backend can add more subxids concurrently, but cannot
* remove any. Hence it's important to fetch nxids just once. Should
@@ -784,23 +1191,69 @@ GetSnapshotData(Snapshot snapshot)
*
* Again, our own XIDs are not included in the snapshot.
*/
- if (subcount >= 0 && proc != MyProc)
- {
- if (proc->subxids.overflowed)
- subcount = -1; /* overflowed */
- else
+ if (proc != MyProc)
{
int nxids = proc->subxids.nxids;
if (nxids > 0)
{
+ if (proc->subxids.overflowed)
+ suboverflowed = true;
+
memcpy(snapshot->subxip + subcount,
(void *) proc->subxids.xids,
nxids * sizeof(TransactionId));
subcount += nxids;
}
+
}
}
+
+ /*
+ * Also check for unobserved xids. There is no need to test
+ * IsRecoveryProcessingMode() here, since the list is always
+ * empty once normal processing begins, so this loop then costs
+ * almost nothing.
+ */
+ for (index = 0; index < arrayP->numUnobservedXids; index++)
+ {
+ volatile TransactionId *UnobservedXids;
+ TransactionId xid;
+
+ UnobservedXids = (TransactionId *) &(arrayP->procs[arrayP->maxProcs]);
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UnobservedXids[index];
+
+ /*
+ * Once we reach xids at or beyond xmax we're done, since the
+ * snapshot treats those as running anyway. This works because
+ * UnobservedXids is maintained in strict ascending order.
+ */
+ if (!TransactionIdIsNormal(xid) || TransactionIdFollowsOrEquals(xid, xmax))
+ break;
+
+ /*
+ * Typically, there will be space in the snapshot. We know that the
+ * unobserved xids are being run by one of the procs marked with
+ * an xid of InvalidTransactionId, so we will have ignored that above,
+ * and the xidcache for that proc will have been empty also.
+ *
+ * We put the unobserved xids into the subxid cache. The xid might
+ * be a top-level or it might be a subtransaction, but it won't
+ * change the answer to XidInMVCCSnapshot() whichever it is. That's
+ * just as well, since we don't know which it is, by definition.
+ * The subxid cache gets searched first, so put it there.
+ */
+ snapshot->subxip[subcount++] = xid;
+
+ /*
+ * We don't really need xmin during recovery, but let's derive
+ * it anyway for consistency. It is possible that an unobserved
+ * xid could be xmin if there is contention between long-lived
+ * transactions.
+ */
+ if (TransactionIdPrecedes(xid, xmin))
+ xmin = xid;
}
if (!TransactionIdIsValid(MyProc->xmin))
@@ -824,6 +1277,7 @@ GetSnapshotData(Snapshot snapshot)
snapshot->xmax = xmax;
snapshot->xcnt = count;
snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
snapshot->curcid = GetCurrentCommandId(false);
@@ -839,6 +1293,197 @@ GetSnapshotData(Snapshot snapshot)
}
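
To see why parking unobserved xids in subxip is sufficient, consider how a visibility check consumes the snapshot. This is a simplified sketch of the XidInMVCCSnapshot() logic referenced above, not the actual tqual.c code (which also handles the overflowed case via pg_subtrans):

    static bool
    XidInSnapshotSketch(TransactionId xid, Snapshot snapshot)
    {
        int     i;

        /* xids at or beyond xmax are treated as still running */
        if (TransactionIdFollowsOrEquals(xid, snapshot->xmax))
            return true;

        /* subxip is searched first, which is why unobserved xids go there */
        for (i = 0; i < snapshot->subxcnt; i++)
            if (TransactionIdEquals(xid, snapshot->subxip[i]))
                return true;

        for (i = 0; i < snapshot->xcnt; i++)
            if (TransactionIdEquals(xid, snapshot->xip[i]))
                return true;

        return false;
    }

Whether an unobserved xid is a top-level xact or a subtransaction, a match in subxip yields the same answer, which is exactly the property the comment above relies on.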
/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returning more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes. We
+ * also keep track of which subtransactions go with each PGPROC. All of this
+ * looks very similar to GetSnapshotData, but we have more procs and more info
+ * about each proc.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * UnobservedXids.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ */
+RunningTransactions
+GetRunningTransactionData(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ static RunningTransactions CurrentRunningXacts = (RunningTransactions) &CurrentRunningXactsData;
+ RunningXact *rxact;
+ TransactionId *subxip;
+ TransactionId latestRunningXid = InvalidTransactionId;
+ TransactionId latestCompletedXid;
+ TransactionId oldestRunningXid = InvalidTransactionId;
+ int index;
+ int count = 0;
+ int subcount = 0;
+ bool suboverflowed = false;
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * Should only be allocated for bgwriter, since only ever executed
+ * during checkpoints.
+ */
+ if (CurrentRunningXacts->xrun == NULL)
+ {
+ /*
+ * First call
+ */
+ CurrentRunningXacts->xrun = (RunningXact *)
+ malloc(arrayP->maxProcs * sizeof(RunningXact));
+ if (CurrentRunningXacts->xrun == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ Assert(CurrentRunningXacts->subxip == NULL);
+ CurrentRunningXacts->subxip = (TransactionId *)
+ malloc((arrayP->maxProcs * PGPROC_MAX_CACHED_SUBXIDS) * sizeof(TransactionId));
+ if (CurrentRunningXacts->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ rxact = CurrentRunningXacts->xrun;
+ subxip = CurrentRunningXacts->subxip;
+
+ count = 0;
+ subcount = 0;
+ suboverflowed = false;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ latestCompletedXid = ShmemVariableCache->latestCompletedXid;
+
+ /*
+ * Spin over procArray checking xid and subxids. A shared lock is
+ * sufficient, because XID assignment takes no lock at all, so even
+ * LW_EXCLUSIVE would not keep new transactions out; don't bother.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = arrayP->procs[index];
+ TransactionId xid;
+ int nxids;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = proc->xid;
+
+ /*
+ * We store all xids, even XIDs >= xmax and our own XID, if any.
+ * But we don't store transactions that don't have a TransactionId
+ * yet because they will not show as running on a standby server.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ rxact[count].xid = xid;
+
+ if (TransactionIdPrecedes(latestRunningXid, xid))
+ latestRunningXid = xid;
+
+ if (!TransactionIdIsValid(oldestRunningXid) ||
+ TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ /*
+ * Save subtransaction XIDs.
+ *
+ * The other backend can add more subxids concurrently, but cannot
+ * remove any. Hence it's important to fetch nxids just once. Should
+ * be safe to use memcpy, though. (We needn't worry about missing any
+ * xids added concurrently, because they must postdate xmax.)
+ *
+ * Again, our own XIDs *are* included in the snapshot.
+ */
+ nxids = proc->subxids.nxids;
+
+ if (nxids > 0)
+ {
+ TransactionId *subxids = (TransactionId *) proc->subxids.xids;
+
+ rxact[count].subx_offset = subcount;
+
+ memcpy(subxip + subcount,
+ (void *) proc->subxids.xids,
+ nxids * sizeof(TransactionId));
+ subcount += nxids;
+
+ if (proc->subxids.overflowed)
+ {
+ rxact[count].overflowed = true;
+ suboverflowed = true;
+ }
+
+ if (TransactionIdPrecedes(latestRunningXid, subxids[nxids - 1]))
+ latestRunningXid = subxids[nxids - 1];
+ }
+ else
+ {
+ rxact[count].subx_offset = 0;
+ rxact[count].overflowed = false;
+ }
+
+ rxact[count].nsubxids = nxids;
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * When there are no transactions running, just use the value
+ * of the last completed transaction. No need to check
+ * ReadNewTransactionId().
+ */
+ if (count == 0)
+ latestRunningXid = latestCompletedXid;
+
+ CurrentRunningXacts->xcnt = count;
+ CurrentRunningXacts->subxcnt = subcount;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+ CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+ if (suboverflowed)
+ CurrentRunningXacts->latestRunningXid = InvalidTransactionId;
+ else
+ CurrentRunningXacts->latestRunningXid = latestRunningXid;
+
+#ifdef RUNNING_XACT_DEBUG
+ elog(trace_recovery(DEBUG3),
+ "logging running xacts xcnt %d subxcnt %d latestCompletedXid %d latestRunningXid %d",
+ CurrentRunningXacts->xcnt,
+ CurrentRunningXacts->subxcnt,
+ CurrentRunningXacts->latestCompletedXid,
+ CurrentRunningXacts->latestRunningXid);
+
+ for (index = 0; index < CurrentRunningXacts->xcnt; index++)
+ {
+ int j;
+ elog(trace_recovery(DEBUG3),
+ "xid %d nsubxids %d offset %d, ovflow %s",
+ CurrentRunningXacts->xrun[index].xid,
+ CurrentRunningXacts->xrun[index].nsubxids,
+ CurrentRunningXacts->xrun[index].subx_offset,
+ CurrentRunningXacts->xrun[index].overflowed ? "t" : "f");
+ for (j = 0; j < CurrentRunningXacts->xrun[index].nsubxids; j++)
+ elog(trace_recovery(DEBUG3),
+ "subxid offset %d j %d xid %d",
+ CurrentRunningXacts->xrun[index].subx_offset, j,
+ CurrentRunningXacts->subxip[j + CurrentRunningXacts->xrun[index].subx_offset]);
+ }
+#endif
+
+ return CurrentRunningXacts;
+}
+
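
For reference, this is how ProcArrayUpdateRecoveryTransactions() above unpacks the record this function feeds into WAL: the RunningXact entries are followed by a single shared subxid array, and each entry's subxids start at its subx_offset. A condensed sketch:

    RunningXact   *rxact = (RunningXact *) xlrec->xrun;
    TransactionId *subxip = (TransactionId *) &(xlrec->xrun[xlrec->xcnt]);
    int            i;

    for (i = 0; i < xlrec->xcnt; i++)
    {
        TransactionId *subxids = subxip + rxact[i].subx_offset;

        /* subxids[0 .. rxact[i].nsubxids - 1] belong to rxact[i].xid */
    }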
+/*
* GetTransactionsInCommit -- Get the XIDs of transactions that are committing
*
* Constructs an array of XIDs of transactions that are currently in commit
@@ -968,6 +1613,41 @@ BackendPidGetProc(int pid)
}
/*
+ * BackendXidGetProc -- get a backend's PGPROC given its XID
+ *
+ * Returns NULL if not found. Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
+ */
+PGPROC *
+BackendXidGetProc(TransactionId xid)
+{
+ PGPROC *result = NULL;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ if (xid == InvalidTransactionId) /* never match invalid xid */
+ return 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *proc = arrayP->procs[index];
+
+ if (proc->xid == xid)
+ {
+ result = proc;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
* BackendXidGetPid -- get a backend's pid given its XID
*
* Returns 0 if not found or it's a prepared transaction. Note that
@@ -1024,13 +1704,14 @@ IsBackendPid(int pid)
* The array is palloc'd and is terminated with an invalid VXID.
*
* If limitXmin is not InvalidTransactionId, we skip any backends
- * with xmin >= limitXmin. If allDbs is false, we skip backends attached
+ * with xmin >= limitXmin. If dbOid is valid we skip backends attached
* to other databases. If excludeVacuum isn't zero, we skip processes for
* which (excludeVacuum & vacuumFlags) is not zero. Also, our own process
* is always skipped.
+ *
*/
VirtualTransactionId *
-GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
+GetCurrentVirtualXIDs(TransactionId limitXmin, Oid dbOid, int excludeVacuum)
{
VirtualTransactionId *vxids;
ProcArrayStruct *arrayP = procArray;
@@ -1047,13 +1728,13 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
{
volatile PGPROC *proc = arrayP->procs[index];
- if (proc == MyProc)
+ if (proc == MyProc || proc->pid == 0)
continue;
if (excludeVacuum & proc->vacuumFlags)
continue;
- if (allDbs || proc->databaseId == MyDatabaseId)
+ if (!OidIsValid(dbOid) || proc->databaseId == dbOid)
{
/* Fetch xmin just once - might change on us? */
TransactionId pxmin = proc->xmin;
@@ -1083,6 +1764,117 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool allDbs, int excludeVacuum)
return vxids;
}
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is palloc'd and is terminated with an invalid VXID.
+ *
+ * If limitXmin is not InvalidTransactionId, we skip any backends
+ * with xmin >= limitXmin. If dbOid is valid we skip backends attached
+ * to other databases. If roleId is valid we skip backends attached
+ * as other roles.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result.
+ * We only ever call this from the startup process, so reusing a
+ * single static result area is safe.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid, Oid roleId)
+{
+ static VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /*
+ * If first time through, get workspace to remember the VXIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead.
+ * Allow result space, remembering room for a terminator.
+ */
+ if (vxids == NULL)
+ {
+ vxids = (VirtualTransactionId *)
+ malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+ if (vxids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ volatile PGPROC *proc = arrayP->procs[index];
+
+ /* Exclude recovery procs and prepared transactions */
+ if (proc->pid == 0)
+ continue;
+
+ if ((!OidIsValid(dbOid) || proc->databaseId == dbOid) &&
+ (!OidIsValid(roleId) || proc->roleId == roleId))
+ {
+ /* Fetch xmin just once - can't change on us, but good coding */
+ TransactionId pxmin = proc->xmin;
+
+ /*
+ * If limitXmin is set we explicitly choose to ignore an invalid
+ * pxmin because this means that backend has no snapshot and
+ * cannot get another one while we hold exclusive lock.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ (TransactionIdPrecedes(pxmin, limitXmin) && TransactionIdIsValid(pxmin)))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /* add the terminator */
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+ return vxids;
+}
+
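
A hedged sketch of the intended call pattern (the variables latestRemovedXid and dbOid are illustrative, not from this hunk; the real caller is the recovery-conflict path near the end of this patch). The invalid-VXID terminator written above makes iteration straightforward:

    VirtualTransactionId *vxids;
    int     i;

    vxids = GetConflictingVirtualXIDs(latestRemovedXid, dbOid, InvalidOid);
    for (i = 0; VirtualTransactionIdIsValid(vxids[i]); i++)
        elog(LOG, "vxid %d/%u conflicts with recovery",
             vxids[i].backendId, vxids[i].localTransactionId);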
+PGPROC *
+VirtualTransactionIdGetProc(VirtualTransactionId vxid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ PGPROC *result = NULL;
+ int index;
+
+ if (!VirtualTransactionIdIsValid(vxid))
+ return NULL;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ VirtualTransactionId procvxid;
+ PGPROC *proc = arrayP->procs[index];
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ if (procvxid.backendId == vxid.backendId &&
+ procvxid.localTransactionId == vxid.localTransactionId)
+ {
+ result = proc;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
/*
* CountActiveBackends --- count backends (other than myself) that are in
@@ -1111,7 +1903,7 @@ CountActiveBackends(void)
if (proc == MyProc)
continue; /* do not count myself */
if (proc->pid == 0)
- continue; /* do not count prepared xacts */
+ continue; /* do not count prepared xacts or recovery procs */
if (proc->xid == InvalidTransactionId)
continue; /* do not count if no XID assigned */
if (proc->waitLock != NULL)
@@ -1139,7 +1931,7 @@ CountDBBackends(Oid databaseid)
volatile PGPROC *proc = arrayP->procs[index];
if (proc->pid == 0)
- continue; /* do not count prepared xacts */
+ continue; /* do not count prepared xacts or recovery procs */
if (proc->databaseId == databaseid)
count++;
}
@@ -1166,7 +1958,7 @@ CountUserBackends(Oid roleid)
volatile PGPROC *proc = arrayP->procs[index];
if (proc->pid == 0)
- continue; /* do not count prepared xacts */
+ continue; /* do not count prepared xacts or recovery procs */
if (proc->roleId == roleid)
count++;
}
@@ -1207,6 +1999,9 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
int autovac_pids[MAXAUTOVACPIDS];
int tries;
+ /* Gives wrong answer in recovery, so make sure we don't use it */
+ Assert(!IsRecoveryProcessingMode());
+
/* 50 tries with 100ms sleep between tries makes 5 sec total wait */
for (tries = 0; tries < 50; tries++)
{
@@ -1367,3 +2162,243 @@ DisplayXidCache(void)
}
#endif /* XIDCACHE_DEBUG */
+
+/* ----------------------------------------------
+ * UnobservedTransactions sub-module
+ * ----------------------------------------------
+ *
+ * All functions must be called holding ProcArrayLock.
+ */
+
+/*
+ * Add unobserved xids to end of UnobservedXids array
+ */
+void
+UnobservedTransactionsAddXids(TransactionId firstXid, TransactionId lastXid)
+{
+ TransactionId ixid = firstXid;
+ int index = procArray->numUnobservedXids;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ Assert(TransactionIdIsNormal(firstXid));
+ Assert(TransactionIdIsNormal(lastXid));
+ Assert(TransactionIdPrecedes(firstXid, lastXid));
+
+ /*
+ * UnobservedXids is maintained as an ascending list of xids, with no gaps.
+ * Incoming xids are always higher than previous entries, so we just add
+ * them directly to the end of the array.
+ */
+ while (ixid != lastXid)
+ {
+ /*
+ * check to see if we have space to store more UnobservedXids
+ */
+ if (index >= procArray->maxUnobservedXids)
+ {
+ UnobservedTransactionsDisplay(WARNING);
+ elog(FATAL, "no more room in UnobservedXids array");
+ }
+
+ /*
+ * append ixid to UnobservedXids
+ */
+#ifdef USE_ASSERT_CHECKING
+ if (TransactionIdIsValid(UnobservedXids[index]))
+ {
+ UnobservedTransactionsDisplay(LOG);
+ elog(FATAL, "unobservedxids leak: adding xid %u onto existing entry %d",
+ ixid, UnobservedXids[index]);
+ }
+
+ if ((index > 0 && TransactionIdPrecedes(ixid, UnobservedXids[index - 1])))
+ {
+ UnobservedTransactionsDisplay(LOG);
+ elog(FATAL, "UnobservedXids leak: adding xid %u out of order at index %d",
+ ixid, index);
+ }
+#endif
+
+ elog(trace_recovery(DEBUG4), "adding unobservedxid %u (numxids %d min %u max %u)",
+ ixid, procArray->numUnobservedXids,
+ UnobservedXids[0],
+ index > 0 ? UnobservedXids[index - 1] : InvalidTransactionId);
+ UnobservedXids[index] = ixid;
+ index++;
+
+ TransactionIdAdvance(ixid);
+ }
+
+ procArray->numUnobservedXids = index;
+}
+
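
Note the half-open interval: lastXid itself is never added. A worked example:

    /* With UnobservedXids = [95, 97] and numUnobservedXids = 2,
     * UnobservedTransactionsAddXids(100, 103) appends 100, 101 and 102
     * (not 103), giving [95, 97, 100, 101, 102] and numUnobservedXids = 5. */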
+/*
+ * Remove one unobserved xid from anywhere on UnobservedXids array.
+ * If xid has already been pruned away, no need to report as missing.
+ */
+void
+UnobservedTransactionsRemoveXid(TransactionId xid, bool missing_is_error)
+{
+ int index;
+ bool found = false;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ /*
+ * If we haven't initialised the array yet, or if we've already pruned
+ * past this xid, ignore the request and get on with it. Beyond this
+ * point, a requested removal that finds no match is an ERROR (when
+ * missing_is_error is set).
+ */
+ if (procArray->numUnobservedXids == 0 ||
+ (procArray->numUnobservedXids > 0 &&
+ TransactionIdPrecedes(xid, UnobservedXids[0])))
+ return;
+
+ elog(trace_recovery(DEBUG4), "remove unobservedxid %u (numxids %d min %u max %u)",
+ xid, procArray->numUnobservedXids,
+ UnobservedXids[0],
+ UnobservedXids[procArray->numUnobservedXids - 1]);
+
+ /*
+ * Locate our xid, and if found shunt others sideways to close the gap.
+ */
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (!found)
+ {
+ if (UnobservedXids[index] == xid)
+ found = true;
+ }
+ else
+ {
+ UnobservedXids[index - 1] = UnobservedXids[index];
+ }
+ }
+
+ if (found)
+ {
+ UnobservedXids[--procArray->numUnobservedXids] = InvalidTransactionId;
+ }
+
+ if (!found && missing_is_error)
+ {
+ UnobservedTransactionsDisplay(LOG);
+ elog(ERROR, "could not remove unobserved xid = %d", xid);
+ }
+}
+
+/*
+ * Prune array up to a particular limit. This frequently means clearing the
+ * whole array, so check for that first.
+ */
+void
+UnobservedTransactionsPruneXids(TransactionId limitXid)
+{
+ int index;
+ int pruneUpToThisIndex = 0;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ if (procArray->numUnobservedXids == 0 ||
+ TransactionIdFollowsOrEquals(limitXid,
+ UnobservedXids[procArray->numUnobservedXids - 1]))
+ {
+ UnobservedTransactionsClearXids();
+ return;
+ }
+
+ elog(trace_recovery(DEBUG4), "prune unobservedxids up to %u (numxids %d min %u max %u)",
+ limitXid,
+ procArray->numUnobservedXids,
+ UnobservedXids[0],
+ UnobservedXids[procArray->numUnobservedXids - 1]);
+
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (TransactionIdFollowsOrEquals(limitXid, UnobservedXids[index]))
+ pruneUpToThisIndex = index + 1;
+ else
+ {
+ /*
+ * Anything to delete?
+ */
+ if (pruneUpToThisIndex == 0)
+ return;
+
+ /*
+ * Move unpruned values to start of array
+ */
+ UnobservedXids[index - pruneUpToThisIndex] = UnobservedXids[index];
+ UnobservedXids[index] = 0;
+ }
+ }
+
+ procArray->numUnobservedXids -= pruneUpToThisIndex;
+}
+
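
A worked example of the pruning behaviour:

    /* With UnobservedXids = [100, 101, 104, 105] and limitXid = 103,
     * entries 100 and 101 are pruned and the rest slide left, giving
     * [104, 105, 0, 0] and numUnobservedXids = 2. With limitXid = 105
     * the whole array is simply cleared. */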
+/*
+ * Clear the whole array.
+ */
+void
+UnobservedTransactionsClearXids(void)
+{
+ int index;
+ TransactionId *UnobservedXids;
+
+ elog(trace_recovery(DEBUG4), "clear UnobservedXids");
+ UnobservedTransactionsDisplay(DEBUG4);
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ /*
+ * UnobservedTransactionsAddXids() asserts that slots are empty
+ * when it adds new values, so the array must be zeroed here each
+ * time. Adding needs to be fast and accurate; this can be slowish.
+ */
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ UnobservedXids[index] = 0;
+ }
+
+ procArray->numUnobservedXids = 0;
+}
+
+void
+UnobservedTransactionsDisplay(int trace_level)
+{
+ int index;
+ TransactionId *UnobservedXids;
+ StringInfoData buf;
+
+ initStringInfo(&buf);
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (TransactionIdIsValid(UnobservedXids[index]))
+ appendStringInfo(&buf, "%u ", UnobservedXids[index]);
+ }
+
+ elog(trace_level, "%d unobserved xids %s", procArray->numUnobservedXids, buf.data);
+
+ pfree(buf.data);
+}
+
+bool
+XidInUnobservedTransactions(TransactionId xid)
+{
+ int index;
+ TransactionId *UnobservedXids;
+
+ UnobservedXids = (TransactionId *) &(procArray->procs[procArray->maxProcs]);
+
+ for (index = 0; index < procArray->numUnobservedXids; index++)
+ {
+ if (UnobservedXids[index] == xid)
+ return true;
+ }
+ return false;
+}
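
The invariants of this sub-module (ascending, gap-free storage; append-only adds; shift-left removal; prefix pruning) can be demonstrated in isolation. The following is a self-contained sketch, not part of the patch; it uses plain uint32 comparison instead of the wraparound-aware TransactionIdPrecedes():

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define MAXXIDS 16

    static uint32_t xids[MAXXIDS];   /* ascending, gap-free */
    static int      nxids = 0;

    /* append the half-open range [first, last), as AddXids does */
    static void
    add_range(uint32_t first, uint32_t last)
    {
        uint32_t x;

        for (x = first; x != last; x++)
        {
            assert(nxids < MAXXIDS);
            assert(nxids == 0 || xids[nxids - 1] < x);  /* stays ascending */
            xids[nxids++] = x;
        }
    }

    /* remove one xid from anywhere, shifting the rest left */
    static void
    remove_xid(uint32_t xid)
    {
        int     i;

        for (i = 0; i < nxids; i++)
            if (xids[i] == xid)
            {
                memmove(&xids[i], &xids[i + 1],
                        (nxids - i - 1) * sizeof(uint32_t));
                xids[--nxids] = 0;   /* keep trailing slots zeroed */
                return;
            }
    }

    /* drop the prefix of entries <= limit */
    static void
    prune_to(uint32_t limit)
    {
        int     keep = 0;

        while (keep < nxids && xids[keep] <= limit)
            keep++;
        memmove(xids, &xids[keep], (nxids - keep) * sizeof(uint32_t));
        memset(&xids[nxids - keep], 0, keep * sizeof(uint32_t));
        nxids -= keep;
    }

    int
    main(void)
    {
        add_range(100, 103);         /* [100, 101, 102] */
        remove_xid(101);             /* [100, 102] */
        prune_to(100);               /* [102] */
        printf("%d left, first = %u\n", nxids, xids[0]);
        return 0;
    }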
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
index cb4e0a942b..8e0a60f120 100644
--- a/src/backend/storage/ipc/sinvaladt.c
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -142,6 +142,7 @@ typedef struct ProcState
int nextMsgNum; /* next message number to read */
bool resetState; /* backend needs to reset its state */
bool signaled; /* backend has been sent catchup signal */
+ bool sendOnly; /* backend only sends, never receives */
/*
* Next LocalTransactionId to use for each idle backend slot. We keep
@@ -248,7 +249,7 @@ CreateSharedInvalidationState(void)
* Initialize a new backend to operate on the sinval buffer
*/
void
-SharedInvalBackendInit(void)
+SharedInvalBackendInit(bool sendOnly)
{
int index;
ProcState *stateP = NULL;
@@ -307,6 +308,7 @@ SharedInvalBackendInit(void)
stateP->nextMsgNum = segP->maxMsgNum;
stateP->resetState = false;
stateP->signaled = false;
+ stateP->sendOnly = sendOnly;
LWLockRelease(SInvalWriteLock);
@@ -578,7 +580,9 @@ SICleanupQueue(bool callerHasWriteLock, int minFree)
/*
* Recompute minMsgNum = minimum of all backends' nextMsgNum, identify
* the furthest-back backend that needs signaling (if any), and reset
- * any backends that are too far back.
+ * any backends that are too far back. Note that because we ignore
+ * sendOnly backends here it is possible for them to keep sending
+ * messages without a problem even when they are the only active backend.
*/
min = segP->maxMsgNum;
minsig = min - SIG_THRESHOLD;
@@ -590,7 +594,7 @@ SICleanupQueue(bool callerHasWriteLock, int minFree)
int n = stateP->nextMsgNum;
/* Ignore if inactive or already in reset state */
- if (stateP->procPid == 0 || stateP->resetState)
+ if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
continue;
/*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 7c8b1f5aac..b9cd501f6c 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -35,9 +35,11 @@
#include "access/transam.h"
#include "access/twophase.h"
#include "access/twophase_rmgr.h"
+#include "access/xact.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
+#include "storage/sinval.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/resowner.h"
@@ -490,6 +492,15 @@ LockAcquire(const LOCKTAG *locktag,
if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
elog(ERROR, "unrecognized lock mode: %d", lockmode);
+ if (IsRecoveryProcessingMode() &&
+ locktag->locktag_type == LOCKTAG_OBJECT &&
+ lockmode > AccessShareLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot acquire lockmode %s on database objects while recovery is in progress",
+ lockMethodTable->lockModeNames[lockmode]),
+ errhint("Only AccessShareLock can be acquired on database objects during recovery.")));
+
#ifdef LOCK_DEBUG
if (LOCK_DEBUG_ENABLED(locktag))
elog(LOG, "LockAcquire: lock [%u,%u] %s",
@@ -817,6 +828,54 @@ LockAcquire(const LOCKTAG *locktag,
LWLockRelease(partitionLock);
+ /*
+ * We made it all the way here. We've got the lock and we've got
+ * it for the first time in this transaction. So now it's time
+ * to send a WAL message so that standby servers can see this event,
+ * if it's an AccessExclusiveLock on a relation.
+ */
+ if (!IsRecoveryProcessingMode() && lockmode >= AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION)
+ {
+ XLogRecData rdata;
+ xl_rel_lock xlrec;
+ TransactionId xid;
+
+ /*
+ * First thing we do is ensure that a TransactionId has been
+ * assigned to this transaction. We don't actually need the xid
+ * but if we don't do this then RecordTransactionCommit() and
+ * RecordTransactionAbort() will optimise away the transaction
+ * completion record which recovery relies upon to release locks.
+ * It's a hack, but this is a corner case not worth adding code
+ * to the main commit path for.
+ */
+ xid = GetTopTransactionId();
+ Assert(TransactionIdIsValid(xid));
+
+ Assert(OidIsValid(locktag->locktag_field2));
+
+ START_CRIT_SECTION();
+
+ /*
+ * Decode the locktag back to the original values, to avoid
+ * sending lots of empty bytes with every message. See
+ * lock.h to check how a locktag is defined for LOCKTAG_RELATION
+ */
+ xlrec.xid = xid;
+ xlrec.dbOid = locktag->locktag_field1;
+ xlrec.relOid = locktag->locktag_field2;
+
+ rdata.data = (char *) (&xlrec);
+ rdata.len = sizeof(xl_rel_lock);
+ rdata.buffer = InvalidBuffer;
+ rdata.next = NULL;
+
+ (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_LOCK, &rdata);
+
+ END_CRIT_SECTION();
+ }
+
return LOCKACQUIRE_OK;
}
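
On the standby, the new RM_RELATION_ID resource manager (defined elsewhere in this patch) replays this record. A hedged sketch of what that redo step amounts to, with the function name invented for illustration:

    static void
    relation_redo_lock_sketch(xl_rel_lock *xlrec)
    {
        LOCKTAG     locktag;

        /* rebuild the locktag from the decoded fields written above */
        SET_LOCKTAG_RELATION(locktag, xlrec->dbOid, xlrec->relOid);

        /* take the lock on behalf of recovery, waiting if necessary */
        (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
    }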
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index f2ccbe14e7..ea55be4f48 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -693,3 +693,18 @@ LWLockHeldByMe(LWLockId lockid)
}
return false;
}
+
+void
+PrintLWLocksHeldByMe(void)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ elog(LOG, "leak held_lwlocks[%d] = %d", i, held_lwlocks[i]);
+}
+
+int
+NumLWLocksHeldByMe(void)
+{
+ return num_held_lwlocks;
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 9e871eff92..489c9a07d5 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -28,6 +28,10 @@
*
* ProcKill -- destroys the shared memory state (and locks)
* associated with the process.
+ *
+ * In 8.4 we introduce the idea of recovery procs which hold state
+ * information for transactions currently being replayed. Many of the
+ * functions here apply only to real procs representing connected users.
*/
#include "postgres.h"
@@ -103,6 +107,8 @@ ProcGlobalShmemSize(void)
size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC)));
/* MyProcs, including autovacuum */
size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
+ /* RecoveryProcs, including recovery actions by autovacuum */
+ size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC)));
/* ProcStructLock */
size = add_size(size, sizeof(slock_t));
@@ -172,6 +178,7 @@ InitProcGlobal(void)
*/
ProcGlobal->freeProcs = NULL;
ProcGlobal->autovacFreeProcs = NULL;
+ ProcGlobal->freeRecoveryProcs = NULL;
ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
@@ -204,6 +211,35 @@ InitProcGlobal(void)
ProcGlobal->autovacFreeProcs = &procs[i];
}
+ /*
+ * Create enough recovery procs so there is a shadow proc for every
+ * proc on the master, including normal procs, autovac procs and
+ * anything else that might run transactions and write WAL.
+ * Bgwriter writes WAL but does not have a TransactionId, so ignore.
+ * We use the same procs for prepared transactions whether we are
+ * in recovery or not, so no space required for them either.
+ *
+ * Recovery procs are ghosts which store just enough information
+ * to make them look real to anyone requesting a snapshot from the
+ * procarray. So recovery procs don't need semaphores because they
+ * aren't actually performing any work.
+ *
+ * Although the recovery procs tie up some shared memory they will
+ * not be part of the ProcArray once the database has fully started
+ * up, so there is little performance effect during normal running.
+ */
+ procs = (PGPROC *) ShmemAlloc((MaxBackends) * sizeof(PGPROC));
+ if (!procs)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ MemSet(procs, 0, MaxBackends * sizeof(PGPROC));
+ for (i = 0; i < MaxBackends; i++)
+ {
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeRecoveryProcs;
+ ProcGlobal->freeRecoveryProcs = &procs[i];
+ }
+
MemSet(AuxiliaryProcs, 0, NUM_AUXILIARY_PROCS * sizeof(PGPROC));
for (i = 0; i < NUM_AUXILIARY_PROCS; i++)
{
@@ -342,7 +378,7 @@ InitProcessPhase2(void)
/*
* Add our PGPROC to the PGPROC array in shared memory.
*/
- ProcArrayAdd(MyProc);
+ ProcArrayAdd(MyProc, true);
/*
* Arrange to clean that up at backend exit.
@@ -363,6 +399,11 @@ InitProcessPhase2(void)
* to the ProcArray or the sinval messaging mechanism, either. They also
* don't get a VXID assigned, since this is only useful when we actually
* hold lockmgr locks.
+ *
+ * Startup process however uses locks but never waits for them in the
+ * normal backend sense. Startup process also takes part in sinval messaging
+ * as a sendOnly process, so never reads messages from sinval queue. So
+ * Startup process does have a VXID and does show up in pg_locks.
*/
void
InitAuxiliaryProcess(void)
@@ -452,6 +493,153 @@ InitAuxiliaryProcess(void)
}
/*
+ * InitRecoveryProcess -- initialize a per-master process data structure
+ * for use when emulating transactions in recovery
+ *
+ * Note: returns NULL if no proc was available - this is not an error, it
+ * will just force a change of state in the proc array.
+ */
+PGPROC *
+InitRecoveryProcess(TransactionId xid)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+ PGPROC *ThisProc = NULL;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (procglobal == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ /*
+ * Try to get a proc struct from the free list. If this fails, we must be
+ * out of PGPROC structures.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ ThisProc = procglobal->freeRecoveryProcs;
+
+ if (ThisProc != NULL)
+ {
+ procglobal->freeRecoveryProcs = (PGPROC *) ThisProc->links.next;
+ SpinLockRelease(ProcStructLock);
+ }
+ else
+ {
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * If we were going to throw an ERROR for this, it would go here;
+ * instead we return NULL and let the caller handle it.
+ */
+ return NULL;
+ }
+
+ /*
+ * We haven't added it to proc array yet, so no locking required here.
+ */
+ ThisProc->xid = xid;
+
+ /*
+ * The following are not used for recovery procs
+ */
+ ThisProc->backendId = InvalidBackendId;
+ ThisProc->pid = 0;
+ ThisProc->waitStatus = STATUS_OK;
+ ThisProc->lxid = InvalidLocalTransactionId;
+ ThisProc->xmin = InvalidTransactionId;
+ ThisProc->databaseId = InvalidOid;
+ ThisProc->roleId = InvalidOid;
+ ThisProc->inCommit = false;
+ ThisProc->vacuumFlags = 0;
+ ThisProc->lwWaiting = false;
+ ThisProc->lwExclusive = false;
+ ThisProc->lwWaitLink = NULL;
+ ThisProc->waitLock = NULL;
+ ThisProc->waitProcLock = NULL;
+
+ /*
+ * There is little else to do. The recovery proc is never used to
+ * acquire buffers, nor will we ever acquire LWlocks using the proc.
+ * Deadlock checker is not active during recovery.
+ */
+ return ThisProc;
+}
+
+void
+FreeRecoveryProcess(PGPROC *proc)
+{
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ SpinLockAcquire(ProcStructLock);
+
+ /* Return struct to freelist */
+ proc->links.next = (SHM_QUEUE *) procglobal->freeRecoveryProcs;
+ procglobal->freeRecoveryProcs = proc;
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Additional initialisation for Startup process
+ */
+void
+PublishStartupProcessInformation(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * Record Startup process information, for use in ProcSendSignal().
+ * See comments there for further explanation.
+ */
+ procglobal->startupProc = MyProc;
+ procglobal->startupProcPid = MyProcPid;
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Set recovery conflict information for a single proc.
+ */
+void
+ProcSetRecoveryConflict(PGPROC *proc, XLogRecPtr conflict_LSN, int cancel_mode)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PGPROC *vproc = proc;
+
+ SpinLockAcquire(ProcStructLock);
+
+ vproc->recoveryConflictLSN = conflict_LSN;
+ vproc->recoveryConflictCancelMode = cancel_mode;
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Get recovery conflict information for a single proc.
+ */
+XLogRecPtr
+ProcGetRecoveryConflict(int *cancel_mode)
+{
+ XLogRecPtr conflict_LSN;
+
+ volatile PGPROC *vproc = MyProc;
+
+ SpinLockAcquire(ProcStructLock);
+
+ conflict_LSN = vproc->recoveryConflictLSN;
+ *cancel_mode = vproc->recoveryConflictCancelMode;
+
+ SpinLockRelease(ProcStructLock);
+
+ return conflict_LSN;
+}
+
+/*
* Check whether there are at least N free PGPROC objects.
*
* Note: this is designed on the assumption that N will generally be small.
@@ -565,17 +753,21 @@ ProcReleaseLocks(bool isCommit)
/*
* RemoveProcFromArray() -- Remove this process from the shared ProcArray.
+ *
+ * Only intended for use with real procs, not recovery procs.
*/
static void
RemoveProcFromArray(int code, Datum arg)
{
Assert(MyProc != NULL);
- ProcArrayRemove(MyProc, InvalidTransactionId);
+ ProcArrayRemove(MyProc, InvalidTransactionId, 0, NULL);
}
/*
* ProcKill() -- Destroy the per-proc data structure for
* this process. Release any of its held LW locks.
+ *
+ * Only intended for use with real procs, not recovery procs.
*/
static void
ProcKill(int code, Datum arg)
@@ -1271,7 +1463,31 @@ ProcWaitForSignal(void)
void
ProcSendSignal(int pid)
{
- PGPROC *proc = BackendPidGetProc(pid);
+ PGPROC *proc = NULL;
+
+ if (IsRecoveryProcessingMode())
+ {
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * Check to see whether it is the Startup process we wish to signal.
+ * This call is made by the buffer manager when it wishes to wake
+ * up a process that has been waiting for a buffer pin so it can
+ * obtain a cleanup lock using LockBufferForCleanup(). Startup is
+ * not a normal backend, so BackendPidGetProc() will not find it.
+ * So we remember the information for this special case.
+ */
+ if (pid == procglobal->startupProcPid)
+ proc = procglobal->startupProc;
+
+ SpinLockRelease(ProcStructLock);
+ }
+
+ if (proc == NULL)
+ proc = BackendPidGetProc(pid);
if (proc != NULL)
PGSemaphoreUnlock(&proc->sem);
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3781b55be8..de666acedf 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -2579,8 +2579,8 @@ StatementCancelHandler(SIGNAL_ARGS)
* the interrupt immediately. No point in interrupting if we're
* waiting for input, however.
*/
- if (ImmediateInterruptOK && InterruptHoldoffCount == 0 &&
- CritSectionCount == 0 && !DoingCommandRead)
+ if (InterruptHoldoffCount == 0 && CritSectionCount == 0 &&
+ (DoingCommandRead || ImmediateInterruptOK))
{
/* bump holdoff count to make ProcessInterrupts() a no-op */
/* until we are done getting ready for it */
@@ -2660,10 +2660,37 @@ ProcessInterrupts(void)
ereport(ERROR,
(errcode(ERRCODE_QUERY_CANCELED),
errmsg("canceling autovacuum task")));
- else
+ else
+ {
+ if (IsRecoveryProcessingMode())
+ {
+ int cancel_mode = 0;
+ XLogRecPtr conflict_LSN = ProcGetRecoveryConflict(&cancel_mode);
+
+ switch (cancel_mode)
+ {
+ case FATAL:
+ ereport(FATAL,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling session due to conflict with recovery")));
+ case ERROR:
+ if (XLogRecPtrIsValid(conflict_LSN))
+ SetBufferRecoveryConflictLSN(conflict_LSN);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_QUERY_CANCELED),
+ errmsg("canceling statement due to conflict with recovery")));
+ return;
+ default:
+ /* No conflict pending, so fall through */
+ break;
+ }
+ }
+
ereport(ERROR,
(errcode(ERRCODE_QUERY_CANCELED),
errmsg("canceling statement due to user request")));
+ }
}
/* If we get here, do nothing (probably, QueryCancelPending was reset) */
}
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 751d7deaa5..e7ad3faaca 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -287,10 +287,22 @@ ProcessUtility(Node *parsetree,
SetPGVariable("transaction_isolation",
list_make1(item->arg),
true);
+
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("transaction_read_only",
list_make1(item->arg),
true);
+ }
}
}
break;
@@ -305,6 +317,7 @@ ProcessUtility(Node *parsetree,
break;
case TRANS_STMT_PREPARE:
+ PreventCommandDuringRecovery();
if (!PrepareTransactionBlock(stmt->gid))
{
/* report unsuccessful commit in completionTag */
@@ -314,11 +327,13 @@ ProcessUtility(Node *parsetree,
break;
case TRANS_STMT_COMMIT_PREPARED:
+ PreventCommandDuringRecovery();
PreventTransactionChain(isTopLevel, "COMMIT PREPARED");
FinishPreparedTransaction(stmt->gid, true);
break;
case TRANS_STMT_ROLLBACK_PREPARED:
+ PreventCommandDuringRecovery();
PreventTransactionChain(isTopLevel, "ROLLBACK PREPARED");
FinishPreparedTransaction(stmt->gid, false);
break;
@@ -676,6 +691,7 @@ ProcessUtility(Node *parsetree,
break;
case T_GrantStmt:
+ PreventCommandDuringRecovery();
ExecuteGrantStmt((GrantStmt *) parsetree);
break;
@@ -846,6 +862,7 @@ ProcessUtility(Node *parsetree,
case T_NotifyStmt:
{
NotifyStmt *stmt = (NotifyStmt *) parsetree;
+ PreventCommandDuringRecovery();
Async_Notify(stmt->conditionname);
}
@@ -854,6 +871,7 @@ ProcessUtility(Node *parsetree,
case T_ListenStmt:
{
ListenStmt *stmt = (ListenStmt *) parsetree;
+ PreventCommandDuringRecovery();
Async_Listen(stmt->conditionname);
}
@@ -862,6 +880,7 @@ ProcessUtility(Node *parsetree,
case T_UnlistenStmt:
{
UnlistenStmt *stmt = (UnlistenStmt *) parsetree;
+ PreventCommandDuringRecovery();
if (stmt->conditionname)
Async_Unlisten(stmt->conditionname);
@@ -881,10 +900,12 @@ ProcessUtility(Node *parsetree,
break;
case T_ClusterStmt:
+ PreventCommandDuringRecovery();
cluster((ClusterStmt *) parsetree, isTopLevel);
break;
case T_VacuumStmt:
+ PreventCommandDuringRecovery();
vacuum((VacuumStmt *) parsetree, InvalidOid, true, NULL, false,
isTopLevel);
break;
@@ -1000,12 +1021,14 @@ ProcessUtility(Node *parsetree,
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be superuser to do CHECKPOINT")));
+ PreventCommandDuringRecovery();
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
break;
case T_ReindexStmt:
{
ReindexStmt *stmt = (ReindexStmt *) parsetree;
+ PreventCommandDuringRecovery();
switch (stmt->kind)
{
@@ -2490,3 +2513,12 @@ GetCommandLogLevel(Node *parsetree)
return lev;
}
+
+void
+PreventCommandDuringRecovery(void)
+{
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_READ_ONLY_SQL_TRANSACTION),
+ errmsg("cannot be run until recovery completes")));
+}
diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c
index 7e51f9e2ad..81814e6e2b 100644
--- a/src/backend/utils/adt/txid.c
+++ b/src/backend/utils/adt/txid.c
@@ -338,6 +338,12 @@ txid_current(PG_FUNCTION_ARGS)
txid val;
TxidEpoch state;
+ if (IsRecoveryProcessingMode())
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot assign txid while recovery is in progress"),
+ errhint("only read only queries can execute during recovery")));
+
load_xid_epoch(&state);
val = convert_xid(GetTopTransactionId(), &state);
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index 9738fa1c31..03a8ba372a 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -86,10 +86,16 @@
*/
#include "postgres.h"
+#include <signal.h>
+
+#include "access/transam.h"
#include "access/twophase_rmgr.h"
#include "access/xact.h"
#include "catalog/catalog.h"
#include "miscadmin.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
+#include "storage/proc.h"
#include "storage/sinval.h"
#include "storage/smgr.h"
#include "utils/inval.h"
@@ -155,6 +161,14 @@ typedef struct TransInvalidationInfo
static TransInvalidationInfo *transInvalInfo = NULL;
+static SharedInvalidationMessage *SharedInvalidMessagesArray;
+static int numSharedInvalidMessagesArray;
+static int maxSharedInvalidMessagesArray;
+
+static List *RecoveryLockList;
+static MemoryContext RelationLockContext;
+
+
/*
* Dynamically-registered callback functions. Current implementation
* assumes there won't be very many of these at once; could improve if needed.
@@ -741,6 +755,8 @@ AtStart_Inval(void)
MemoryContextAllocZero(TopTransactionContext,
sizeof(TransInvalidationInfo));
transInvalInfo->my_level = GetCurrentTransactionNestLevel();
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
}
/*
@@ -851,6 +867,126 @@ inval_twophase_postcommit(TransactionId xid, uint16 info,
}
}
+static void
+MakeSharedInvalidMessagesArray(const SharedInvalidationMessage *msgs, int n)
+{
+ /*
+ * Initialise array first time through in each commit
+ */
+ if (SharedInvalidMessagesArray == NULL)
+ {
+ maxSharedInvalidMessagesArray = FIRSTCHUNKSIZE;
+ numSharedInvalidMessagesArray = 0;
+
+ /*
+ * Although this is being palloc'd we don't actually free it directly.
+ * We're so close to EOXact that we know we're going to lose it anyhow.
+ */
+ SharedInvalidMessagesArray = palloc(maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ if ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ {
+ while ((numSharedInvalidMessagesArray + n) > maxSharedInvalidMessagesArray)
+ maxSharedInvalidMessagesArray *= 2;
+
+ SharedInvalidMessagesArray = repalloc(SharedInvalidMessagesArray,
+ maxSharedInvalidMessagesArray
+ * sizeof(SharedInvalidationMessage));
+ }
+
+ /*
+ * Append the next chunk onto the array
+ */
+ memcpy(SharedInvalidMessagesArray + numSharedInvalidMessagesArray,
+ msgs, n * sizeof(SharedInvalidationMessage));
+ numSharedInvalidMessagesArray += n;
+}
+
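
The doubling above makes appends amortised O(1). As a worked example (FIRSTCHUNKSIZE is defined earlier in this file; 32 is assumed here purely for illustration):

    /* If FIRSTCHUNKSIZE were 32, collecting 100 messages would grow the
     * array 32 -> 64 -> 128: two repallocs in total, rather than one
     * repalloc for every appended chunk. */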
+/*
+ * xactGetCommittedInvalidationMessages() is executed by
+ * RecordTransactionCommit() to add invalidation messages onto the
+ * commit record. This applies only to commit message types, never to
+ * abort records. Must always run before AtEOXact_Inval(), since that
+ * removes the data we need to see.
+ *
+ * Remember that this runs before we have officially committed, so we
+ * must not do anything here to change what might occur *if* we should
+ * fail between here and the actual commit.
+ *
+ * Note that transactional invalidation does *not* write an
+ * XLOG_RELATION_INVAL WAL message. Those are only used
+ * by non-transactional invalidation. See comments in
+ * EndNonTransactionalInvalidation().
+ *
+ * see also xact_redo_commit() and xact_desc_commit()
+ */
+int
+xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval)
+{
+ MemoryContext oldcontext;
+
+ /* Must be at top of stack */
+ Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
+
+ /*
+ * Relcache init file invalidation requires processing both before and
+ * after we send the SI messages. However, we need not do anything
+ * unless we committed.
+ */
+ if (transInvalInfo->RelcacheInitFileInval)
+ *RelcacheInitFileInval = true;
+ else
+ *RelcacheInitFileInval = false;
+
+ /*
+ * Walk through TransInvalidationInfo to collect all the messages
+ * into a single contiguous array of invalidation messages. It must
+ * be contiguous so we can copy directly into WAL message. Maintain the
+ * order that they would be processed in by AtEOXact_Inval(), to ensure
+ * emulated behaviour in redo is as similar as possible to original.
+ * We want the same bugs, if any, not new ones.
+ */
+ oldcontext = MemoryContextSwitchTo(CurTransactionContext);
+
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ ProcessInvalidationMessagesMulti(&transInvalInfo->PriorCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ MemoryContextSwitchTo(oldcontext);
+
+#ifdef STANDBY_INVAL_DEBUG
+ if (numSharedInvalidMessagesArray > 0)
+ {
+ int i;
+
+ elog(LOG, "numSharedInvalidMessagesArray = %d", numSharedInvalidMessagesArray);
+
+ Assert(SharedInvalidMessagesArray != NULL);
+
+ for (i = 0; i < numSharedInvalidMessagesArray; i++)
+ {
+ SharedInvalidationMessage *msg = SharedInvalidMessagesArray + i;
+
+ if (msg->id >= 0)
+ elog(LOG, "catcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ elog(LOG, "relcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ elog(LOG, "smgr cache id %d", msg->id);
+ }
+ }
+#endif
+
+ if (numSharedInvalidMessagesArray > 0)
+ Assert(SharedInvalidMessagesArray != NULL);
+
+ *msgs = SharedInvalidMessagesArray;
+
+ return numSharedInvalidMessagesArray;
+}
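+
+/*
+ * An illustrative sketch of the expected caller pattern (an assumption for
+ * illustration only; per the comment above, the real caller is
+ * RecordTransactionCommit() in xact.c). The returned array is attached to
+ * the commit record, whose header carries the message count:
+ *
+ *     SharedInvalidationMessage *invalMsgs;
+ *     bool    relcacheInval;
+ *
+ *     xlrec.nmsgs = xactGetCommittedInvalidationMessages(&invalMsgs,
+ *                                                        &relcacheInval);
+ *     if (relcacheInval)
+ *         xlrec.xinfo |= XACT_COMPLETION_UPDATE_RELCACHE_FILE;
+ */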
/*
* AtEOXact_Inval
@@ -1041,6 +1177,42 @@ BeginNonTransactionalInvalidation(void)
Assert(transInvalInfo->CurrentCmdInvalidMsgs.cclist == NULL);
Assert(transInvalInfo->CurrentCmdInvalidMsgs.rclist == NULL);
Assert(transInvalInfo->RelcacheInitFileInval == false);
+
+ SharedInvalidMessagesArray = NULL;
+ numSharedInvalidMessagesArray = 0;
+}
+
+/*
+ * General function to log the SharedInvalidMessagesArray. The only current
+ * caller is EndNonTransactionalInvalidation(), but that may change.
+ */
+static void
+LogSharedInvalidMessagesArray(void)
+{
+ XLogRecData rdata[2];
+ xl_rel_inval xlrec;
+
+ if (numSharedInvalidMessagesArray == 0)
+ return;
+
+ START_CRIT_SECTION();
+
+ xlrec.nmsgs = numSharedInvalidMessagesArray;
+
+ rdata[0].data = (char *) (&xlrec);
+ rdata[0].len = MinSizeOfRelationInval;
+ rdata[0].buffer = InvalidBuffer;
+
+ rdata[0].next = &(rdata[1]);
+ rdata[1].data = (char *) SharedInvalidMessagesArray;
+ rdata[1].len = numSharedInvalidMessagesArray *
+ sizeof(SharedInvalidationMessage);
+ rdata[1].buffer = InvalidBuffer;
+ rdata[1].next = NULL;
+
+ (void) XLogInsert(RM_RELATION_ID, XLOG_RELATION_INVAL, rdata);
+
+ END_CRIT_SECTION();
}
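+
+/*
+ * Illustrative note (not part of the original patch): the two rdata chunks
+ * above produce a record whose layout matches xl_rel_inval in
+ * storage/sinval.h: a MinSizeOfRelationInval header followed by nmsgs
+ * SharedInvalidationMessage entries, which is exactly what
+ * relation_redo_inval() reads back on the standby.
+ */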
/*
@@ -1081,7 +1253,25 @@ EndNonTransactionalInvalidation(void)
ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
SendSharedInvalidMessages);
+ /*
+ * Write invalidation messages to WAL. This is not required for crash
+ * recovery; it is only required for standby servers, and the overhead
+ * is fairly low. It allows us to trigger inval messages on the standby
+ * as soon as we see these records; see relation_redo_inval().
+ *
+ * Note that transactional invalidation attaches its messages to the
+ * WAL commit record instead, so these records are rare.
+ */
+ ProcessInvalidationMessagesMulti(&transInvalInfo->CurrentCmdInvalidMsgs,
+ MakeSharedInvalidMessagesArray);
+ LogSharedInvalidMessagesArray();
+
/* Clean up and release memory */
+
+ /* XXXHS: Think some more on memory allocation and freeing. */
+
for (chunk = transInvalInfo->CurrentCmdInvalidMsgs.cclist;
chunk != NULL;
chunk = next)
@@ -1235,3 +1425,455 @@ CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
++relcache_callback_count;
}
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+static void
+InitStandbyDelayTimers(int *currentDelay_ms, int *standbyWait_ms)
+{
+ *currentDelay_ms = GetLatestReplicationDelay();
+
+ /*
+ * If the reported replication delay is negative, which can happen when
+ * the delay is enormously huge (e.g. while replaying old log files),
+ * just treat it as zero and work up from there. This prevents us from
+ * acting foolishly when replaying old log files.
+ */
+ if (*currentDelay_ms < 0)
+ *currentDelay_ms = 0;
+
+#define STANDBY_INITIAL_WAIT_MS 1
+ *standbyWait_ms = STANDBY_INITIAL_WAIT_MS;
+}
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while, then return. Returns true if we decide we
+ * can't wait any longer, false if we can wait some more.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(int *currentDelay_ms, int *standbyWait_ms)
+{
+ int maxStandbyDelay_ms = maxStandbyDelay * 1000;
+
+ /*
+ * If the server is already further behind than we would
+ * like then no need to wait or do more complex logic.
+ * max_standby_delay = -1 means wait forever, if necessary
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms >= maxStandbyDelay_ms)
+ return true;
+
+ /*
+ * Sleep, then do bookkeeping.
+ */
+ pg_usleep(*standbyWait_ms * 1000L);
+ *currentDelay_ms += *standbyWait_ms;
+
+ /*
+ * Progressively increase the sleep times.
+ */
+ *standbyWait_ms *= 2;
+ if (*standbyWait_ms > 1000)
+ *standbyWait_ms = 1000;
+
+ /*
+ * Re-test our exit criteria
+ */
+ if (maxStandbyDelay >= 0 &&
+ *currentDelay_ms >= maxStandbyDelay_ms)
+ return true;
+
+ return false;
+}
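+
+/*
+ * A worked example (not part of the original patch): with
+ * STANDBY_INITIAL_WAIT_MS = 1 the successive sleeps are 1, 2, 4, ... 512 ms
+ * (roughly one second in total after ten iterations), after which each
+ * sleep is capped at 1000 ms. So we poll about once per second until the
+ * accumulated delay exceeds max_standby_delay, which is given in seconds.
+ */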
+
+/*
+ * This is the main executioner for any query backend that conflicts with
+ * recovery processing. Judgement has already been passed on it within
+ * a specific rmgr. Here we just issue the orders to the procs. The procs
+ * then throw the required error as instructed.
+ *
+ * We may ask for a specific cancel_mode, typically ERROR or FATAL.
+ *
+ * If we want an ERROR, we may defer that until the buffer manager
+ * sees a recently changed block. If we want this we must specify a
+ * valid conflict_LSN.
+ */
+void
+ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ char *reason, int cancel_mode,
+ XLogRecPtr conflict_LSN)
+{
+ int standbyWait_ms;
+ int currentDelay_ms;
+ bool logged;
+ int wontDieWait = 1;
+
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /*
+ * log that we have been waiting for a while now...
+ */
+ if (!logged && standbyWait_ms > 500)
+ {
+ elog(trace_recovery(DEBUG5),
+ "virtual transaction %u/%u is blocking %s",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ reason);
+ logged = true;
+ }
+
+ if (ConditionalVirtualXactLockTableWait(*waitlist))
+ {
+ waitlist++;
+ InitStandbyDelayTimers(&currentDelay_ms, &standbyWait_ms);
+ logged = false;
+ }
+ else if (WaitExceedsMaxStandbyDelay(&currentDelay_ms,
+ &standbyWait_ms))
+ {
+ /*
+ * Now find out who to throw out of the balloon.
+ */
+ PGPROC *proc;
+
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ proc = VirtualTransactionIdGetProc(*waitlist);
+
+ /*
+ * Kill the pid if it's still here. If not, that's what we wanted
+ * so ignore any errors.
+ */
+ if (proc)
+ {
+ /*
+ * Startup process debug messages
+ */
+ switch (cancel_mode)
+ {
+ case FATAL:
+ elog(trace_recovery(DEBUG2),
+ "recovery disconnects session with pid %d "
+ "because of conflict with %s (current delay %d secs)",
+ proc->pid,
+ reason,
+ currentDelay_ms / 1000);
+ break;
+ case ERROR:
+ if (XLogRecPtrIsValid(conflict_LSN))
+ elog(trace_recovery(DEBUG2),
+ "recovery signals virtual transaction %u/%u pid %d "
+ "for deferred cancelation with LSN %X/%X "
+ "because of conflict with %s (current delay %d secs)",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ proc->pid,
+ conflict_LSN.xlogid,
+ conflict_LSN.xrecoff,
+ reason,
+ currentDelay_ms / 1000);
+ else
+ elog(trace_recovery(DEBUG2),
+ "recovery cancels virtual transaction %u/%u pid %d "
+ "because of conflict with %s (current delay %d secs)",
+ waitlist->backendId,
+ waitlist->localTransactionId,
+ proc->pid,
+ reason,
+ currentDelay_ms / 1000);
+ break;
+ default:
+ /* No conflict pending, so fall through */
+ break;
+ }
+
+ Assert(proc->pid != 0);
+
+ /*
+ * Issue orders for the proc to read next time it receives SIGINT
+ */
+ ProcSetRecoveryConflict(proc, conflict_LSN, cancel_mode);
+
+ /*
+ * Do we expect it to talk? No, Mr. Bond, we expect it to die.
+ */
+ kill(proc->pid, SIGINT);
+
+ /*
+ * Wait, if the instruction is expected to complete quickly
+ */
+ if (!XLogRecPtrIsValid(conflict_LSN))
+ {
+ /* wait awhile for it to die */
+ pg_usleep(wontDieWait * 5000L);
+ wontDieWait *= 2;
+ }
+ }
+ }
+ }
+}
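+
+/*
+ * An illustrative sketch of the backend side of this protocol (an
+ * assumption for illustration; the actual handling lives in the backend's
+ * SIGINT processing). A signalled backend retrieves its orders and acts
+ * on them:
+ *
+ *     int         cancel_mode;
+ *     XLogRecPtr  lsn = ProcGetRecoveryConflict(&cancel_mode);
+ *
+ *     if (XLogRecPtrIsValid(lsn))
+ *         SetBufferRecoveryConflictLSN(lsn);   -- defer the cancel
+ *     else
+ *         ereport(cancel_mode, ...);           -- ERROR or FATAL now
+ */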
+
+/*
+ * -----------------------------------------------------
+ * Locking in Recovery Mode
+ * -----------------------------------------------------
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and, in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation, and don't worry about lock queuing.
+ *
+ * We keep a single dynamically expandable lock list in local memory.
+ * List elements use type xl_rel_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need
+ * ResourceOwners.
+ */
+
+/* called by relation_redo_lock() */
+static void
+RelationAddRecoveryLock(xl_rel_lock *lockRequest)
+{
+ xl_rel_lock *newlock;
+ LOCKTAG locktag;
+ MemoryContext old_context;
+
+ elog(trace_recovery(DEBUG4),
+ "adding recovery lock: db %d rel %d",
+ lockRequest->dbOid, lockRequest->relOid);
+
+ /*
+ * dbOid is InvalidOid when we are locking a shared relation.
+ */
+ Assert(OidIsValid(lockRequest->relOid));
+
+ if (RelationLockContext == NULL)
+ RelationLockContext = AllocSetContextCreate(TopMemoryContext,
+ "RelationLocks",
+ ALLOCSET_DEFAULT_MINSIZE,
+ ALLOCSET_DEFAULT_INITSIZE,
+ ALLOCSET_DEFAULT_MAXSIZE);
+
+ old_context = MemoryContextSwitchTo(RelationLockContext);
+ newlock = palloc(sizeof(xl_rel_lock));
+ MemoryContextSwitchTo(old_context);
+
+ newlock->xid = lockRequest->xid;
+ newlock->dbOid = lockRequest->dbOid;
+ newlock->relOid = lockRequest->relOid;
+ RecoveryLockList = lappend(RecoveryLockList, newlock);
+
+ /*
+ * Attempt to acquire the lock as requested.
+ */
+ SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid);
+
+ /*
+ * Wait for the lock to clear, or kill anyone in our way. This is not a
+ * completely foolproof way of getting the lock, but we cannot afford
+ * to sit and wait for it indefinitely. This is one reason to reduce
+ * the strength of various locks in 8.4.
+ */
+ while (LockAcquire(&locktag, AccessExclusiveLock, true, true)
+ == LOCKACQUIRE_NOT_AVAIL)
+ {
+ VirtualTransactionId *old_lockholders;
+
+ old_lockholders = GetLockConflicts(&locktag, AccessExclusiveLock);
+ ResolveRecoveryConflictWithVirtualXIDs(old_lockholders,
+ "exclusive lock",
+ ERROR,
+ InvalidXLogRecPtr);
+ }
+}
+
+static void
+RelationRemoveRecoveryLocks(TransactionId xid)
+{
+ ListCell *l;
+ LOCKTAG locktag;
+ List *deletionList = NIL;
+
+ /*
+ * Release all matching locks and identify list elements to remove
+ */
+ foreach(l, RecoveryLockList)
+ {
+ xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+ elog(trace_recovery(DEBUG4),
+ "releasing recovery lock: xid %u db %d rel %d",
+ lock->xid, lock->dbOid, lock->relOid);
+
+ if (!TransactionIdIsValid(xid) || lock->xid == xid)
+ {
+ SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid);
+ if (!LockRelease(&locktag, AccessExclusiveLock, true))
+ elog(trace_recovery(LOG),
+ "RecoveryLockList contains entry for lock "
+ "no longer recorded by lock manager "
+ "xid %u database %d relation %d",
+ lock->xid, lock->dbOid, lock->relOid);
+ deletionList = lappend(deletionList, lock);
+ }
+ }
+
+ /*
+ * Now remove the elements from RecoveryLockList. We can't navigate
+ * the list at the same time as deleting multiple elements from it.
+ */
+ foreach(l, deletionList)
+ {
+ xl_rel_lock *lock = (xl_rel_lock *) lfirst(l);
+
+ RecoveryLockList = list_delete_ptr(RecoveryLockList, lock);
+ pfree(lock);
+ }
+}
+
+/*
+ * Called during xact_redo_commit() and xact_redo_abort() when InArchiveRecovery
+ * to remove any AccessExclusiveLocks requested by a transaction.
+ *
+ * Remove the lock tree, starting at xid down, from the RecoveryLockList.
+ */
+void
+RelationReleaseRecoveryLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
+{
+ int i;
+
+ RelationRemoveRecoveryLocks(xid);
+
+ for (i = 0; i < nsubxids; i++)
+ RelationRemoveRecoveryLocks(subxids[i]);
+}
+
+/*
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ */
+void
+RelationClearRecoveryLocks(void)
+{
+ elog(trace_recovery(DEBUG1), "clearing recovery locks");
+ RelationRemoveRecoveryLocks(InvalidTransactionId);
+}
+
+/*
+ * --------------------------------------------------
+ * Recovery handling for Rmgr RM_RELATION_ID
+ * --------------------------------------------------
+ */
+
+/*
+ * Redo for relation lock messages
+ */
+static void
+relation_redo_lock(xl_rel_lock *xlrec)
+{
+ RelationAddRecoveryLock(xlrec);
+}
+
+/*
+ * Redo for relation invalidation messages
+ */
+static void
+relation_redo_inval(xl_rel_inval *xlrec)
+{
+ SharedInvalidationMessage *msgs = &(xlrec->msgs[0]);
+ int nmsgs = xlrec->nmsgs;
+
+ Assert(nmsgs > 0); /* else we should not have written a record */
+
+ /*
+ * Smack them straight onto the queue and we're done. This is safe
+ * because the only writer of these messages is non-transactional
+ * invalidation.
+ */
+ SendSharedInvalidMessages(msgs, nmsgs);
+}
+
+void
+relation_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+ uint8 info = record->xl_info & ~XLR_INFO_MASK;
+
+ if (InArchiveRecovery)
+ (void) RecordKnownAssignedTransactionIds(lsn, record->xl_topxid, record->xl_xid);
+
+ if (info == XLOG_RELATION_INVAL)
+ {
+ xl_rel_inval *xlrec = (xl_rel_inval *) XLogRecGetData(record);
+
+ relation_redo_inval(xlrec);
+ }
+ else if (info == XLOG_RELATION_LOCK)
+ {
+ xl_rel_lock *xlrec = (xl_rel_lock *) XLogRecGetData(record);
+
+ relation_redo_lock(xlrec);
+ }
+ else
+ elog(PANIC, "relation_redo: unknown op code %u", info);
+}
+
+static void
+relation_desc_inval(StringInfo buf, xl_rel_inval *xlrec)
+{
+ SharedInvalidationMessage *msgs = &(xlrec->msgs[0]);
+ int nmsgs = xlrec->nmsgs;
+
+ appendStringInfo(buf, "nmsgs %d;", nmsgs);
+
+ if (nmsgs > 0)
+ {
+ int i;
+
+ for (i = 0; i < nmsgs; i++)
+ {
+ SharedInvalidationMessage *msg = msgs + i;
+
+ if (msg->id >= 0)
+ appendStringInfo(buf, "catcache id %d", msg->id);
+ else if (msg->id == SHAREDINVALRELCACHE_ID)
+ appendStringInfo(buf, "relcache ");
+ else if (msg->id == SHAREDINVALSMGR_ID)
+ appendStringInfo(buf, "smgr ");
+ }
+ }
+}
+
+void
+relation_desc(StringInfo buf, uint8 xl_info, char *rec)
+{
+ uint8 info = xl_info & ~XLR_INFO_MASK;
+
+ if (info == XLOG_RELATION_INVAL)
+ {
+ xl_rel_inval *xlrec = (xl_rel_inval *) rec;
+
+ appendStringInfo(buf, "inval: ");
+ relation_desc_inval(buf, xlrec);
+ }
+ else if (info == XLOG_RELATION_LOCK)
+ {
+ xl_rel_lock *xlrec = (xl_rel_lock *) rec;
+
+ appendStringInfo(buf, "exclusive relation lock: xid %u db %d rel %d",
+ xlrec->xid, xlrec->dbOid, xlrec->relOid);
+ }
+ else
+ appendStringInfo(buf, "UNKNOWN");
+}
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index a33c94ed67..67adc7afa6 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -2579,3 +2579,20 @@ is_log_level_output(int elevel, int log_min_level)
return false;
}
+
+/*
+ * If trace_recovery_messages is set so that this message should be visible,
+ * promote it to LOG; otherwise return the requested level unchanged. In the
+ * latter case the message may still be shown, but only if log_min_messages
+ * is set low enough.
+ *
+ * The intention is to keep this for at least the whole of the 8.4 production
+ * release, so we can more easily diagnose production problems in the field.
+ */
+int
+trace_recovery(int trace_level)
+{
+ if (trace_level >= trace_recovery_messages)
+ return LOG;
+
+ return trace_level;
+}
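+
+/*
+ * Usage note (not part of the original patch): call sites wrap the level,
+ * e.g.
+ *
+ *     elog(trace_recovery(DEBUG2), "recovery cancels ...");
+ *
+ * With the default trace_recovery_messages = DEBUG1 this emits at DEBUG2
+ * as usual; setting trace_recovery_messages to DEBUG2 or a more verbose
+ * level promotes the same message to LOG.
+ */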
diff --git a/src/backend/utils/init/flatfiles.c b/src/backend/utils/init/flatfiles.c
index 9dbc53c159..404a8f753c 100644
--- a/src/backend/utils/init/flatfiles.c
+++ b/src/backend/utils/init/flatfiles.c
@@ -678,9 +678,10 @@ write_auth_file(Relation rel_authid, Relation rel_authmem)
/*
* This routine is called once during database startup, after completing
* WAL replay if needed. Its purpose is to sync the flat files with the
- * current state of the database tables. This is particularly important
- * during PITR operation, since the flat files will come from the
- * base backup which may be far out of sync with the current state.
+ * current state of the database tables.
+ *
+ * In 8.4 we also run this during xact_redo_commit() if the transaction
+ * wrote a new database or auth flat file.
*
* In theory we could skip rebuilding the flat files if no WAL replay
* occurred, but it seems best to just do it always. We have to
@@ -716,8 +717,6 @@ BuildFlatFiles(bool database_only)
/*
* We don't have any hope of running a real relcache, but we can use the
* same fake-relcache facility that WAL replay uses.
- *
- * No locking is needed because no one else is alive yet.
*/
rel_db = CreateFakeRelcacheEntry(rnode);
write_database_file(rel_db, true);
@@ -832,14 +831,14 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
/* Okay to write the files */
if (database_file_update_subid != InvalidSubTransactionId)
{
- database_file_update_subid = InvalidSubTransactionId;
+ /* reset database_file_update_subid later during commit */
write_database_file(drel, false);
heap_close(drel, NoLock);
}
if (auth_file_update_subid != InvalidSubTransactionId)
{
- auth_file_update_subid = InvalidSubTransactionId;
+ /* reset auth_file_update_subid later during commit */
write_auth_file(arel, mrel);
heap_close(arel, NoLock);
heap_close(mrel, NoLock);
@@ -859,6 +858,30 @@ AtEOXact_UpdateFlatFiles(bool isCommit)
ForceSyncCommit();
}
+/*
+ * Exported to allow transaction commit to set xinfo flags requesting a
+ * flat file update in redo. These functions also reset the per-transaction
+ * flags; in the abort case they were already reset during
+ * AtEOXact_UpdateFlatFiles().
+ */
+bool
+AtEOXact_Database_FlatFile_Update_Needed(void)
+{
+ bool result = TransactionIdIsValid(database_file_update_subid);
+
+ database_file_update_subid = InvalidSubTransactionId;
+
+ return result;
+}
+
+bool
+AtEOXact_Auth_FlatFile_Update_Needed(void)
+{
+ bool result = TransactionIdIsValid(auth_file_update_subid);
+
+ auth_file_update_subid = InvalidSubTransactionId;
+
+ return result;
+}
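+
+/*
+ * An illustrative sketch of the intended caller (an assumption for
+ * illustration; per the prototype comments in utils/flatfiles.h these are
+ * called by RecordTransactionCommit()):
+ *
+ *     uint32  xinfo = 0;
+ *
+ *     if (AtEOXact_Database_FlatFile_Update_Needed())
+ *         xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;
+ *     if (AtEOXact_Auth_FlatFile_Update_Needed())
+ *         xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;
+ */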
/*
* This routine is called during transaction prepare.
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index cf98323d2a..d39180bf69 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -440,7 +440,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
*/
MyBackendId = InvalidBackendId;
- SharedInvalBackendInit();
+ SharedInvalBackendInit(false);
if (MyBackendId > MaxBackends || MyBackendId <= 0)
elog(FATAL, "bad backend id: %d", MyBackendId);
@@ -489,9 +489,15 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
* Start a new transaction here before first access to db, and get a
* snapshot. We don't have a use for the snapshot itself, but we're
* interested in the secondary effect that it sets RecentGlobalXmin.
+ * If we are connecting during recovery, make sure the initial
+ * transaction is read only and force all subsequent transactions
+ * that way also.
*/
if (!bootstrap)
{
+ if (IsRecoveryProcessingMode())
+ SetConfigOption("default_transaction_read_only", "true",
+ PGC_POSTMASTER, PGC_S_OVERRIDE);
StartTransactionCommand();
(void) GetTransactionSnapshot();
}
@@ -515,7 +521,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
*/
if (!bootstrap)
LockSharedObject(DatabaseRelationId, MyDatabaseId, 0,
- RowExclusiveLock);
+ (IsRecoveryProcessingMode() ? AccessShareLock : RowExclusiveLock));
/*
* Recheck the flat file copy of pg_database to make sure the target
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 90f077a370..bd44494062 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -115,6 +115,8 @@ extern char *temp_tablespaces;
extern bool synchronize_seqscans;
extern bool fullPageWrites;
+int trace_recovery_messages = DEBUG1; /* XXXHS set to LOG for production */
+
#ifdef TRACE_SORT
extern bool trace_sort;
#endif
@@ -2635,6 +2637,16 @@ static struct config_enum ConfigureNamesEnum[] =
},
{
+ {"trace_recovery_messages", PGC_SUSET, LOGGING_WHEN,
+ gettext_noop("Sets the message levels that are logged during recovery."),
+ gettext_noop("Each level includes all the levels that follow it. The later"
+ " the level, the fewer messages are sent.")
+ },
+ &trace_recovery_messages,
+ DEBUG1, server_message_level_options, NULL, NULL
+ },
+
+ {
{"track_functions", PGC_SUSET, STATS_COLLECTOR,
gettext_noop("Collects function-level statistics on database activity."),
NULL
@@ -5501,8 +5513,19 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
SetPGVariable("transaction_isolation",
list_make1(item->arg), stmt->is_local);
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("transaction_read_only",
list_make1(item->arg), stmt->is_local);
+ }
else
elog(ERROR, "unexpected SET TRANSACTION element: %s",
item->defname);
@@ -5520,8 +5543,19 @@ ExecSetVariableStmt(VariableSetStmt *stmt)
SetPGVariable("default_transaction_isolation",
list_make1(item->arg), stmt->is_local);
else if (strcmp(item->defname, "transaction_read_only") == 0)
+ {
+ A_Const *con;
+
+ Assert(IsA(item->arg, A_Const));
+ con = (A_Const *) item->arg;
+ Assert(nodeTag(&con->val) == T_Integer);
+
+ if (!intVal(&con->val))
+ PreventCommandDuringRecovery();
+
SetPGVariable("default_transaction_read_only",
list_make1(item->arg), stmt->is_local);
+ }
else
elog(ERROR, "unexpected SET SESSION element: %s",
item->defname);
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 9992895941..f6e043399b 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -27,6 +27,7 @@
#include "access/transam.h"
#include "access/xact.h"
+#include "storage/bufmgr.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/memutils.h"
@@ -433,7 +434,11 @@ static void
SnapshotResetXmin(void)
{
if (RegisteredSnapshots == 0 && ActiveSnapshot == NULL)
+ {
MyProc->xmin = InvalidTransactionId;
+ if (IsRecoveryProcessingMode())
+ SetBufferRecoveryConflictLSN(InvalidXLogRecPtr);
+ }
}
/*
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index dbfbb023ae..aa60d8c116 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -86,7 +86,7 @@ static inline void
SetHintBits(HeapTupleHeader tuple, Buffer buffer,
uint16 infomask, TransactionId xid)
{
- if (TransactionIdIsValid(xid))
+ if (!IsRecoveryProcessingMode() && TransactionIdIsValid(xid))
{
/* NB: xid must be known committed here! */
XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid);
@@ -1238,26 +1238,52 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
return true;
/*
- * If the snapshot contains full subxact data, the fastest way to check
- * things is just to compare the given XID against both subxact XIDs and
- * top-level XIDs. If the snapshot overflowed, we have to use pg_subtrans
- * to convert a subxact XID to its parent XID, but then we need only look
- * at top-level XIDs not subxacts.
+ * Our strategy for checking xids changed in 8.4. Prior to 8.4
+ * we either checked the subxid cache on the snapshot or we
+ * checked subtrans. That was much more efficient than just using
+ * subtrans but it has some problems. First, as soon as *any*
+ * transaction had more than 64 transactions we forced *all*
+ * snapshots to check against subtrans, giving a sharp modal
+ * change in behaviour. Second because we either checked subtrans
+ * or the snapshot, we were forced to place entries in subtrans
+ * in case the snapshot later overflowed, even if we never
+ * actually checked subtrans.
+ *
+ * In 8.4 we improve on that scheme in a number of ways. As before
+ * we check subtrans if the snapshot has overflowed. We *also*
+ * check the subxid cache. This has two benefits: first the
+ * behaviour degrades gracefully when the cache overflows, so we
+ * retain much of its benefit if it has only just overflowed.
+ * Second, a transaction doesn't need to insert entries into
+ * subtrans until its own personal subxid cache overflows. This
+ * means entries into subtrans become significantly rarer,
+ * perhaps less than 1% of the previous insert rate, giving
+ * considerable benefit for transactions using only a few
+ * subtransactions.
+ *
+ * This behaviour is also necessary for allowing snapshots to work
+ * correctly on a standby server. By this subtle change of behaviour
+ * we can now utilise the subxid cache to store "unobserved xids",
+ * whose existence we infer from watching the arrival sequence
+ * of newly observed transaction ids in the WAL.
*/
- if (snapshot->subxcnt >= 0)
- {
- /* full data, so search subxip */
- int32 j;
- for (j = 0; j < snapshot->subxcnt; j++)
- {
- if (TransactionIdEquals(xid, snapshot->subxip[j]))
+ /*
+ * First, compare the given XID against cached subxact XIDs.
+ */
+ for (i = 0; i < snapshot->subxcnt; i++)
+ {
+ if (TransactionIdEquals(xid, snapshot->subxip[i]))
return true;
}
- /* not there, fall through to search xip[] */
- }
- else
+ /*
+ * If the snapshot overflowed and we haven't already located the xid,
+ * we also have to consult pg_subtrans. We use subtrans to convert a
+ * subxact XID to its parent XID, so that we can then check the status
+ * of the top-level TransactionId.
+ */
+ if (snapshot->suboverflowed)
{
/* overflowed, so convert xid to top-level */
xid = SubTransGetTopmostTransaction(xid);
@@ -1270,6 +1296,10 @@ XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot)
return false;
}
+ /*
+ * By now xid is either not present, or a top-level xid, so we
+ * just need to check the main transaction ids.
+ */
for (i = 0; i < snapshot->xcnt; i++)
{
if (TransactionIdEquals(xid, snapshot->xip[i]))
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index a5d9769794..e3f94edcea 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -130,11 +130,13 @@ extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
ItemPointerData from,
Buffer newbuf, HeapTuple newtup);
+extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode,
+ TransactionId latestRemovedXid);
extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
OffsetNumber *redirected, int nredirected,
OffsetNumber *nowdead, int ndead,
OffsetNumber *nowunused, int nunused,
- bool redirect_move);
+ TransactionId latestRemovedXid, bool redirect_move);
extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
TransactionId cutoff_xid,
OffsetNumber *offsets, int offcnt);
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index 54264bdca4..96fb89d088 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -580,6 +580,7 @@ typedef HeapTupleData *HeapTuple;
#define XLOG_HEAP2_FREEZE 0x00
#define XLOG_HEAP2_CLEAN 0x10
#define XLOG_HEAP2_CLEAN_MOVE 0x20
+#define XLOG_HEAP2_CLEANUP_INFO 0x30
/*
* All what we need to find changed tuple
@@ -668,6 +669,7 @@ typedef struct xl_heap_clean
{
RelFileNode node;
BlockNumber block;
+ TransactionId latestRemovedXid;
uint16 nredirected;
uint16 ndead;
/* OFFSET NUMBERS FOLLOW */
@@ -675,6 +677,19 @@ typedef struct xl_heap_clean
#define SizeOfHeapClean (offsetof(xl_heap_clean, ndead) + sizeof(uint16))
+/*
+ * Cleanup_info is required in some cases during a lazy VACUUM.
+ * It is used for reporting the results of
+ * HeapTupleHeaderAdvanceLatestRemovedXid(); see vacuumlazy.c for a full
+ * explanation.
+ */
+typedef struct xl_heap_cleanup_info
+{
+ RelFileNode node;
+ TransactionId latestRemovedXid;
+} xl_heap_cleanup_info;
+
+#define SizeOfHeapCleanupInfo (sizeof(xl_heap_cleanup_info))
+
/* This is for replacing a page's contents in toto */
/* NB: this is used for indexes as well as heaps */
typedef struct xl_heap_newpage
@@ -718,6 +733,9 @@ typedef struct xl_heap_freeze
#define SizeOfHeapFreeze (offsetof(xl_heap_freeze, cutoff_xid) + sizeof(TransactionId))
+extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple,
+ TransactionId *latestRemovedXid);
+
/* HeapTupleHeader functions implemented in utils/time/combocid.c */
extern CommandId HeapTupleHeaderGetCmin(HeapTupleHeader tup);
extern CommandId HeapTupleHeaderGetCmax(HeapTupleHeader tup);
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 2df34f54ee..8028fce356 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -214,12 +214,13 @@ typedef struct BTMetaPageData
#define XLOG_BTREE_SPLIT_R 0x40 /* as above, new item on right */
#define XLOG_BTREE_SPLIT_L_ROOT 0x50 /* add tuple with split of root */
#define XLOG_BTREE_SPLIT_R_ROOT 0x60 /* as above, new item on right */
-#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuple */
+#define XLOG_BTREE_DELETE 0x70 /* delete leaf index tuples for a page */
#define XLOG_BTREE_DELETE_PAGE 0x80 /* delete an entire page */
#define XLOG_BTREE_DELETE_PAGE_META 0x90 /* same, and update metapage */
#define XLOG_BTREE_NEWROOT 0xA0 /* new root page */
#define XLOG_BTREE_DELETE_PAGE_HALF 0xB0 /* page deletion that makes
* parent half-dead */
+#define XLOG_BTREE_VACUUM 0xC0 /* delete entries on a page during vacuum */
/*
* All that we need to find changed index tuple
@@ -306,16 +307,53 @@ typedef struct xl_btree_split
/*
* This is what we need to know about delete of individual leaf index tuples.
* The WAL record can represent deletion of any number of index tuples on a
- * single index page.
+ * single index page when *not* executed by VACUUM.
*/
typedef struct xl_btree_delete
{
RelFileNode node;
BlockNumber block;
+ TransactionId latestRemovedXid;
+ int numItems; /* number of items in the offset array */
+
/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
} xl_btree_delete;
-#define SizeOfBtreeDelete (offsetof(xl_btree_delete, block) + sizeof(BlockNumber))
+#define SizeOfBtreeDelete (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
+
+/*
+ * This is what we need to know about vacuum of individual leaf index tuples.
+ * The WAL record can represent deletion of any number of index tuples on a
+ * single index page when executed by VACUUM.
+ *
+ * The correctness requirement for applying these changes during recovery is
+ * that we must do one of these two things for every block in the index:
+ * * lock the block for cleanup and apply any required changes
+ * * EnsureBlockUnpinned()
+ * The purpose of this is to ensure that no index scans started before we
+ * finish scanning the index are still running by the time we begin to remove
+ * heap tuples.
+ *
+ * Any changes to any one block are registered on just one WAL record. All
+ * blocks on which we need to run EnsureBlockUnpinned() before we touch the
+ * changed block are also given on this record as a variable-length array.
+ * The array is compressed by storing an array of block ranges, rather than
+ * an actual array of block ids.
+ *
+ * Note that the *last* WAL record in any vacuum of an index is allowed to
+ * have numItems == 0. All other WAL records must have numItems > 0.
+ */
+typedef struct xl_btree_vacuum
+{
+ RelFileNode node;
+ BlockNumber block;
+ BlockNumber lastBlockVacuumed;
+ int numItems; /* number of items in the offset array */
+
+ /* TARGET OFFSET NUMBERS FOLLOW */
+} xl_btree_vacuum;
+
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
/*
* This is what we need to know about deletion of a btree page. The target
@@ -498,6 +536,10 @@ typedef BTScanOpaqueData *BTScanOpaque;
#define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT)
#define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT)
+/* XXX probably needs new RMgr call to do this cleanly */
+extern bool btree_is_cleanup_record(uint8 info);
+extern bool btree_needs_cleanup_lock(uint8 info);
+
/*
* prototypes for functions in nbtree.c (external entry points for btree)
*/
@@ -537,7 +579,8 @@ extern void _bt_relbuf(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems(Relation rel, Buffer buf,
- OffsetNumber *itemnos, int nitems);
+ OffsetNumber *itemnos, int nitems, bool isVacuum,
+ BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf,
BTStack stack, bool vacuum_full);
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 47b95c27ac..55cb8d300b 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -68,6 +68,7 @@ typedef struct IndexScanDescData
/* signaling to index AM about killing index tuples */
bool kill_prior_tuple; /* last-returned tuple is dead */
bool ignore_killed_tuples; /* do not return killed entries */
+ bool xactStartedInRecovery; /* prevents killing/seeing killed tuples */
/* index access method's private state */
void *opaque; /* access-method-specific info */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index 5702f5f4d0..8ab1148a4c 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -23,6 +23,7 @@ typedef uint8 RmgrId;
#define RM_DBASE_ID 4
#define RM_TBLSPC_ID 5
#define RM_MULTIXACT_ID 6
+#define RM_RELATION_ID 8
#define RM_HEAP2_ID 9
#define RM_HEAP_ID 10
#define RM_BTREE_ID 11
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
index 6ff25fc5cf..6a196210d0 100644
--- a/src/include/access/subtrans.h
+++ b/src/include/access/subtrans.h
@@ -11,6 +11,9 @@
#ifndef SUBTRANS_H
#define SUBTRANS_H
+/* included solely to allow recovery code to access InRecovery state */
+#include "access/xlog.h"
+
/* Number of SLRU buffers to use for subtrans */
#define NUM_SUBTRANS_BUFFERS 32
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 2b796b699e..b625e3e205 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -129,6 +129,9 @@ typedef VariableCacheData *VariableCache;
* ----------------
*/
+/* in transam/xact.c */
+extern bool TransactionStartedDuringRecovery(void);
+
/* in transam/varsup.c */
extern VariableCache ShmemVariableCache;
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index f255d88c10..519e6672cf 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -17,6 +17,7 @@
#include "access/xlog.h"
#include "nodes/pg_list.h"
#include "storage/relfilenode.h"
+#include "utils/snapshot.h"
#include "utils/timestamp.h"
@@ -84,18 +85,58 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
#define XLOG_XACT_ABORT 0x20
#define XLOG_XACT_COMMIT_PREPARED 0x30
#define XLOG_XACT_ABORT_PREPARED 0x40
+#define XLOG_XACT_ASSIGNMENT 0x50
+#define XLOG_XACT_RUNNING_XACTS 0x60
+/* 0x70 can also be used, if required */
+
+typedef struct xl_xact_assignment
+{
+ TransactionId xassign; /* assigned xid */
+ TransactionId xtop; /* assigned xids top-level xid, if any */
+} xl_xact_assignment;
+
+/*
+ * xl_xact_running_xacts lives in utils/snapshot.h (not snapmgr.h) so it
+ * can be passed around to the same places as snapshots.
+ */
typedef struct xl_xact_commit
{
- TimestampTz xact_time; /* time of commit */
- int nrels; /* number of RelFileNodes */
- int nsubxacts; /* number of subtransaction XIDs */
- /* Array of RelFileNode(s) to drop at commit */
- RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
- /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
+ TimestampTz xact_time; /* time of commit */
+ uint32 xinfo; /* info flags */
+ int nrels; /* number of RelFileNodes */
+ int nsubxacts; /* number of subtransaction XIDs */
+ int nmsgs; /* number of shared inval msgs */
+ /* Array of RelFileNode(s) to drop at commit */
+ RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
+ /* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
+ /* ARRAY OF SHARED INVALIDATION MESSAGES FOLLOWS */
} xl_xact_commit;
#define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
+#define OffsetSharedInvalInXactCommit() \
+( \
+ MinSizeOfXactCommit + \
+ (xlrec->nsubxacts * sizeof(TransactionId)) + \
+ (xlrec->nrels * sizeof(RelFileNode)) \
+)
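+
+/*
+ * Note that this macro references a variable named xlrec (of type
+ * xl_xact_commit *) which must exist in the caller's scope.
+ */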
+
+/*
+ * These flags are set in the xinfo fields of WAL commit records,
+ * indicating a variety of additional actions that need to occur
+ * when emulating transaction effects during recovery.
+ * They are named XactCompletion... to differentiate them from
+ * EOXact... routines which run at the end of the original
+ * transaction completion.
+ */
+#define XACT_COMPLETION_UPDATE_DB_FILE 0x01
+#define XACT_COMPLETION_UPDATE_AUTH_FILE 0x02
+#define XACT_COMPLETION_UPDATE_RELCACHE_FILE 0x04
+
+/* Access macros for above flags */
+#define XactCompletionUpdateDBFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_DB_FILE)
+#define XactCompletionUpdateAuthFile(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_AUTH_FILE)
+#define XactCompletionRelcacheInitFileInval(xlrec) ((xlrec)->xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE)
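+
+/*
+ * An illustrative sketch of redo-side use (an assumption for illustration;
+ * the actual tests live in xact_redo_commit()):
+ *
+ *     if (XactCompletionUpdateDBFile(xlrec) ||
+ *         XactCompletionUpdateAuthFile(xlrec))
+ *         rebuild the affected flat file, cf. BuildFlatFiles()
+ */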
typedef struct xl_xact_abort
{
@@ -106,6 +147,7 @@ typedef struct xl_xact_abort
RelFileNode xnodes[1]; /* VARIABLE LENGTH ARRAY */
/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
} xl_xact_abort;
+/* Note the intentional lack of an invalidation message array, cf. commit */
#define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
@@ -185,6 +227,14 @@ extern TransactionId RecordTransactionCommit(void);
extern int xactGetCommittedChildren(TransactionId **ptr);
+extern void LogCurrentRunningXacts(void);
+extern bool IsRunningXactDataValid(void);
+extern void SetRunningXactData(bool mode);
+
+extern void InitRecoveryTransactionEnvironment(void);
+extern bool RecordKnownAssignedTransactionIds(XLogRecPtr lsn, TransactionId top_xid, TransactionId child_xid);
+extern bool LatestRemovedXidAdvances(TransactionId latestXid);
+
extern void xact_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index cf787c8df6..cc8c0b46dd 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -18,7 +18,9 @@
#include "utils/pg_crc.h"
#include "utils/timestamp.h"
-
+/* Handy constant for an invalid xlog recptr */
+static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
+
+/* Valid means "not equal to InvalidXLogRecPtr", i.e. fields not both zero */
+#define XLogRecPtrIsValid(xp) ((xp).xlogid != 0 || (xp).xrecoff != 0)
/*
* The overall layout of an XLOG record is:
* Fixed-size header (XLogRecord struct)
@@ -46,10 +48,11 @@ typedef struct XLogRecord
TransactionId xl_xid; /* xact id */
uint32 xl_tot_len; /* total len of entire record */
uint32 xl_len; /* total len of rmgr data */
- uint8 xl_info; /* flag bits, see below */
+ uint8 xl_info; /* flag bits, see below (XLR_ entries) */
RmgrId xl_rmid; /* resource manager for this record */
+ TransactionId xl_topxid; /* top-level xid == xl_xid if top-level */
- /* Depending on MAXALIGN, there are either 2 or 6 wasted bytes here */
+ /* Above structure has 2 spare bytes under both 4- and 8-byte alignment */
/* ACTUAL LOG DATA FOLLOWS AT END OF STRUCT */
@@ -133,7 +136,6 @@ typedef struct XLogRecData
} XLogRecData;
extern TimeLineID ThisTimeLineID; /* current TLI */
-
/*
* Prior to 8.4, all activity during recovery were carried out by Startup
* process. This local variable continues to be used in many parts of the
@@ -142,7 +144,7 @@ extern TimeLineID ThisTimeLineID; /* current TLI */
* IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c
*/
extern bool InRecovery;
-
+extern bool InArchiveRecovery;
extern XLogRecPtr XactLastRecEnd;
/* these variables are GUC parameters related to XLOG */
@@ -152,6 +154,7 @@ extern bool XLogArchiveMode;
extern char *XLogArchiveCommand;
extern int XLogArchiveTimeout;
extern bool log_checkpoints;
+extern int maxStandbyDelay;
#define XLogArchivingActive() (XLogArchiveMode)
#define XLogArchiveCommandSet() (XLogArchiveCommand[0] != '\0')
@@ -210,6 +213,9 @@ extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
extern bool IsRecoveryProcessingMode(void);
+extern int GetLatestReplicationDelay(void);
+
+extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
extern void UpdateControlFile(void);
extern Size XLOGShmemSize(void);
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 4830a5ce74..5daac3ad08 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -72,7 +72,7 @@ typedef struct XLogContRecord
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD063 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0x5352 /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
@@ -259,5 +259,17 @@ extern Datum pg_current_xlog_location(PG_FUNCTION_ARGS);
extern Datum pg_current_xlog_insert_location(PG_FUNCTION_ARGS);
extern Datum pg_xlogfile_name_offset(PG_FUNCTION_ARGS);
extern Datum pg_xlogfile_name(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_continue(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_pause(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_pause_cleanup(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_pause_xid(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_pause_time(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_advance(PG_FUNCTION_ARGS);
+extern Datum pg_recovery_stop(PG_FUNCTION_ARGS);
+extern Datum pg_current_recovery_target(PG_FUNCTION_ARGS);
+extern Datum pg_is_in_recovery(PG_FUNCTION_ARGS);
+extern Datum pg_last_recovered_xact_timestamp(PG_FUNCTION_ARGS);
+extern Datum pg_last_recovered_xid(PG_FUNCTION_ARGS);
+extern Datum pg_last_recovered_xlog_location(PG_FUNCTION_ARGS);
#endif /* XLOG_INTERNAL_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e69c8ec553..515df399b8 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -22,6 +22,7 @@
/* Version identifier for this pg_control format */
#define PG_CONTROL_VERSION 847
/* XXXHS: change PG_CONTROL_VERSION */
/*
* Body of CheckPoint XLOG records. This is declared here because we keep
@@ -46,7 +47,12 @@ typedef struct CheckPoint
#define XLOG_NOOP 0x20
#define XLOG_NEXTOID 0x30
#define XLOG_SWITCH 0x40
-#define XLOG_RECOVERY_END 0x50
+/*
+ * Prior to 8.4 we wrote a shutdown checkpoint when recovery completed.
+ * Now we write an XLOG_RECOVERY_END record, which helps differentiate
+ * between a checkpoint-at-shutdown and the startup case.
+ */
+#define XLOG_RECOVERY_END 0x50
/* System status indicator */
typedef enum DBState
@@ -101,6 +107,10 @@ typedef struct ControlFileData
CheckPoint checkPointCopy; /* copy of last check point record */
+ /*
+ * The next two fields sound very similar, yet they are distinct and both
+ * necessary. See the comments in xlog.c for a full explanation that is
+ * not easily repeated here.
+ */
XLogRecPtr minRecoveryPoint; /* must replay xlog to here */
XLogRecPtr minSafeStartPoint; /* safe point after recovery crashes */
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index 4c3aa956d6..0b79d3b666 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -3230,6 +3230,31 @@ DESCR("xlog filename and byte offset, given an xlog location");
DATA(insert OID = 2851 ( pg_xlogfile_name PGNSP PGUID 12 1 0 0 f f f t f i 1 0 25 "25" _null_ _null_ _null_ _null_ pg_xlogfile_name _null_ _null_ _null_ ));
DESCR("xlog filename, given an xlog location");
+DATA(insert OID = 3801 ( pg_recovery_continue PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_continue _null_ _null_ _null_ ));
+DESCR("if recovery is paused, continue with recovery");
+DATA(insert OID = 3802 ( pg_recovery_pause PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_pause _null_ _null_ _null_ ));
+DESCR("pause recovery until recovery target reset");
+
+DATA(insert OID = 3804 ( pg_recovery_pause_xid PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_pause_xid _null_ _null_ _null_ ));
+DESCR("continue recovery until specified xid completes, if ever seen, then pause recovery");
+DATA(insert OID = 3805 ( pg_recovery_pause_time PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "1184" _null_ _null_ _null_ _null_ pg_recovery_pause_time _null_ _null_ _null_ ));
+DESCR("continue recovery until a transaction with specified timestamp completes, if ever seen, then pause recovery");
+DATA(insert OID = 3806 ( pg_recovery_advance PGNSP PGUID 12 1 0 0 f f f t f v 1 0 2278 "23" _null_ _null_ _null_ _null_ pg_recovery_advance _null_ _null_ _null_ ));
+DESCR("continue recovery exactly specified number of records, then pause recovery");
+DATA(insert OID = 3807 ( pg_recovery_stop PGNSP PGUID 12 1 0 0 f f f t f v 0 0 2278 "" _null_ _null_ _null_ _null_ pg_recovery_stop _null_ _null_ _null_ ));
+DESCR("stop recovery immediately");
+DATA(insert OID = 3808 ( pg_current_recovery_target PGNSP PGUID 12 1 0 0 f f f t f v 0 0 25 "" _null_ _null_ _null_ _null_ pg_current_recovery_target _null_ _null_ _null_ ));
+DESCR("get current recovery target state and target values, if any");
+
+DATA(insert OID = 3810 ( pg_is_in_recovery PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_is_in_recovery _null_ _null_ _null_ ));
+DESCR("true if server is in recovery");
+DATA(insert OID = 3811 ( pg_last_recovered_xact_timestamp PGNSP PGUID 12 1 0 0 f f f t f v 0 0 1184 "" _null_ _null_ _null_ _null_ pg_last_recovered_xact_timestamp _null_ _null_ _null_ ));
+DESCR("timestamp of last commit or abort xlog record that arrived during recovery, if any");
+DATA(insert OID = 3812 ( pg_last_recovered_xid PGNSP PGUID 12 1 0 0 f f f t f v 0 0 28 "" _null_ _null_ _null_ _null_ pg_last_recovered_xid _null_ _null_ _null_ ));
+DESCR("xid of last commit or abort xlog record that arrived during recovery, if any");
+DATA(insert OID = 3813 ( pg_last_recovered_xlog_location PGNSP PGUID 12 1 0 0 f f f t f v 0 0 25 "" _null_ _null_ _null_ _null_ pg_last_recovered_xlog_location _null_ _null_ _null_ ));
+DESCR("xlog location of last xlog record that arrived during recovery, if any");
+
DATA(insert OID = 2621 ( pg_reload_conf PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_reload_conf _null_ _null_ _null_ ));
DESCR("reload configuration files");
DATA(insert OID = 2622 ( pg_rotate_logfile PGNSP PGUID 12 1 0 0 f f f t f v 0 0 16 "" _null_ _null_ _null_ _null_ pg_rotate_logfile _null_ _null_ _null_ ));
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 465261a284..de849cad46 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -235,6 +235,12 @@ extern bool VacuumCostActive;
/* in tcop/postgres.c */
extern void check_stack_depth(void);
+/* in tcop/utility.c */
+extern void PreventCommandDuringRecovery(void);
+
+/* in utils/misc/guc.c */
+extern int trace_recovery_messages;
+extern int trace_recovery(int trace_level);
/*****************************************************************************
* pdir.h -- *
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index d4b389e927..95c1695b37 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -29,7 +29,7 @@ extern void RequestCheckpoint(int flags);
extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
extern void RequestRestartPointCompletion(void);
extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void);
-extern void SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
+extern bool SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
extern void CheckpointWriteDelay(int flags, double progress);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 42766e99f7..39ab4e8449 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -68,6 +68,9 @@ extern PGDLLIMPORT int32 *LocalRefCount;
#define BUFFER_LOCK_SHARE 1
#define BUFFER_LOCK_EXCLUSIVE 2
+/* Not used by LockBuffer, but is used by XLogReadBuffer... */
+#define BUFFER_LOCK_CLEANUP 3
+
/*
* These routines are beaten on quite heavily, hence the macroization.
*/
@@ -169,6 +172,8 @@ extern void IncrBufferRefCount(Buffer buffer);
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
BlockNumber blockNum);
+extern bool SetBufferRecoveryConflictLSN(XLogRecPtr conflict_LSN);
+
extern void InitBufferPool(void);
extern void InitBufferPoolAccess(void);
extern void InitBufferPoolBackend(void);
@@ -200,6 +205,10 @@ extern bool ConditionalLockBuffer(Buffer buffer);
extern void LockBufferForCleanup(Buffer buffer);
extern bool ConditionalLockBufferForCleanup(Buffer buffer);
+extern void StartCleanupDelayStats(void);
+extern void EndCleanupDelayStats(void);
+extern void ReportCleanupDelayStats(void);
+
extern void AbortBufferIO(void);
extern void BufmgrCommit(void);
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index a323db8941..054de7ce4b 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -96,6 +96,8 @@ extern bool LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode);
extern void LWLockRelease(LWLockId lockid);
extern void LWLockReleaseAll(void);
extern bool LWLockHeldByMe(LWLockId lockid);
+extern void PrintLWLocksHeldByMe(void);
+extern int NumLWLocksHeldByMe(void);
extern int NumLWLocks(void);
extern Size LWLockShmemSize(void);
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 7509332d59..6469dece1b 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -14,6 +14,7 @@
#ifndef _PROC_H_
#define _PROC_H_
+#include "access/xlog.h"
#include "storage/lock.h"
#include "storage/pg_sema.h"
@@ -93,6 +94,20 @@ struct PGPROC
uint8 vacuumFlags; /* vacuum-related flags, see above */
+ /*
+ * The LSN field exists to allow procs to be used during recovery
+ * for managing snapshot data for standby servers. The LSN allows
+ * us to disambiguate incoming information, so that we always respect
+ * the latest info.
+ */
+ XLogRecPtr lsn; /* Last LSN which maintained state of Recovery Proc */
+
+ /*
+ * Recovery processing fields
+ */
+ XLogRecPtr recoveryConflictLSN;
+ bool recoveryConflictCancelMode;
+
/* Info about LWLock the process is currently waiting for, if any. */
bool lwWaiting; /* true if waiting for an LW lock */
bool lwExclusive; /* true if waiting for exclusive access */
@@ -131,8 +146,13 @@ typedef struct PROC_HDR
PGPROC *freeProcs;
/* Head of list of autovacuum's free PGPROC structures */
PGPROC *autovacFreeProcs;
+ /* Head of list of free recovery PGPROC structures */
+ PGPROC *freeRecoveryProcs;
/* Current shared estimate of appropriate spins_per_delay value */
int spins_per_delay;
+ /* The proc of the Startup process, since not in ProcArray */
+ PGPROC *startupProc;
+ int startupProcPid;
} PROC_HDR;
/*
@@ -157,8 +177,15 @@ extern int ProcGlobalSemas(void);
extern Size ProcGlobalShmemSize(void);
extern void InitProcGlobal(void);
extern void InitProcess(void);
+extern PGPROC *InitRecoveryProcess(TransactionId xid);
+extern void FreeRecoveryProcess(PGPROC *proc);
extern void InitProcessPhase2(void);
extern void InitAuxiliaryProcess(void);
+
+extern void PublishStartupProcessInformation(void);
+extern void ProcSetRecoveryConflict(PGPROC *proc, XLogRecPtr conflict_LSN, int cancel_mode);
+extern XLogRecPtr ProcGetRecoveryConflict(int *cancel_mode);
+
extern bool HaveNFreeProcs(int n);
extern void ProcReleaseLocks(bool isCommit);
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index 065a9b9ac8..200dcb4efb 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -14,18 +14,29 @@
#ifndef PROCARRAY_H
#define PROCARRAY_H
+#include "access/xact.h"
#include "storage/lock.h"
#include "utils/snapshot.h"
extern Size ProcArrayShmemSize(void);
extern void CreateSharedProcArray(void);
-extern void ProcArrayAdd(PGPROC *proc);
-extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid);
+extern void ProcArrayAdd(PGPROC *proc, bool need_lock);
+extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid,
+ int nsubxids, TransactionId *subxids);
+
+extern void ProcArrayInitRecoveryEnvironment(void);
+extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
-extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid);
extern void ProcArrayClearTransaction(PGPROC *proc);
+extern void ProcArrayClearRecoveryTransactions(void);
+extern bool XidInRecoveryProcs(TransactionId xid);
+extern void ProcArrayDisplay(int trace_level);
+extern void ProcArrayUpdateRecoveryTransactions(XLogRecPtr lsn,
+ xl_xact_running_xacts *xlrec);
+extern PGPROC *CreateRecoveryProcessForTransactionId(TransactionId xid);
+extern RunningTransactions GetRunningTransactionData(void);
extern Snapshot GetSnapshotData(Snapshot snapshot);
extern bool TransactionIdIsInProgress(TransactionId xid);
@@ -36,11 +47,16 @@ extern int GetTransactionsInCommit(TransactionId **xids_p);
extern bool HaveTransactionsInCommit(TransactionId *xids, int nxids);
extern PGPROC *BackendPidGetProc(int pid);
+extern PGPROC *BackendXidGetProc(TransactionId xid);
extern int BackendXidGetPid(TransactionId xid);
extern bool IsBackendPid(int pid);
-extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
- bool allDbs, int excludeVacuum);
+extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin,
+ Oid dbOid, int excludeVacuum);
+extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin,
+ Oid dbOid, Oid roleId);
+extern PGPROC *VirtualTransactionIdGetProc(VirtualTransactionId vxid);
+
extern int CountActiveBackends(void);
extern int CountDBBackends(Oid databaseid);
extern int CountUserBackends(Oid roleid);
@@ -51,4 +67,14 @@ extern void XidCacheRemoveRunningXids(TransactionId xid,
int nxids, const TransactionId *xids,
TransactionId latestXid);
+/* Primitives for UnobservedXids array handling for standby */
+extern void UnobservedTransactionsAddXids(TransactionId firstXid,
+ TransactionId lastXid);
+extern void UnobservedTransactionsRemoveXid(TransactionId xid,
+ bool missing_is_error);
+extern void UnobservedTransactionsPruneXids(TransactionId limitXid);
+extern void UnobservedTransactionsClearXids(void);
+extern void UnobservedTransactionsDisplay(int trace_level);
+extern bool XidInUnobservedTransactions(TransactionId xid);
+
#endif /* PROCARRAY_H */
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 4a07e0cfc1..5bf1be1e89 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -89,6 +89,44 @@ extern void ReceiveSharedInvalidMessages(
void (*invalFunction) (SharedInvalidationMessage *msg),
void (*resetFunction) (void));
+extern int xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
+ bool *RelcacheInitFileInval);
+
+/*
+ * Relation Rmgr (RM_RELATION_ID)
+ *
+ * Relation recovery manager exists to allow locks and certain kinds of
+ * invalidation message to be passed across to a standby server.
+ */
+extern void RelationReleaseRecoveryLockTree(TransactionId xid,
+ int nsubxids, TransactionId *subxids);
+extern void RelationClearRecoveryLocks(void);
+
+/* Recovery handlers for the Relation Rmgr (RM_RELATION_ID) */
+extern void relation_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void relation_desc(StringInfo buf, uint8 xl_info, char *rec);
+
+/*
+ * XLOG message types
+ */
+#define XLOG_RELATION_INVAL 0x00
+#define XLOG_RELATION_LOCK 0x10
+
+typedef struct xl_rel_inval
+{
+ int nmsgs; /* number of shared inval msgs */
+ SharedInvalidationMessage msgs[1]; /* VARIABLE LENGTH ARRAY */
+} xl_rel_inval;
+
+#define MinSizeOfRelationInval offsetof(xl_rel_inval, msgs)
+
+typedef struct xl_rel_lock
+{
+ TransactionId xid; /* xid of holder of AccessExclusiveLock */
+ Oid dbOid;
+ Oid relOid;
+} xl_rel_lock;
+
/* signal handler for catchup events (SIGUSR1) */
extern void CatchupInterruptHandler(SIGNAL_ARGS);
diff --git a/src/include/storage/sinvaladt.h b/src/include/storage/sinvaladt.h
index 3c4c030416..c612ee7c15 100644
--- a/src/include/storage/sinvaladt.h
+++ b/src/include/storage/sinvaladt.h
@@ -29,7 +29,7 @@
*/
extern Size SInvalShmemSize(void);
extern void CreateSharedInvalidationState(void);
-extern void SharedInvalBackendInit(void);
+extern void SharedInvalBackendInit(bool sendOnly);
extern bool BackendIdIsActive(int backendID);
extern void SIInsertDataEntries(const SharedInvalidationMessage *data, int n);
diff --git a/src/include/utils/flatfiles.h b/src/include/utils/flatfiles.h
index 36f47b87a5..f9569a224a 100644
--- a/src/include/utils/flatfiles.h
+++ b/src/include/utils/flatfiles.h
@@ -27,6 +27,13 @@ extern void AtEOSubXact_UpdateFlatFiles(bool isCommit,
SubTransactionId mySubid,
SubTransactionId parentSubid);
+/*
+ * Called by RecordTransactionCommit to allow it to set xinfo flags
+ * on the commit record. Used for standby invalidation of flat files.
+ */
+extern bool AtEOXact_Database_FlatFile_Update_Needed(void);
+extern bool AtEOXact_Auth_FlatFile_Update_Needed(void);
+
extern Datum flatfile_update_trigger(PG_FUNCTION_ARGS);
extern void flatfile_twophase_postcommit(TransactionId xid, uint16 info,
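
A minimal sketch of how RecordTransactionCommit might consume the two
tests above when building a commit record; the xinfo flag names here are
assumptions for illustration, not the patch's actual identifiers:

	uint32		xinfo = 0;

	if (AtEOXact_Database_FlatFile_Update_Needed())
		xinfo |= XACT_COMPLETION_UPDATE_DB_FILE;	/* assumed name */
	if (AtEOXact_Auth_FlatFile_Update_Needed())
		xinfo |= XACT_COMPLETION_UPDATE_AUTH_FILE;	/* assumed name */
	/* xinfo then travels in the xl_xact_commit record to the standby */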
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index 42fd8ba118..9c6eca47a1 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -15,6 +15,8 @@
#define INVAL_H
#include "access/htup.h"
+#include "access/xact.h"
+#include "storage/lock.h"
#include "utils/relcache.h"
@@ -60,4 +62,8 @@ extern void CacheRegisterRelcacheCallback(RelcacheCallbackFunction func,
extern void inval_twophase_postcommit(TransactionId xid, uint16 info,
void *recdata, uint32 len);
+extern void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ char *reason, int cancel_mode,
+ XLogRecPtr conflict_LSN);
+
#endif /* INVAL_H */
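
A minimal sketch of the expected call pattern during redo, pairing
GetConflictingVirtualXIDs (declared in procarray.h above) with
ResolveRecoveryConflictWithVirtualXIDs; limitXmin, dbOid, and lsn are
assumed to come from the WAL record being replayed, and the cancel_mode
constant name is an assumption:

	VirtualTransactionId *waitlist;

	waitlist = GetConflictingVirtualXIDs(limitXmin, dbOid, InvalidOid);
	ResolveRecoveryConflictWithVirtualXIDs(waitlist, "relation inval",
										   CONFLICT_MODE_ERROR, /* assumed */
										   lsn);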
diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h
index 0af1f6f17a..c57fa89929 100644
--- a/src/include/utils/snapshot.h
+++ b/src/include/utils/snapshot.h
@@ -49,7 +49,17 @@ typedef struct SnapshotData
uint32 xcnt; /* # of xact ids in xip[] */
TransactionId *xip; /* array of xact IDs in progress */
/* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */
- int32 subxcnt; /* # of xact ids in subxip[], -1 if overflow */
+
+ /*
+ * Prior to 8.4 we represented an overflowed subxid cache with
+ * subxcnt = -1. In 8.4+ we keep the two concepts separate: when
+ * checking the xids in a snapshot we check *both* the subxid cache and
+ * subtrans if the subxid cache has overflowed, so we still need the
+ * count even after an overflow. This allows unobserved xids to be
+ * placed into the snapshot even when it has overflowed, and is also a
+ * performance gain.
+ */
+ uint32 subxcnt; /* # of xact ids in subxip[] */
+ bool suboverflowed; /* true means at least one subxid cache overflowed */
TransactionId *subxip; /* array of subxact IDs in progress */
/*
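
With the split representation, a snapshot xid test can trust the subxip
cache first and consult subtrans only when some backend's subxid cache
overflowed. A rough sketch of that check, not the patch's actual code:

	int		i;

	for (i = 0; i < snapshot->subxcnt; i++)
	{
		if (TransactionIdEquals(xid, snapshot->subxip[i]))
			return true;		/* xid is still in progress */
	}
	if (snapshot->suboverflowed)
		xid = SubTransGetTopmostTransaction(xid);	/* then test xip[] */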
@@ -63,6 +73,73 @@ typedef struct SnapshotData
} SnapshotData;
/*
+ * Declarations for GetRunningTransactionData(). Similar to snapshots,
+ * but not quite the same: this has nothing at all to do with visibility
+ * on this server, so it is completely separate from snapmgr.c and
+ * snapmgr.h. This data is important for creating the initial snapshot
+ * state on a standby server. We need much more information than a
+ * normal snapshot provides, hence we use a specific data structure for
+ * our needs. This data is written to WAL as a separate record
+ * immediately after each checkpoint, so wherever we start a standby
+ * from, we will almost immediately see the data we need to begin
+ * executing queries.
+ */
+typedef struct RunningXact
+{
+ /* Items matching PGPROC entries */
+ TransactionId xid; /* xact ID in progress */
+
+ /* Items matching XidCache */
+ bool overflowed;
+ int nsubxids; /* # of subxact ids for this xact only */
+
+ /* Additional info */
+ uint32 subx_offset; /* offset into subxip[] of this xact's
+ * first subxid; zero if nsubxids == 0
+ */
+} RunningXact;
+
+typedef struct RunningXactsData
+{
+ uint32 xcnt; /* # of xact ids in xrun[] */
+ uint32 subxcnt; /* # of xact ids in subxip[] */
+ TransactionId latestRunningXid; /* Initial setting of LatestObservedXid */
+ TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid;
+
+ RunningXact *xrun; /* array of RunningXact structs */
+
+ /*
+ * subxip is held as a single contiguous array, so no space is wasted
+ * and it fits more easily into one XLogRecord. We keep track of which
+ * subxids go with each top-level xid via the start offset held in each
+ * RunningXact struct.
+ */
+ TransactionId *subxip; /* array of subxact IDs in progress */
+
+} RunningXactsData;
+
+typedef RunningXactsData *RunningTransactions;
+
+/*
+ * When we write running xact data to WAL, we use this structure.
+ */
+typedef struct xl_xact_running_xacts
+{
+ int xcnt; /* # of xact ids in xrun[] */
+ int subxcnt; /* # of xact ids in subxip[] */
+ TransactionId latestRunningXid; /* Initial setting of LatestObservedXid */
+ TransactionId oldestRunningXid; /* *not* oldestXmin */
+ TransactionId latestCompletedXid;
+
+ /* Array of RunningXact(s) */
+ RunningXact xrun[1]; /* VARIABLE LENGTH ARRAY */
+
+ /* ARRAY OF RUNNING SUBTRANSACTION XIDs FOLLOWS */
+} xl_xact_running_xacts;
+
+#define MinSizeOfXactRunningXacts offsetof(xl_xact_running_xacts, xrun)
+
+/*
* Result codes for HeapTupleSatisfiesUpdate. This should really be in
* tqual.h, but we want to avoid including that file elsewhere.
*/
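
A minimal sketch of walking a decoded xl_xact_running_xacts record; that
the subxid array begins immediately after xrun[xcnt] is an assumption
based on the trailing-array comment above:

	TransactionId *subxip = (TransactionId *) &xlrec->xrun[xlrec->xcnt];
	int			i,
				j;

	for (i = 0; i < xlrec->xcnt; i++)
	{
		RunningXact *rx = &xlrec->xrun[i];

		for (j = 0; j < rx->nsubxids; j++)
		{
			TransactionId subxid = subxip[rx->subx_offset + j];

			/* e.g. mark subxid in-progress in standby tracking state */
			(void) subxid;
		}
	}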
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 7a2f374e7b..44b345d484 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -67,9 +67,9 @@ test: select
ignore: random
# ----------
-# Another group of parallel tests
+# Another group of parallel tests (test removed: prepared_xacts)
# ----------
-test: select_into select_distinct select_distinct_on select_implicit select_having subselect union case join aggregates transactions random portals arrays btree_index hash_index update namespace prepared_xacts delete
+test: select_into select_distinct select_distinct_on select_implicit select_having subselect union case join aggregates transactions random portals arrays btree_index hash_index update namespace delete
test: privileges
test: misc
diff --git a/src/test/regress/serial_schedule b/src/test/regress/serial_schedule
index 594345f9fe..db2ebbdfec 100644
--- a/src/test/regress/serial_schedule
+++ b/src/test/regress/serial_schedule
@@ -84,7 +84,7 @@ test: hash_index
test: update
test: delete
test: namespace
-test: prepared_xacts
+#test: prepared_xacts
test: privileges
test: misc
test: select_views