From 3da534dc6c28b2b3127d7400ab524f12b9faa0ae Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 6 Mar 2019 16:46:04 +1300 Subject: [PATCH] Add undo log manager. Add a new subsystem to manage undo logs. Undo logs allow data to be appended efficiently, like logs. They also allow data to be discarded efficiently from the other end, like a queue. Thirdly, they allow efficient buffered random access, like a relation. Undo logs physically consist of a set of 1MB segment files under $PGDATA/base/undo (or per-tablespace equivalent) that are created, deleted or renamed as required, similarly to the way that WAL segments are managed. Meta-data about the set of undo logs is stored in shared memory, and written to per-checkpoint files under $PGDATA/pg_undo. Provide access to the undo files managed by undolog.c through bufmgr.c. A new SMGR implementation allows bufmgr.c to access files created by undolog.c. Author: Thomas Munro, with contributions from Dilip Kumar, Rafia Sabih, Robert Haas and Amit Kapila Reviewed-by: Discussion: https://postgr.es/m/CAEepm%3D2EqROYJ_xYz4v5kfr4b0qw_Lq_6Pe8RTEC8rx3upWsSQ%40mail.gmail.com --- src/backend/access/Makefile | 2 +- src/backend/access/rmgrdesc/Makefile | 2 +- src/backend/access/rmgrdesc/undologdesc.c | 81 + src/backend/access/transam/rmgr.c | 1 + src/backend/access/transam/xlog.c | 5 + src/backend/access/transam/xlogutils.c | 70 +- src/backend/access/undo/Makefile | 17 + src/backend/access/undo/undolog.c | 2627 +++++++++++++++++++++ src/backend/bootstrap/bootstrap.c | 3 + src/backend/catalog/system_views.sql | 4 + src/backend/commands/tablespace.c | 23 + src/backend/postmaster/pgstat.c | 21 + src/backend/replication/basebackup.c | 18 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 80 +- src/backend/storage/buffer/localbuf.c | 43 + src/backend/storage/file/fd.c | 3 +- src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/lmgr/lwlock.c | 2 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/smgr/Makefile | 2 +- src/backend/storage/smgr/smgr.c | 23 + src/backend/storage/smgr/undofile.c | 422 ++++ src/backend/storage/sync/sync.c | 6 + src/backend/utils/init/postinit.c | 2 + src/backend/utils/misc/guc.c | 12 + src/bin/initdb/initdb.c | 2 + src/bin/pg_checksums/pg_checksums.c | 23 +- src/bin/pg_resetwal/pg_resetwal.c | 76 + src/bin/pg_upgrade/Makefile | 2 +- src/bin/pg_upgrade/check.c | 43 + src/bin/pg_upgrade/controldata.c | 25 + src/bin/pg_upgrade/exec.c | 4 + src/bin/pg_upgrade/pg_upgrade.c | 2 + src/bin/pg_upgrade/pg_upgrade.h | 5 + src/bin/pg_upgrade/undo.c | 292 +++ src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/rmgrlist.h | 1 + src/include/access/session.h | 7 + src/include/access/undolog.h | 489 ++++ src/include/access/undolog_xlog.h | 62 + src/include/access/xlogutils.h | 14 + src/include/catalog/database_internal.h | 21 + src/include/catalog/pg_proc.dat | 7 + src/include/pgstat.h | 7 + src/include/storage/bufmgr.h | 14 +- src/include/storage/fd.h | 1 + src/include/storage/lwlock.h | 5 +- src/include/storage/smgr.h | 3 + src/include/storage/sync.h | 3 +- src/include/storage/undofile.h | 59 + src/include/utils/guc.h | 2 + src/test/regress/expected/rules.out | 10 + 53 files changed, 4613 insertions(+), 41 deletions(-) create mode 100644 src/backend/access/rmgrdesc/undologdesc.c create mode 100644 src/backend/access/undo/Makefile create mode 100644 src/backend/access/undo/undolog.c create mode 100644 src/backend/storage/smgr/undofile.c create mode 100644 src/bin/pg_upgrade/undo.c create 
mode 100644 src/include/access/undolog.h create mode 100644 src/include/access/undolog_xlog.h create mode 100644 src/include/catalog/database_internal.h create mode 100644 src/include/storage/undofile.h diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..bf6d3fa1bd 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam + table tablesample transam undo include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index 5514db1dda..91ad1ef8a3 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -11,6 +11,6 @@ include $(top_builddir)/src/Makefile.global OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o genericdesc.o \ gindesc.o gistdesc.o hashdesc.o heapdesc.o logicalmsgdesc.o \ mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o seqdesc.o \ - smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o + smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o undologdesc.o xactdesc.o xlogdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c new file mode 100644 index 0000000000..f89fcb31c6 --- /dev/null +++ b/src/backend/access/rmgrdesc/undologdesc.c @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * undologdesc.c + * rmgr descriptor routines for access/undo/undolog.c + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/undologdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/undolog.h" +#include "access/undolog_xlog.h" + +void +undolog_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_UNDOLOG_CREATE) + { + xl_undolog_create *xlrec = (xl_undolog_create *) rec; + + appendStringInfo(buf, "logno %u", xlrec->logno); + } + else if (info == XLOG_UNDOLOG_EXTEND) + { + xl_undolog_extend *xlrec = (xl_undolog_extend *) rec; + + appendStringInfo(buf, "logno %u end " UndoLogOffsetFormat, + xlrec->logno, xlrec->end); + } + else if (info == XLOG_UNDOLOG_DISCARD) + { + xl_undolog_discard *xlrec = (xl_undolog_discard *) rec; + + appendStringInfo(buf, "logno %u discard " UndoLogOffsetFormat " end " + UndoLogOffsetFormat, + xlrec->logno, xlrec->discard, xlrec->end); + } + else if (info == XLOG_UNDOLOG_SWITCH) + { + xl_undolog_switch *xlrec = (xl_undolog_switch *) rec; + + appendStringInfo(buf, "logno %u start " UndoLogOffsetFormat " last " UndoLogOffsetFormat, + xlrec->logno, + xlrec->prevlog_xact_start, + xlrec->prevlog_last_urp); + } + +} + +const char * +undolog_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_UNDOLOG_CREATE: + id = "CREATE"; + break; + case XLOG_UNDOLOG_EXTEND: + id = "EXTEND"; + break; + case XLOG_UNDOLOG_DISCARD: + id = "DISCARD"; + break; + case XLOG_UNDOLOG_SWITCH: + id = "SWITCH"; + break; + } + + return id; +} diff --git a/src/backend/access/transam/rmgr.c 
b/src/backend/access/transam/rmgr.c index 9368b56c4c..8b0537405a 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -18,6 +18,7 @@ #include "access/multixact.h" #include "access/nbtxlog.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b6c9353cbd..5dbe485af2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/twophase.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xloginsert.h" @@ -6710,6 +6711,9 @@ StartupXLOG(void) */ restoreTwoPhaseData(); + /* Recover undo log meta data corresponding to this checkpoint. */ + StartupUndoLogs(ControlFile->checkPointCopy.redo); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -8977,6 +8981,7 @@ static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointCLOG(); + CheckPointUndoLogs(checkPointRedo, ControlFile->checkPointCopy.redo); CheckPointCommitTs(); CheckPointSUBTRANS(); CheckPointMultiXact(); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 10a663bae6..c227c03854 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -293,6 +293,65 @@ XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, false, buf); } +/* + * Find the block ID of the first block that matches the given rnode forknum + * and blockno. If blockno is InvalidBlockNumber, then match any block + * number. Return true if found. + */ +bool +XLogFindBlockId(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + uint8 *block_id) +{ + uint8 i; + + for (i = 0; i <= record->max_block_id; ++i) + { + DecodedBkpBlock *block = &record->blocks[i]; + + if (block->in_use && + RelFileNodeEquals(block->rnode, rnode) && + block->forknum == forknum && + (block->blkno == blockno || blockno == InvalidBlockNumber)) + { + *block_id = i; + return true; + } + } + + return false; +} + +/* + * If the caller doesn't know the the block_id, but does know the RelFileNode, + * forknum and block number, then we try to find it. + */ +XLogRedoAction +XLogReadBufferForRedoBlock(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + ReadBufferMode mode, + bool get_cleanup_lock, + Buffer *buf) +{ + uint8 block_id; + + if (XLogFindBlockId(record, rnode, forknum, blockno, &block_id)) + return XLogReadBufferForRedoExtended(record, + block_id, + mode, + get_cleanup_lock, + buf); + + elog(ERROR, "failed to find block reference rel %u/%u/%u, forknum = %u, block = %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blockno); + + return BLK_NOTFOUND; /* not reached */ +} + /* * Pin and lock a buffer referenced by a WAL record, for the purpose of * re-initializing it. @@ -346,7 +405,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. 
*/ - zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + zeromode = (mode == RBM_ZERO || mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK); willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); @@ -462,7 +522,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, { /* page exists in file */ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, RELPERSISTENCE_PERMANENT); } else { @@ -487,7 +547,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL); + P_NEW, mode, NULL, + RELPERSISTENCE_PERMANENT); } while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ @@ -497,7 +558,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, + RELPERSISTENCE_PERMANENT); } } diff --git a/src/backend/access/undo/Makefile b/src/backend/access/undo/Makefile new file mode 100644 index 0000000000..219c6963cf --- /dev/null +++ b/src/backend/access/undo/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/undo +# +# IDENTIFICATION +# src/backend/access/undo/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/undo +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = undolog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/undo/undolog.c b/src/backend/access/undo/undolog.c new file mode 100644 index 0000000000..f2e0272ab1 --- /dev/null +++ b/src/backend/access/undo/undolog.c @@ -0,0 +1,2627 @@ +/*------------------------------------------------------------------------- + * + * undolog.c + * management of undo logs + * + * PostgreSQL undo log manager. This module is responsible for managing the + * lifecycle of undo logs and their segment files, associating undo logs with + * backends, and allocating space within undo logs. + * + * For the code that reads and writes blocks of data, see undofile.c. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/undo/undolog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/session.h" +#include "access/transam.h" +#include "access/undolog.h" +#include "access/undolog_xlog.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "storage/standby.h" +#include "storage/sync.h" +#include "storage/undofile.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/varlena.h" + +#include +#include + +/* + * Main control structure for undo log management in shared memory. + * UndoLogSlot objects are arranged in a fixed-size array, with no particular + * ordering. + */ +typedef struct UndoLogSharedData +{ + UndoLogNumber free_lists[UndoLogCategories]; + UndoLogNumber low_logno; + UndoLogNumber next_logno; + UndoLogNumber nslots; + UndoLogSlot slots[FLEXIBLE_ARRAY_MEMBER]; +} UndoLogSharedData; + +/* The shared memory region that all backends are attach to. */ +UndoLogSharedData *UndoLogShared; + +undologtable_hash *undologtable_cache; + +/* GUC variables */ +char *undo_tablespaces = NULL; + +static UndoLogSlot *find_undo_log_slot(UndoLogNumber logno, bool locked); +static UndoLogSlot *allocate_undo_log_slot(void); +static void free_undo_log_slot(UndoLogSlot *log); +static void attach_undo_log(UndoLogCategory category, Oid tablespace); +static void detach_current_undo_log(UndoLogCategory category, bool full); +static void extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end); +static void undo_log_before_exit(int code, Datum value); +static void forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, + bool drop_tail); +static bool choose_undo_tablespace(bool force_detach, Oid *oid); + +PG_FUNCTION_INFO_V1(pg_stat_get_undo_logs); + +/* + * How many undo logs can be active at a time? This creates a theoretical + * maximum amount of undo data that can exist, but if we set it to a multiple + * of the maximum number of backends it will be a very high limit. + * Alternative designs involving demand paging or dynamic shared memory could + * remove this limit but would be complicated. + */ +static inline size_t +UndoLogNumSlots(void) +{ + return MaxBackends * 4; +} + +/* + * Return the amount of traditional shmem required for undo log management. + */ +Size +UndoLogShmemSize(void) +{ + return sizeof(UndoLogSharedData) + + UndoLogNumSlots() * sizeof(UndoLogSlot); +} + +/* + * Initialize the undo log subsystem. Called in each backend. + */ +void +UndoLogShmemInit(void) +{ + bool found; + + UndoLogShared = (UndoLogSharedData *) + ShmemInitStruct("UndoLogShared", UndoLogShmemSize(), &found); + + /* The postmaster initialized the shared memory state. */ + if (!IsUnderPostmaster) + { + int i; + + Assert(!found); + + /* + * We start with no active undo logs. 
StartUpUndoLogs() will recreate + * the undo logs that were known at the last checkpoint. + */ + memset(UndoLogShared, 0, sizeof(*UndoLogShared)); + UndoLogShared->nslots = UndoLogNumSlots(); + for (i = 0; i < UndoLogCategories; ++i) + UndoLogShared->free_lists[i] = InvalidUndoLogNumber; + for (i = 0; i < UndoLogShared->nslots; ++i) + { + memset(&UndoLogShared->slots[i], 0, sizeof(UndoLogShared->slots[i])); + UndoLogShared->slots[i].logno = InvalidUndoLogNumber; + LWLockInitialize(&UndoLogShared->slots[i].mutex, + LWTRANCHE_UNDOLOG); + LWLockInitialize(&UndoLogShared->slots[i].discard_lock, + LWTRANCHE_UNDODISCARD); + } + } + else + Assert(found); + + /* All backends prepare their per-backend lookup table. */ + undologtable_cache = undologtable_create(TopMemoryContext, + UndoLogNumSlots(), + NULL); +} + +void +UndoLogInit(void) +{ + before_shmem_exit(undo_log_before_exit, 0); +} + +/* + * Figure out which directory holds an undo log based on tablespace. + */ +void +UndoLogDirectory(Oid tablespace, char *dir) +{ + if (tablespace == DEFAULTTABLESPACE_OID || + tablespace == InvalidOid) + snprintf(dir, MAXPGPATH, "base/undo"); + else + snprintf(dir, MAXPGPATH, "pg_tblspc/%u/%s/undo", + tablespace, TABLESPACE_VERSION_DIRECTORY); +} + +/* + * Compute the pathname to use for an undo log segment file. + */ +void +UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, char *path) +{ + char dir[MAXPGPATH]; + + /* Figure out which directory holds the segment, based on tablespace. */ + UndoLogDirectory(tablespace, dir); + + /* + * Build the path from log number and offset. The pathname is the + * UndoRecPtr of the first byte in the segment in hexadecimal, with a + * period inserted between the components. + */ + snprintf(path, MAXPGPATH, "%s/%06X.%010zX", dir, logno, + segno * UndoLogSegmentSize); +} + +/* + * Iterate through the set of currently active logs. Pass in NULL to get the + * first undo log. NULL indicates the end of the set of logs. The caller + * must lock the returned log before accessing its members, and must skip if + * logno is not valid. + */ +UndoLogSlot * +UndoLogNextSlot(UndoLogSlot *slot) +{ + LWLockAcquire(UndoLogLock, LW_SHARED); + for (;;) + { + /* Advance to the next log. */ + if (slot == NULL) + { + /* Start at the beginning. */ + slot = &UndoLogShared->slots[0]; + } + else if (++slot == &UndoLogShared->slots[UndoLogShared->nslots]) + { + /* Past the end. */ + slot = NULL; + break; + } + /* Have we found a slot with a valid log? */ + if (slot->logno != InvalidUndoLogNumber) + break; + } + LWLockRelease(UndoLogLock); + + /* XXX: erm, which lock should the caller hold!? */ + return slot; +} + +/* + * Check if an undo log position has been discarded. 'pointer' must be an + * undo log pointer that was allocated at some point in the past, otherwise + * the result is undefined. + */ +bool +UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(pointer); + UndoLogSlot *slot; + UndoRecPtr discard; + + slot = find_undo_log_slot(logno, false); + + if (slot == NULL) + { + /* + * If we couldn't find the undo log number, then it must be entirely + * discarded. Set this backend's recent_discard value to the highest + * possible value, so that all records appear to be discarded to the + * fast-path code. Technically this value is too low by 1, but + * assuming only pointers to records are tested, and no record can + * have size 1, this value suffices. 
+ */ + discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1); + } + else + { + LWLockAcquire(&slot->mutex, LW_SHARED); + if (unlikely(logno != slot->logno)) + { + /* + * The undo log has been entirely discarded since we looked it up + * above, and the UndoLogSlot is now unused or being used for some + * other undo log. This is the same as not finding it. + */ + discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1); + } + else + discard = MakeUndoRecPtr(logno, slot->meta.discard); + LWLockRelease(&slot->mutex); + } + + /* + * Remember this discard pointer in this backend so that future lookups + * via UndoLogRecPtrIsDiscarded() have a chance of avoiding the slow path. + */ + UndoLogGetTableEntry(logno)->recent_discard = discard; + + return pointer < discard; +} + +/* + * Fetch the previous transaction's start undo record point. + */ +UndoRecPtr +UndoLogGetLastXactStartPoint(UndoLogNumber logno) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + uint64 last_xact_start = 0; + + if (unlikely(slot == NULL)) + return InvalidUndoRecPtr; + + LWLockAcquire(&slot->mutex, LW_SHARED); + /* TODO: review */ + last_xact_start = slot->meta.unlogged.last_xact_start; + LWLockRelease(&slot->mutex); + + if (last_xact_start == 0) + return InvalidUndoRecPtr; + + return MakeUndoRecPtr(logno, last_xact_start); +} + +/* + * Detach from the undo log we are currently attached to, returning it to the + * appropriate free list if it still has space. + */ +static void +detach_current_undo_log(UndoLogCategory category, bool full) +{ + UndoLogSlot *slot; + + slot = CurrentSession->attached_undo_slots[category]; + + Assert(slot != NULL); + + CurrentSession->attached_undo_slots[category] = NULL; + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = InvalidPid; + slot->meta.unlogged.xid = InvalidTransactionId; + if (full) + slot->meta.status = UNDO_LOG_STATUS_FULL; + LWLockRelease(&slot->mutex); + + /* Push back onto the appropriate free list, unless it's full. */ + if (!full) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[category]; + UndoLogShared->free_lists[category] = slot->logno; + LWLockRelease(UndoLogLock); + } +} + +/* + * Exit handler, detaching from all undo logs. + */ +static void +undo_log_before_exit(int code, Datum arg) +{ + int i; + + if (!CurrentSession) + return; + + for (i = 0; i < UndoLogCategories; ++i) + { + if (CurrentSession->attached_undo_slots[i] != NULL) + detach_current_undo_log(i, false); + } +} + +/* + * Create a new empty segment file on disk for the byte starting at 'end'. + */ +static void +allocate_empty_undo_segment(UndoLogNumber logno, Oid tablespace, + UndoLogOffset end) +{ + struct stat stat_buffer; + off_t size; + char path[MAXPGPATH]; + void *zeroes; + size_t nzeroes = 8192; + int fd; + + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, tablespace, path); + + /* + * Create and fully allocate a new file. If we crashed and recovered + * then the file might already exist, so use flags that tolerate that. + * It's also possible that it exists but is too short, in which case + * we'll write the rest. We don't really care what's in the file, we + * just want to make sure that the filesystem has allocated physical + * blocks for it, so that non-COW filesystems will report ENOSPC now + * rather than later when the space is needed and we'll avoid creating + * files with holes. 
+ */ + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0 && tablespace != 0) + { + char undo_path[MAXPGPATH]; + + /* Try creating the undo directory for this tablespace. */ + UndoLogDirectory(tablespace, undo_path); + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + { + char *parentdir; + + if (errno != ENOENT || !InRecovery) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + + /* + * In recovery, it's possible that the tablespace directory + * doesn't exist because a later WAL record removed the whole + * tablespace. In that case we create a regular directory to + * stand in for it. This is similar to the logic in + * TablespaceCreateDbspace(). + */ + + /* create two parents up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? */ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + /* create one parent up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? */ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + } + + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + } + if (fd < 0) + elog(ERROR, "could not create new file \"%s\": %m", path); + if (fstat(fd, &stat_buffer) < 0) + elog(ERROR, "could not stat \"%s\": %m", path); + size = stat_buffer.st_size; + + /* A buffer full of zeroes we'll use to fill up new segment files. */ + zeroes = palloc0(nzeroes); + + while (size < UndoLogSegmentSize) + { + ssize_t written; + + written = write(fd, zeroes, Min(nzeroes, UndoLogSegmentSize - size)); + if (written < 0) + elog(ERROR, "cannot initialize undo log segment file \"%s\": %m", + path); + size += written; + } + + /* Flush the contents of the file to disk before the next checkpoint. */ + undofile_request_sync(logno, end / UndoLogSegmentSize, tablespace); + + CloseTransientFile(fd); + + pfree(zeroes); + + elog(DEBUG1, "created undo segment \"%s\"", path); +} + +/* + * Create a new undo segment, when it is unexpectedly not present. + */ +void +UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno) +{ + Assert(InRecovery); + allocate_empty_undo_segment(logno, tablespace, segno * UndoLogSegmentSize); +} + +/* + * Create and zero-fill a new segment for a given undo log number. + */ +static void +extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end) +{ + UndoLogSlot *slot; + size_t end; + + slot = find_undo_log_slot(logno, false); + + /* TODO review interlocking */ + + Assert(slot != NULL); + Assert(slot->meta.end % UndoLogSegmentSize == 0); + Assert(new_end % UndoLogSegmentSize == 0); + Assert(InRecovery || + CurrentSession->attached_undo_slots[slot->meta.category] == slot); + + /* + * Create all the segments needed to increase 'end' to the requested + * size. This is quite expensive, so we will try to avoid it completely + * by renaming files into place in UndoLogDiscard() instead. 
+ */ + end = slot->meta.end; + while (end < new_end) + { + allocate_empty_undo_segment(logno, slot->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + /* Flush the directory entries before next checkpoint. */ + undofile_request_sync_dir(slot->meta.tablespace); + + /* + * If we're not in recovery, we need to WAL-log the creation of the new + * file(s). We do that after the above filesystem modifications, in + * violation of the data-before-WAL rule as exempted by + * src/backend/access/transam/README. This means that it's possible for + * us to crash having made some or all of the filesystem changes but + * before WAL logging, but in that case we'll eventually try to create the + * same segment(s) again, which is tolerated. + */ + if (!InRecovery) + { + xl_undolog_extend xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.end = end; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_EXTEND); + XLogFlush(ptr); + } + + /* + * We didn't need to acquire the mutex to read 'end' above because only + * we write to it. But we need the mutex to update it, because the + * checkpointer might read it concurrently. + * + * XXX It's possible for meta.end to be higher already during + * recovery, because of the timing of a checkpoint; in that case we did + * nothing above and we shouldn't update shmem here. That interaction + * needs more analysis. + */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (slot->meta.end < end) + slot->meta.end = end; + LWLockRelease(&slot->mutex); +} + +/* + * This function must be called before all of the undo log activity that will + * be covered by a single WAL record. + */ +void +UndoLogBeginInsert(UndoLogAllocContext *context, + UndoLogCategory category, + XLogReaderState *xlog_record) +{ + context->try_location = InvalidUndoRecPtr; + context->category = category; + + /* + * Tell UndoLogAllocate() to capture undo log meta-data before-change + * images, so that UndoLogRegister() can find them and they can be written + * to the WAL once per checkpoint. + */ + context->num_meta_data_images = 0; + + /* + * Tell UndoLogAllocateInRecovery() that we don't know which undo log to + * allocate in yet, and to start its search for registered blocks at + * the lowest-numbered block_id. + */ + context->xlog_record = xlog_record; + context->recovery_logno = InvalidUndoLogNumber; + context->recovery_block_id = 0; + + /* + * For UNDO_SHARED, this always denotes the beginning of a new record set. + * For other categories, the boundaries are detected by transaction ID + * changes. + */ + context->new_shared_record_set = category == UNDO_SHARED; +} + +/* + * Get an insertion point that is guaranteed to be backed by enough space to + * hold 'size' bytes of data. To actually write into the undo log, client + * code should call this first and then use bufmgr routines to access buffers + * and provide WAL logs and redo handlers. In other words, while this module + * looks after making sure the undo log has sufficient space and the undo meta + * data is crash safe, the *contents* of the undo log and (indirectly) the + * insertion point are the responsibility of client code. + * + * A suggested insertion point can optionally be passed in as 'try_location', + * and will be returned if possible. If not InvalidUndoRecPtr, it must fall + * with, or exactly one byte after, the most recent allocation for the same + * persistence level. 
This interface allows for a series of allocations to be + * made without committing to using the space yet; call UndoLogAdvance() to + * actually advance the insert pointer. + * + * Return an undo log insertion point that can be converted to a buffer tag + * and an insertion point within a buffer page. + */ +UndoRecPtr +UndoLogAllocate(UndoLogAllocContext *context, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_insert_urp) +{ + Session *session = CurrentSession; + UndoLogSlot *slot; + UndoLogOffset new_insert; + TransactionId logxid; + + slot = CurrentSession->attached_undo_slots[context->category]; + + /* + * We may need to attach to an undo log, either because this is the first + * time this backend has needed to write to an undo log at all or because + * the undo_tablespaces GUC was changed. When doing that, we'll need + * interlocking against tablespaces being concurrently dropped. + */ + + retry: + /* See if we need to check the undo_tablespaces GUC. */ + if (unlikely(session->need_to_choose_undo_tablespace || slot == NULL)) + { + Oid tablespace; + bool need_to_unlock; + + need_to_unlock = + choose_undo_tablespace(session->need_to_choose_undo_tablespace, + &tablespace); + attach_undo_log(context->category, tablespace); + if (need_to_unlock) + LWLockRelease(TablespaceCreateLock); + slot = CurrentSession->attached_undo_slots[context->category]; + session->need_to_choose_undo_tablespace = false; + } + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + logxid = slot->meta.unlogged.xid; + + if (logxid != GetTopTransactionId()) + { + /* + * While we have the lock, check if we have been forcibly detached by + * DROP TABLESPACE. That can only happen between transactions (see + * DropUndoLogsInsTablespace()). + */ + if (slot->pid == InvalidPid) + { + LWLockRelease(&slot->mutex); + slot = NULL; + goto retry; + } + /* Record that we are attached to this log. */ + slot->meta.unlogged.xid = GetTopTransactionId(); + /* + * Maintain our tracking of the current and previous transaction start + * locations. + */ + if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert) + { + slot->meta.unlogged.last_xact_start = + slot->meta.unlogged.this_xact_start; + slot->meta.unlogged.this_xact_start = slot->meta.unlogged.insert; + } + } + LWLockRelease(&slot->mutex); + + /* + * 'size' is expressed in usable non-header bytes. Figure out how far we + * have to move insert to create space for 'size' usable bytes, stepping + * over any intervening headers. + */ + Assert(slot->meta.unlogged.insert % BLCKSZ >= UndoLogBlockHeaderSize); + if (context->try_location != InvalidUndoRecPtr) + { + /* + * The try location must be in the log we're attached to, at most one + * byte past the end of space backed by files. + */ + UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location); + + Assert(UndoRecPtrGetLogNo(context->try_location) == slot->logno); + Assert(try_offset <= slot->meta.end); + new_insert = UndoLogOffsetPlusUsableBytes(try_offset, size); + } + else + { + new_insert = UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, + size); + } + Assert(new_insert % BLCKSZ >= UndoLogBlockHeaderSize); + + /* + * We don't need to acquire log->mutex to read log->meta.insert and + * log->meta.end, because this backend is the only one that can + * modify them. + */ + if (unlikely(new_insert > slot->meta.end)) + { + if (new_insert > UndoLogMaxSize) + { + /* This undo log is entirely full. Get a new one.
*/ + if (logxid == GetTopTransactionId()) + { + /* + * If the same transaction is split over two undo logs then + * store the previous log number in new log. See detailed + * comments in undorecord.c file header. + */ + *prevlog_xact_start = + MakeUndoRecPtr(slot->logno, + slot->meta.unlogged.this_xact_start); + *prevlog_insert_urp = + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + } + elog(DEBUG1, "undo log %u is full, switching to a new one", slot->logno); + slot = NULL; + detach_current_undo_log(context->category, true); + context->try_location = InvalidUndoRecPtr; + goto retry; + } + /* + * Extend the end of this undo log to cover new_insert (in other words + * round up to the segment size). + */ + extend_undo_log(slot->logno, + new_insert + UndoLogSegmentSize - + new_insert % UndoLogSegmentSize); + Assert(new_insert <= slot->meta.end); + } + + /* + * Create a back-up image of the unlogged part of the undo log's + * meta-data, if we haven't already done so since UndoLogBeginInsert() (ie + * for the WAL record that this undo allocation will be replayed by). + */ + if (context->num_meta_data_images == 0 || + context->meta_data_images[context->num_meta_data_images - 1].logno != slot->logno) + { + if (context->num_meta_data_images >= MAX_META_DATA_IMAGES) + elog(ERROR, "too many undo log meta data images"); + context->meta_data_images[context->num_meta_data_images].logno = slot->logno; + context->meta_data_images[context->num_meta_data_images++].data = slot->meta.unlogged; + } + + /* + * If no try_location was passed in, or if we switched logs, then we'll + * return the current insertion point. + */ + if (context->try_location == InvalidUndoRecPtr) + context->try_location = MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + + /* + * Is this location the first in this undo log for a transaction or a + * shared record set? + */ + if (context->new_shared_record_set) + { + context->new_shared_record_set = false; + *need_xact_header = true; + } + else + { + *need_xact_header = + UndoRecPtrGetOffset(context->try_location) == + slot->meta.unlogged.this_xact_start; + } + *last_xact_start = + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.last_xact_start); + + return context->try_location; +} + +void +UndoLogRegister(UndoLogAllocContext *context, uint8 block_id, UndoLogNumber logno) +{ + int i; + + for (i = 0; i < context->num_meta_data_images; ++i) + { + if (context->meta_data_images[i].logno == logno) + { + XLogRegisterBufData(block_id, + (char *) &context->meta_data_images[i].data, + sizeof(context->meta_data_images[i].data)); + return; + } + } +} + +/* + * In recovery, we expect exactly the same sequence of allocation sizes, but + * we also need the WAL record that is being replayed so we can figure out + * where the undo space was allocated. + */ +UndoRecPtr +UndoLogAllocateInRecovery(UndoLogAllocContext *context, + TransactionId xid, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_last_urp) +{ + UndoLogSlot *slot; + + Assert(InRecovery); + + /* + * Just as in UndoLogAllocate(), the caller may be extending an existing + * allocation before committing with UndoLogAdvance(). + */ + if (context->try_location != InvalidUndoRecPtr) + { + /* + * The try location must be in the log we're attached to, at most one + * byte past the end of space backed by files. 
+ */ + UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location); + UndoLogNumber logno = UndoRecPtrGetLogNo(context->try_location); + + /* + * You can only have a try_location on your second or later allocation + * for a given WAL record. It had better be in the same log as the + * previous allocation for this WAL record (though it may not turn out + * to have enough space, below). + */ + Assert(logno == context->recovery_logno); + + /* + * Any log extension triggered by UndoLogAllocate() must have been + * replayed by now, so we can just check if this log has enough space, + * and if so, return. + */ + slot = find_undo_log_slot(logno, false); + if (UndoLogOffsetPlusUsableBytes(try_offset, size) <= slot->meta.end) + { + *need_xact_header = false; + return try_offset; + } + + /* Full. Ignore try_location and find the next log that was used. */ + Assert(slot->meta.status == UNDO_LOG_STATUS_FULL); + } + else + { + /* + * For now we only support one allocation per WAL record that doesn't + * have a try_location (ie the first one). We'll have to find out + * which log was used first. + */ + Assert(context->recovery_logno == InvalidUndoLogNumber); + } + + /* + * In order to find the undo log that was used by UndoLogAllocate(), we + * consult the list of registered blocks to figure out which undo logs + * should be written to by this WAL record. + */ + while (context->recovery_block_id <= context->xlog_record->max_block_id) + { + DecodedBkpBlock *block; + + /* We're looking for the first block referencing a new undo log. */ + block = &context->xlog_record->blocks[context->recovery_block_id]; + if (block->rnode.dbNode == UndoDbOid && + block->rnode.relNode != context->recovery_logno) + { + UndoLogNumber logno = block->rnode.relNode; + const void *backup; + size_t backup_size; + + /* We found a reference to a different (or first) undo log. */ + slot = find_undo_log_slot(logno, false); + + /* + * Since on-line checkpoints capture an inconsistent snapshot of + * undo log meta-data, we'll restore the unlogged part of the + * meta-data image if one was attached to the WAL record (that is, + * the members that don't have WAL records for every change + * already). + */ + backup = + XLogRecGetBlockData(context->xlog_record, + context->recovery_block_id, + &backup_size); + if (unlikely(backup)) + { + Assert(backup_size == sizeof(UndoLogUnloggedMetaData)); + + /* Restore the unlogged members from the backup-imaged. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + memcpy(&slot->meta.unlogged, backup, sizeof(UndoLogUnloggedMetaData)); + LWLockRelease(&slot->mutex); + } + else + { + /* + * Otherwise we need to do our own transaction tracking + * whenever we see a new xid, to match the logic in + * UndoLogAllocate(). + */ + if (xid != slot->meta.unlogged.xid) + { + slot->meta.unlogged.xid = xid; + if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert) + slot->meta.unlogged.last_xact_start = + slot->meta.unlogged.this_xact_start; + slot->meta.unlogged.this_xact_start = + slot->meta.unlogged.insert; + } + } + + /* TODO: check locking against undo log slot recycling? */ + + /* + * At this stage we should have an undo log that can handle this + * allocation. If we don't, something is screwed up. 
+ */ + if (UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size) > slot->meta.end) + elog(ERROR, + "cannot allocate %d bytes in undo log %d", + (int) size, slot->logno); + + *need_xact_header = + context->try_location == InvalidUndoRecPtr && + slot->meta.unlogged.insert == slot->meta.unlogged.this_xact_start; + *last_xact_start = slot->meta.unlogged.last_xact_start; + context->recovery_logno = slot->logno; + + /* Read log switch information from meta and reset it. */ + *prevlog_xact_start = slot->meta.unlogged.prevlog_xact_start; + *prevlog_last_urp = slot->meta.unlogged.prevlog_last_urp; + + slot->meta.unlogged.prevlog_xact_start = InvalidUndoRecPtr; + slot->meta.unlogged.prevlog_last_urp = InvalidUndoRecPtr; + + return MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + } + ++context->recovery_block_id; + } + + /* + * If we've run out of blocks to inspect, then we must have replayed a + * different sequence of allocation sizes, or screwed up the + * XLOG_UNDOLOG_EXTEND records, indicating a bug somewhere. + */ + elog(ERROR, "cannot determine undo log to allocate from"); + + return 0; /* not reached */ +} + +/* + * Advance the insertion pointer in this context by 'size' usable (non-header) + * bytes. This is the next place we'll try to allocate a record, if it fits. + * This is not committed to shared memory until after we've WAL-logged the + * record and UndoLogAdvanceFinal() is called. + */ +void +UndoLogAdvance(UndoLogAllocContext *context, size_t size) +{ + context->try_location = UndoLogOffsetPlusUsableBytes(context->try_location, + size); +} + +/* + * Advance the insertion pointer to 'size' usable (non-header) bytes past + * insertion_point. + */ +void +UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size) +{ + UndoLogSlot *slot = NULL; + UndoLogNumber logno = UndoRecPtrGetLogNo(insertion_point) ; + + slot = find_undo_log_slot(logno, false); + + /* + * Either we're in recovery, or is a log we are currently attached to, or + * recently detached from because it was full. + */ + Assert(InRecovery || + AmAttachedToUndoLogSlot(slot) || + slot->meta.status == UNDO_LOG_STATUS_FULL); + + /* + * The caller has the current insertion point, as returned by + * UndoLogAllocate[InRecovery](). + */ + Assert(UndoRecPtrGetOffset(insertion_point) == slot->meta.unlogged.insert); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.unlogged.insert = + UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size); + LWLockRelease(&slot->mutex); +} + +/* + * Advance the discard pointer in one undo log, discarding all undo data + * relating to one or more whole transactions. The passed in undo pointer is + * the address of the oldest data that the called would like to keep, and the + * affected undo log is implied by this pointer, ie + * UndoRecPtrGetLogNo(discard_pointer). + * + * The caller asserts that there will be no attempts to access the undo log + * region being discarded after this moment. This operation will cause the + * relevant buffers to be dropped immediately, without writing any data out to + * disk. Any attempt to read the buffers (except a partial buffer at the end + * of this range which will remain) may result in IO errors, because the + * underlying segment file may have been physically removed. + * + * Return true if the discard point was updated, and false if nothing was done + * because the log precending the given point was already discarded. 
+ * + * TODO: The return value is not yet reliable and the code still doesn't work + * correctly if called for the same undo log in two backends; more + * interlocking work required here. + */ +bool +UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(discard_point); + UndoLogOffset discard = UndoRecPtrGetOffset(discard_point); + UndoLogOffset old_discard; + UndoLogOffset end; + UndoLogSlot *slot; + int segno; + int new_segno; + bool need_to_flush_wal = false; + bool entirely_discarded = false; + + slot = find_undo_log_slot(logno, false); + if (unlikely(slot == NULL)) + { + /* Already discarded (entirely). */ + return false; + } + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (unlikely(slot->logno != logno || discard <= slot->meta.discard)) + { + /* + * Already discarded entirely and the slot has been recycled, or up + * to this point). + */ + LWLockRelease(&slot->mutex); + return false; + } + if (discard > slot->meta.unlogged.insert) + elog(ERROR, "cannot move discard point past insert point"); + old_discard = slot->meta.discard; + end = slot->meta.end; + /* Are we discarding the last remaining data in a log marked as full? */ + if (slot->meta.status == UNDO_LOG_STATUS_FULL && + discard == slot->meta.unlogged.insert) + { + /* + * Adjust the discard and insert pointers so that the final segment is + * deleted from disk, and remember not to recycle it. + */ + entirely_discarded = true; + /* TODO: Check if the following line is replayed correctly */ + slot->meta.unlogged.insert = slot->meta.end; + discard = slot->meta.end; + } + LWLockRelease(&slot->mutex); + + /* + * TODO: I think we need a new lock just for this phase, so that buffer + * dropping and IO are done by only one backend if a superuser command and + * a discard worker both run this! + */ + + /* + * Drop all buffers holding this undo data out of the buffer pool (except + * the last one, if the new location is in the middle of it somewhere), so + * that the contained data doesn't ever touch the disk. The caller + * promises that this data will not be needed again. We have to drop the + * buffers from the buffer pool before removing files, otherwise a + * concurrent session might try to write the block to evict the buffer. + */ + forget_undo_buffers(logno, old_discard, discard, entirely_discarded); + + /* + * Check if we crossed a segment boundary and need to do some synchronous + * filesystem operations. + */ + segno = old_discard / UndoLogSegmentSize; + new_segno = discard / UndoLogSegmentSize; + if (segno < new_segno) + { + int recycle; + UndoLogOffset pointer; + + /* + * We always WAL-log discards, but we only need to flush the WAL if we + * have performed a filesystem operation. + */ + need_to_flush_wal = true; + + /* + * XXX When we rename or unlink a file, it's possible that some + * backend still has it open because it has recently read a page from + * it. smgr/undofile.c in any such backend will eventually close it, + * because it considers that fd to belong to the file with the name + * that we're unlinking or renaming and it doesn't like to keep more + * than one open at a time. No backend should ever try to read from + * such a file descriptor; that is what it means when we say that the + * caller of UndoLogDiscard() asserts that there will be no attempts + * to access the discarded range of undo log. In the case of a + * rename, if a backend were to attempt to read undo data in the range + * being discarded, it would read entirely the wrong data. 
+ */ + + /* + * How many segments should we recycle (= rename from tail position to + * head position)? For now it's always 1 unless there is already a + * spare one, but we could have an adaptive algorithm that recycles + * multiple segments at a time and pays just one fsync(). + */ + LWLockAcquire(&slot->mutex, LW_SHARED); + if ((slot->meta.end - slot->meta.unlogged.insert) < UndoLogSegmentSize && + slot->meta.status == UNDO_LOG_STATUS_ACTIVE) + recycle = 1; + else + recycle = 0; + LWLockRelease(&slot->mutex); + + /* Rewind to the start of the segment. */ + pointer = segno * UndoLogSegmentSize; + + while (pointer < new_segno * UndoLogSegmentSize) + { + char discard_path[MAXPGPATH]; + + /* Tell the checkpointer that the file is going away. */ + undofile_forget_sync(logno, pointer / UndoLogSegmentSize, + slot->meta.tablespace); + + UndoLogSegmentPath(logno, pointer / UndoLogSegmentSize, + slot->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? */ + if (recycle > 0) + { + char recycle_path[MAXPGPATH]; + + /* + * End points one byte past the end of the current undo space, + * ie to the first byte of the segment file we want to create. + */ + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, + slot->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"", + discard_path, recycle_path); + end += UndoLogSegmentSize; + --recycle; + } + else + { + elog(ERROR, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path); + else + elog(ERROR, "could not unlink \"%s\": %m", discard_path); + } + pointer += UndoLogSegmentSize; + } + } + + /* WAL log the discard. */ + { + xl_undolog_discard xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.discard = discard; + xlrec.end = end; + xlrec.latestxid = xid; + xlrec.entirely_discarded = entirely_discarded; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_DISCARD); + + if (need_to_flush_wal) + XLogFlush(ptr); + } + + /* Update shmem to show the new discard and end pointers. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.discard = discard; + slot->meta.end = end; + LWLockRelease(&slot->mutex); + + /* If we discarded everything, the slot can be given up. */ + if (entirely_discarded) + free_undo_log_slot(slot); + + return true; +} + +/* + * Return an UndoRecPtr to the oldest valid data in an undo log, or + * InvalidUndoRecPtr if it is empty. + */ +UndoRecPtr +UndoLogGetOldestRecord(UndoLogNumber logno, bool *full) +{ + UndoLogSlot *slot; + UndoRecPtr result; + + /* Try to find the slot for this undo log number. */ + slot = find_undo_log_slot(logno, false); + if (slot == NULL) + { + /* It's unknown to us, so we assume it's been entirely discarded. */ + if (full) + *full = true; + return InvalidUndoRecPtr; + } + + LWLockAcquire(&slot->mutex, LW_SHARED); + if (slot->logno != logno) + { + /* It's been recycled. SO it must have been entirely discarded. */ + result = InvalidUndoRecPtr; + if (full) + *full = true; + } + else if (slot->meta.discard == slot->meta.unlogged.insert) + { + /* It's empty, so there is no oldest record pointer to return. */ + result = InvalidUndoRecPtr; + if (full) + *full = slot->meta.status == UNDO_LOG_STATUS_FULL; + } + else + { + /* There is a record here! 
*/ + result = MakeUndoRecPtr(slot->logno, slot->meta.discard); + if (full) + *full = slot->meta.status == UNDO_LOG_STATUS_FULL; + } + LWLockRelease(&slot->mutex); + + return result; +} + +/* + * UndoLogSwitchSetPrevLogInfo - Store previous log info on the log switch and + * WAL-log the same. + */ +void +UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno, UndoRecPtr prevlog_xact_start, + UndoRecPtr prevlog_last_urp) +{ + UndoLogSlot *slot; + + slot = find_undo_log_slot(logno, false); + + /* + * Either we're in recovery, or this is a log we are currently attached + * to, or one we recently detached from because it was full. + */ + Assert(AmAttachedToUndoLogSlot(slot)); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.unlogged.prevlog_xact_start = prevlog_xact_start; + slot->meta.unlogged.prevlog_last_urp = prevlog_last_urp; + LWLockRelease(&slot->mutex); + + /* WAL-log the log switch. */ + { + xl_undolog_switch xlrec; + + xlrec.logno = logno; + xlrec.prevlog_xact_start = prevlog_xact_start; + xlrec.prevlog_last_urp = prevlog_last_urp; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_SWITCH); + } +} + +/* + * Return the next insert location. + */ +UndoRecPtr +UndoLogGetNextInsertPtr(UndoLogNumber logno) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + UndoRecPtr insert; + + LWLockAcquire(&slot->mutex, LW_SHARED); + /* TODO: what if the slot has been recycled? */ + insert = slot->meta.unlogged.insert; + LWLockRelease(&slot->mutex); + + return MakeUndoRecPtr(logno, insert); +} + +/* + * Delete unreachable files under pg_undo. Any files corresponding to LSN + * positions before the previous checkpoint are no longer needed. + */ +static void +CleanUpUndoCheckPointFiles(XLogRecPtr checkPointRedo) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char oldest_path[MAXPGPATH]; + + /* + * If a base backup is in progress, we can't delete any checkpoint + * snapshot files because one of them corresponds to the backup label but + * there could be any number of checkpoints during the backup. + */ + if (BackupInProgress()) + return; + + /* Otherwise keep only those >= the previous checkpoint's redo point. */ + snprintf(oldest_path, MAXPGPATH, "%016" INT64_MODIFIER "X", + checkPointRedo); + dir = AllocateDir("pg_undo"); + while ((de = ReadDir(dir, "pg_undo")) != NULL) + { + /* + * Assume that fixed width uppercase hex strings sort the same way as + * the values they represent, so we can use strcmp to identify undo + * log snapshot files corresponding to checkpoints that we don't need + * anymore. This assumption holds for ASCII. + */ + if (!(strlen(de->d_name) == UNDO_CHECKPOINT_FILENAME_LENGTH)) + continue; + + if (UndoCheckPointFilenamePrecedes(de->d_name, oldest_path)) + { + snprintf(path, MAXPGPATH, "pg_undo/%s", de->d_name); + if (unlink(path) != 0) + elog(ERROR, "could not unlink file \"%s\": %m", path); + } + } + FreeDir(dir); +} + +/* + * Write out the undo log meta data to the pg_undo directory. The actual + * contents of undo logs are in shared buffers and therefore handled by + * CheckPointBuffers(), but here we record the table of undo logs and their + * properties.
+ */ +void +CheckPointUndoLogs(XLogRecPtr checkPointRedo, XLogRecPtr priorCheckPointRedo) +{ + UndoLogMetaData *serialized = NULL; + size_t serialized_size = 0; + char *data; + char path[MAXPGPATH]; + UndoLogNumber num_logs; + int fd; + int i; + pg_crc32c crc; + + /* + * We acquire UndoLogLock to prevent any undo logs from being created or + * discarded while we build a snapshot of them. This isn't expected to + * take long on a healthy system because the number of active logs should + * be around the number of backends. Holding this lock won't prevent + * concurrent access to the undo log, except when segments need to be + * added or removed. + */ + LWLockAcquire(UndoLogLock, LW_SHARED); + + /* + * Rather than doing the file IO while we hold locks, we'll copy the + * meta-data into a palloc'd buffer. + */ + serialized_size = sizeof(UndoLogMetaData) * UndoLogNumSlots(); + serialized = (UndoLogMetaData *) palloc0(serialized_size); + + /* Scan through all slots looking for non-empty ones. */ + num_logs = 0; + for (i = 0; i < UndoLogNumSlots(); ++i) + { + UndoLogSlot *slot = &UndoLogShared->slots[i]; + + /* Skip empty slots. */ + if (slot->logno == InvalidUndoLogNumber) + continue; + + /* Capture snapshot while holding each mutex. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + serialized[num_logs++] = slot->meta; + LWLockRelease(&slot->mutex); + } + + LWLockRelease(UndoLogLock); + + /* Dump into a file under pg_undo. */ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_WRITE); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + /* Compute header checksum. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)); + COMP_CRC32C(crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)); + COMP_CRC32C(crc, &num_logs, sizeof(num_logs)); + FIN_CRC32C(crc); + + /* Write out the number of active logs + crc. */ + if ((write(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)) != sizeof(UndoLogShared->low_logno)) || + (write(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)) != sizeof(UndoLogShared->next_logno)) || + (write(fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) || + (write(fd, &crc, sizeof(crc)) != sizeof(crc))) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + /* Write out the meta data for all active undo logs. */ + data = (char *) serialized; + INIT_CRC32C(crc); + serialized_size = num_logs * sizeof(UndoLogMetaData); + while (serialized_size > 0) + { + ssize_t written; + + written = write(fd, data, serialized_size); + if (written < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + COMP_CRC32C(crc, data, written); + serialized_size -= written; + data += written; + } + FIN_CRC32C(crc); + + if (write(fd, &crc, sizeof(crc)) != sizeof(crc)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + + /* Flush file and directory entry. 
*/ + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_SYNC); + pg_fsync(fd); + if (CloseTransientFile(fd) < 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + fsync_fname("pg_undo", true); + pgstat_report_wait_end(); + + if (serialized) + pfree(serialized); + + CleanUpUndoCheckPointFiles(priorCheckPointRedo); +} + +void +StartupUndoLogs(XLogRecPtr checkPointRedo) +{ + char path[MAXPGPATH]; + int i; + int fd; + int nlogs; + pg_crc32c crc; + pg_crc32c new_crc; + + /* If initdb is calling, there is no file to read yet. */ + if (IsBootstrapProcessingMode()) + return; + + /* Open the pg_undo file corresponding to the given checkpoint. */ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_READ); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + elog(ERROR, "cannot open undo checkpoint snapshot \"%s\": %m", path); + + /* Read the active log number range. */ + if ((read(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)) + != sizeof(UndoLogShared->low_logno)) || + (read(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)) + != sizeof(UndoLogShared->next_logno)) || + (read(fd, &nlogs, sizeof(nlogs)) != sizeof(nlogs)) || + (read(fd, &crc, sizeof(crc)) != sizeof(crc))) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + + /* Verify the header checksum. */ + INIT_CRC32C(new_crc); + COMP_CRC32C(new_crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)); + COMP_CRC32C(new_crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)); + COMP_CRC32C(new_crc, &nlogs, sizeof(UndoLogShared->next_logno)); + FIN_CRC32C(new_crc); + + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + /* + * We'll acquire UndoLogLock just because allocate_undo_log() asserts we + * hold it (we don't actually expect concurrent access yet). + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* Initialize all the logs and set up the freelist. */ + INIT_CRC32C(new_crc); + for (i = 0; i < nlogs; ++i) + { + ssize_t size; + UndoLogSlot *slot; + + /* + * Get a new UndoLogSlot. If this checkpoint was created on a system + * with a higher max_connections setting, it's theoretically possible + * that we don't have enough space and cannot start up. + */ + slot = allocate_undo_log_slot(); + if (!slot) + ereport(ERROR, + (errmsg("not enough undo log slots to recover from checkpoint: need at least %d, have %zu", + nlogs, UndoLogNumSlots()), + errhint("Consider increasing max_connections"))); + + /* Read in the meta data for this undo log. */ + if ((size = read(fd, &slot->meta, sizeof(slot->meta))) != sizeof(slot->meta)) + elog(ERROR, "short read of pg_undo meta data in file \"%s\": %m (got %zu, wanted %zu)", + path, size, sizeof(slot->meta)); + COMP_CRC32C(new_crc, &slot->meta, sizeof(slot->meta)); + + /* + * At normal start-up, or during recovery, all active undo logs start + * out on the appropriate free list. + */ + slot->logno = slot->meta.logno; + slot->pid = InvalidPid; + slot->oldest_data = MakeUndoRecPtr(slot->logno, slot->meta.discard); + if (slot->meta.status == UNDO_LOG_STATUS_ACTIVE) + { + slot->next_free = UndoLogShared->free_lists[slot->meta.category]; + UndoLogShared->free_lists[slot->meta.category] = slot->logno; + } + } + FIN_CRC32C(new_crc); + + LWLockRelease(UndoLogLock); + + /* Verify body checksum. 
*/ + if (read(fd, &crc, sizeof(crc)) != sizeof(crc)) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + CloseTransientFile(fd); + pgstat_report_wait_end(); +} + +/* + * Allocate a new UndoLogSlot object. + */ +static UndoLogSlot * +allocate_undo_log_slot(void) +{ + UndoLogSlot *slot; + UndoLogNumber i; + + Assert(LWLockHeldByMeInMode(UndoLogLock, LW_EXCLUSIVE)); + + for (i = 0; i < UndoLogNumSlots(); ++i) + { + slot = &UndoLogShared->slots[i]; + if (slot->logno == InvalidUndoLogNumber) + { + memset(&slot->meta, 0, sizeof(slot->meta)); + slot->pid = 0; + slot->wait_fxmin = InvalidFullTransactionId; + slot->oldest_data =0; + slot->next_free = -1; + slot->logno = -1; + return slot; + } + } + + return NULL; +} + +/* + * Free an UndoLogSlot object in shared memory, so that it can be reused. + * This is a rare event, and has complications for all code paths that access + * slots. Unless the current session is attached to the slot, it must be + * prepared for it to be freed and then potentially recycled for use by + * another log. See UndoLogGetSlot(). + */ +static void +free_undo_log_slot(UndoLogSlot *slot) +{ + /* + * When removing an undo log from a slot in shared memory, we acquire + * UndoLogLock, log->mutex and log->discard_lock, so that other code can + * hold any one of those locks to prevent the slot from being recycled. + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + Assert(slot->logno != InvalidUndoLogNumber); + slot->logno = InvalidUndoLogNumber; + memset(&slot->meta, 0, sizeof(slot->meta)); + LWLockRelease(&slot->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * Find the UndoLogSlot object for a given log number. + * + * The caller may or may not already hold UndoLogLock, and should indicate + * this by passing 'locked'. We'll acquire it in the slow path if necessary. + * If it is not held by the caller, the caller must deal with the possibility + * that the returned UndoLogSlot no longer contains the requested logno by the + * time it is accessed. + * + * To do that, one of the following approaches must be taken by the calling + * code: + * + * 1. If the calling code knows that it is attached to this lock or is the + * recovery process, then there is no way for the slot to be recycled, so it's + * not necessary to check that the log number hasn't changed. The slot cannot + * be recycled while a backend is attached. It should probably assert that it + * is attached, however. + * + * 2. All other code should acquire log->mutex before accessing any members, + * and after doing so, check that the logno hasn't moved. If it is not, the + * entire undo log must be assumed to be discarded (as if this function + * returned NULL) and the caller must behave accordingly. + * + * Return NULL if the undo log has been entirely discarded. It is an error to + * ask for undo logs that have never been created. + */ +static UndoLogSlot * +find_undo_log_slot(UndoLogNumber logno, bool locked) +{ + UndoLogSlot *result = NULL; + UndoLogTableEntry *entry; + bool found; + + Assert(locked == LWLockHeldByMe(UndoLogLock)); + + /* First see if we already have it in our cache. */ + entry = undologtable_lookup(undologtable_cache, logno); + if (likely(entry)) + result = entry->slot; + else + { + UndoLogNumber i; + + /* Nope. Linear search for the slot in shared memory. 
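+ * The number of slots is bounded by UndoLogNumSlots(), and hits are
+ * remembered in the backend-local undologtable_cache above, so this
+ * linear search should be both short and rare.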
*/ + if (!locked) + LWLockAcquire(UndoLogLock, LW_SHARED); + for (i = 0; i < UndoLogNumSlots(); ++i) + { + if (UndoLogShared->slots[i].logno == logno) + { + /* Found it. */ + + /* + * TODO: Should this function be usable in a critical section? + * Would it make sense to detect that we are in a critical + * section and just return the pointer to the log without + * updating the cache, to avoid any chance of allocating + * memory? + */ + + entry = undologtable_insert(undologtable_cache, logno, &found); + entry->number = logno; + entry->slot = &UndoLogShared->slots[i]; + entry->tablespace = entry->slot->meta.tablespace; + entry->category = entry->slot->meta.category; + entry->recent_discard = + MakeUndoRecPtr(logno, entry->slot->meta.discard); + result = entry->slot; + break; + } + } + + /* + * If we didn't find it, then it must already have been entirely + * discarded. We create a negative cache entry so that we can answer + * this question quickly next time. + * + * TODO: We could track the lowest known undo log number, to reduce + * the negative cache entry bloat. + */ + if (result == NULL) + { + /* + * Sanity check: the caller should not be asking about undo logs + * that have never existed. + */ + if (logno >= UndoLogShared->next_logno) + elog(ERROR, "undo log %u hasn't been created yet", logno); + entry = undologtable_insert(undologtable_cache, logno, &found); + entry->number = logno; + entry->slot = NULL; + entry->tablespace = 0; + } + if (!locked) + LWLockRelease(UndoLogLock); + } + + return result; +} + +/* + * Get a pointer to an UndoLogSlot object corresponding to a given logno. + * + * In general, the caller must acquire the UndoLogSlot's mutex to access + * the contents, and at that time must consider that the logno might have + * changed because the undo log it contained has been entirely discarded. + * + * If the calling backend is currently attached to the undo log, that is not + * possible, because logs can only reach UNDO_LOG_STATUS_DISCARDED after first + * reaching UNDO_LOG_STATUS_FULL, and that only happens while detaching. + */ +UndoLogSlot * +UndoLogGetSlot(UndoLogNumber logno, bool missing_ok) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + + if (slot == NULL && !missing_ok) + elog(ERROR, "unknown undo log number %d", logno); + + return slot; +} + +/* + * Attach to a free undo log, creating a new one if required. + */ +static void +attach_undo_log(UndoLogCategory category, Oid tablespace) +{ + UndoLogSlot *slot = NULL; + UndoLogNumber logno; + UndoLogNumber *place; + + Assert(!InRecovery); + Assert(CurrentSession->attached_undo_slots[category] == NULL); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* + * For now we have a simple linked list of unattached undo logs for each + * persistence level. We'll grovel though it to find something for the + * tablespace you asked for. If you're not using multiple tablespaces + * it'll be able to pop one off the front. We might need a hash table + * keyed by tablespace if this simple scheme turns out to be too slow when + * using many tablespaces and many undo logs, but that seems like an + * unusual use case not worth optimizing for. + */ + place = &UndoLogShared->free_lists[category]; + while (*place != InvalidUndoLogNumber) + { + UndoLogSlot *candidate = find_undo_log_slot(*place, true); + + /* + * There should never be an undo log on the freelist that has been + * entirely discarded, or hasn't been created yet. The persistence + * level should match the freelist. 
+ */ + if (unlikely(candidate == NULL)) + elog(ERROR, + "corrupted undo log freelist, no such undo log %u", *place); + if (unlikely(candidate->meta.category != category)) + elog(ERROR, + "corrupted undo log freelist, undo log %u with persistence %d found on freelist %d", + *place, candidate->meta.category, category); + + if (candidate->meta.tablespace == tablespace) + { + logno = *place; + slot = candidate; + *place = candidate->next_free; + break; + } + place = &candidate->next_free; + } + + /* + * If all existing undo logs for this tablespace and persistence level are + * busy, we'll have to create a new one. + */ + if (slot == NULL) + { + if (UndoLogShared->next_logno > MaxUndoLogNumber) + { + /* + * You've used up all 16 exabytes of undo log addressing space. + * This is a difficult state to reach using only 16 exabytes of + * WAL. + */ + elog(ERROR, "undo log address space exhausted"); + } + + /* Allocate a slot from the UndoLogSlot pool. */ + slot = allocate_undo_log_slot(); + if (unlikely(!slot)) + ereport(ERROR, + (errmsg("could not create new undo log"), + errdetail("The maximum number of active undo logs is %zu.", + UndoLogNumSlots()), + errhint("Consider increasing max_connections."))); + slot->logno = logno = UndoLogShared->next_logno; + + /* + * The insert and discard pointers start after the first block's + * header. XXX That means that insert is > end for a short time in a + * newly created undo log. Is there any problem with that? + */ + slot->meta.unlogged.insert = UndoLogBlockHeaderSize; + slot->meta.discard = UndoLogBlockHeaderSize; + + slot->meta.logno = logno; + slot->meta.tablespace = tablespace; + slot->meta.category = category; + slot->meta.status = UNDO_LOG_STATUS_ACTIVE; + + /* Move the high log number pointer past this one. */ + ++UndoLogShared->next_logno; + + /* WAL-log the creation of this new undo log. */ + { + xl_undolog_create xlrec; + + xlrec.logno = logno; + xlrec.tablespace = slot->meta.tablespace; + xlrec.category = slot->meta.category; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_CREATE); + } + + /* + * This undo log has no segments. UndoLogAllocate will create the + * first one on demand. + */ + } + LWLockRelease(UndoLogLock); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = MyProcPid; + LWLockRelease(&slot->mutex); + + CurrentSession->attached_undo_slots[category] = slot; +} + +/* check_hook: validate new undo_tablespaces */ +bool +check_undo_tablespaces(char **newval, void **extra, GucSource source) +{ + char *rawname; + List *namelist; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* + * Parse string into list of identifiers, just to check for + * well-formedness (unfortunateley we can't validate the names in the + * catalog yet). + */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + { + /* syntax error in name list */ + GUC_check_errdetail("List syntax is invalid."); + pfree(rawname); + list_free(namelist); + return false; + } + + /* + * Make sure we aren't already in a transaction that has been assigned an + * XID. This ensures we don't detach from an undo log that we might have + * started writing undo data into for this transaction. 
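+ *
+ * For example (hypothetical tablespace names), a session might run
+ * SET undo_tablespaces = 'undo_ts1, undo_ts2'; that is rejected here if
+ * the current transaction already has an xid, and otherwise only affects
+ * undo logs chosen after this point.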
+ */ + if (GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("undo_tablespaces cannot be changed while a transaction is in progress")))); + list_free(namelist); + + return true; +} + +/* assign_hook: do extra actions as needed */ +void +assign_undo_tablespaces(const char *newval, void *extra) +{ + /* + * This is normally called only when GetTopTransactionIdIfAny() == + * InvalidTransactionId (because you can't change undo_tablespaces in the + * middle of a transaction that's been assigned an xid), but we can't + * assert that because it's also called at the end of a transaction that's + * rolling back, to reset the GUC if it was set inside the transaction. + */ + + /* Tell UndoLogAllocate() to reexamine undo_tablespaces. */ + if (CurrentSession) + CurrentSession->need_to_choose_undo_tablespace = true; +} + +static bool +choose_undo_tablespace(bool force_detach, Oid *tablespace) +{ + char *rawname; + List *namelist; + bool need_to_unlock; + int length; + int i; + + /* We need a modifiable copy of string. */ + rawname = pstrdup(undo_tablespaces); + + /* Break string into list of identifiers. */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + elog(ERROR, "undo_tablespaces is unexpectedly malformed"); + + length = list_length(namelist); + if (length == 0 || + (length == 1 && ((char *) linitial(namelist))[0] == '\0')) + { + /* + * If it's an empty string, then we'll use the default tablespace. No + * locking is required because it can't be dropped. + */ + *tablespace = DEFAULTTABLESPACE_OID; + need_to_unlock = false; + } + else + { + /* + * Choose an OID using our pid, so that if several backends have the + * same multi-tablespace setting they'll spread out. We could easily + * do better than this if more serious load balancing is judged + * useful. + */ + int index = MyProcPid % length; + int first_index = index; + Oid oid = InvalidOid; + + /* + * Take the tablespace create/drop lock while we look the name up. + * This prevents the tablespace from being dropped while we're trying + * to resolve the name, or while the caller is trying to create an + * undo log in it. The caller will have to release this lock. + */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + for (;;) + { + const char *name = list_nth(namelist, index); + + oid = get_tablespace_oid(name, true); + if (oid == InvalidOid) + { + /* Unknown tablespace, try the next one. */ + index = (index + 1) % length; + /* + * But if we've tried them all, it's time to complain. We'll + * arbitrarily complain about the last one we tried in the + * error message. + */ + if (index == first_index) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablespace \"%s\" does not exist", name), + errhint("Create the tablespace or set undo_tablespaces to a valid or empty list."))); + continue; + } + if (oid == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("undo logs cannot be placed in pg_global tablespace"))); + /* If we got here we succeeded in finding one. */ + break; + } + + Assert(oid != InvalidOid); + *tablespace = oid; + need_to_unlock = true; + } + + /* + * If we came here because the user changed undo_tablespaces, then detach + * from any undo logs we happen to be attached to.
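+ * Detaching just clears our pid and xid in each attached slot and pushes
+ * the log back onto its category's freelist; the next undo allocation
+ * will then attach to a log in one of the newly configured tablespaces.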
+ */ + if (force_detach) + { + for (i = 0; i < UndoLogCategories; ++i) + { + UndoLogSlot *slot = CurrentSession->attached_undo_slots[i]; + + if (slot != NULL) + { + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = InvalidPid; + slot->meta.unlogged.xid = InvalidTransactionId; + LWLockRelease(&slot->mutex); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[i]; + UndoLogShared->free_lists[i] = slot->logno; + LWLockRelease(UndoLogLock); + + CurrentSession->attached_undo_slots[i] = NULL; + } + } + } + + return need_to_unlock; +} + +bool +DropUndoLogsInTablespace(Oid tablespace) +{ + DIR *dir; + char undo_path[MAXPGPATH]; + UndoLogSlot *slot = NULL; + int i; + + Assert(LWLockHeldByMe(TablespaceCreateLock)); + Assert(tablespace != DEFAULTTABLESPACE_OID); + + /* First, try to kick everyone off any undo logs in this tablespace. */ + while ((slot = UndoLogNextSlot(slot))) + { + bool ok; + bool return_to_freelist = false; + + /* Skip undo logs in other tablespaces. */ + if (slot->meta.tablespace != tablespace) + continue; + + /* Check if this undo log can be forcibly detached. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (slot->meta.discard == slot->meta.unlogged.insert && + (slot->meta.unlogged.xid == InvalidTransactionId || + !TransactionIdIsInProgress(slot->meta.unlogged.xid))) + { + slot->meta.unlogged.xid = InvalidTransactionId; + if (slot->pid != InvalidPid) + { + slot->pid = InvalidPid; + return_to_freelist = true; + } + ok = true; + } + else + { + /* + * There is data we need in this undo log. We can't force it to + * be detached. + */ + ok = false; + } + LWLockRelease(&slot->mutex); + + /* If we failed, then give up now and report failure. */ + if (!ok) + return false; + + /* + * Put this undo log back on the appropriate free-list. No one can + * attach to it while we hold TablespaceCreateLock, but if we return + * earlier in a future go around this loop, we need the undo log to + * remain usable. We'll remove all appropriate logs from the + * free-lists in a separate step below. + */ + if (return_to_freelist) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[slot->meta.category]; + UndoLogShared->free_lists[slot->meta.category] = slot->logno; + LWLockRelease(UndoLogLock); + } + } + + /* + * We detached all backends from undo logs in this tablespace, and no one + * can attach to any non-default-tablespace undo logs while we hold + * TablespaceCreateLock. We can now drop the undo logs. + */ + slot = NULL; + while ((slot = UndoLogNextSlot(slot))) + { + /* Skip undo logs in other tablespaces. */ + if (slot->meta.tablespace != tablespace) + continue; + + /* + * Make sure no buffers remain. When that is done by + * UndoLogDiscard(), the final page is left in shared_buffers because + * it may contain data, or at least be needed again very soon. Here + * we need to drop even that page from the buffer pool. + */ + forget_undo_buffers(slot->logno, slot->meta.discard, slot->meta.discard, true); + + /* + * TODO: For now we drop the undo log, meaning that it will never be + * used again. That wastes the rest of its address space. Instead, + * we should put it onto a special list of 'offline' undo logs, ready + * to be reactivated in some other tablespace. Then we can keep the + * unused portion of its address space. 
+ */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.status = UNDO_LOG_STATUS_DISCARDED; + LWLockRelease(&slot->mutex); + } + + /* Forget about all sync requests relating to this tablespace. */ + undofile_forget_sync_tablespace(tablespace); + + /* Unlink all undo segment files in this tablespace. */ + UndoLogDirectory(tablespace, undo_path); + + dir = AllocateDir(undo_path); + if (dir != NULL) + { + struct dirent *de; + + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + } + + /* Remove all dropped undo logs from the free-lists. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + for (i = 0; i < UndoLogCategories; ++i) + { + UndoLogSlot *slot; + UndoLogNumber *place; + + place = &UndoLogShared->free_lists[i]; + while (*place != InvalidUndoLogNumber) + { + slot = find_undo_log_slot(*place, true); + if (!slot) + elog(ERROR, + "corrupted undo log freelist, unknown log %u", *place); + if (slot->meta.status == UNDO_LOG_STATUS_DISCARDED) + *place = slot->next_free; + else + place = &slot->next_free; + } + } + LWLockRelease(UndoLogLock); + + return true; +} + +void +ResetUndoLogs(UndoLogCategory category) +{ + UndoLogSlot *slot = NULL; + + while ((slot = UndoLogNextSlot(slot))) + { + DIR *dir; + struct dirent *de; + char undo_path[MAXPGPATH]; + char segment_prefix[MAXPGPATH]; + size_t segment_prefix_size; + + if (slot->meta.category != category) + continue; + + /* Scan the directory for files belonging to this undo log. */ + snprintf(segment_prefix, sizeof(segment_prefix), "%06X.", slot->logno); + segment_prefix_size = strlen(segment_prefix); + UndoLogDirectory(slot->meta.tablespace, undo_path); + dir = AllocateDir(undo_path); + if (dir == NULL) + continue; + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strncmp(de->d_name, segment_prefix, segment_prefix_size) != 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + elog(DEBUG1, "unlinked undo segment \"%s\"", segment_path); + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + + /* + * We have no segment files. Set the pointers to indicate that there + * is no data. The discard and insert pointers point to the first + * usable byte in the segment we will create when we next try to + * allocate. This is a bit strange, because it means that they are + * past the end pointer. That's the same as when new undo logs are + * created. + * + * TODO: Should we rewind to zero instead, so we can reuse that (now) + * unreferenced address space? 
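+ *
+ * For example (illustrative numbers): if meta.end is 0x200000 because two
+ * 1MB segments had been created, discard and insert both become
+ * 0x200000 + UndoLogBlockHeaderSize, an offset in a segment that does not
+ * exist yet.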
+ */ + slot->meta.unlogged.insert = slot->meta.discard = slot->meta.end + + UndoLogBlockHeaderSize; + } +} + +Datum +pg_stat_get_undo_logs(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_UNDO_LOGS_COLS 9 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + char *tablespace_name = NULL; + Oid last_tablespace = InvalidOid; + int i; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* Scan all undo logs to build the results. */ + for (i = 0; i < UndoLogShared->nslots; ++i) + { + UndoLogSlot *slot = &UndoLogShared->slots[i]; + char buffer[17]; + Datum values[PG_STAT_GET_UNDO_LOGS_COLS]; + bool nulls[PG_STAT_GET_UNDO_LOGS_COLS] = { false }; + Oid tablespace; + + /* + * This won't be a consistent result overall, but the values for each + * log will be consistent because we'll take the per-log lock while + * copying them. + */ + LWLockAcquire(&slot->mutex, LW_SHARED); + + /* Skip unused slots and entirely discarded undo logs. */ + if (slot->logno == InvalidUndoLogNumber || + slot->meta.status == UNDO_LOG_STATUS_DISCARDED) + { + LWLockRelease(&slot->mutex); + continue; + } + + values[0] = ObjectIdGetDatum((Oid) slot->logno); + values[1] = CStringGetTextDatum( + slot->meta.category == UNDO_PERMANENT ? "permanent" : + slot->meta.category == UNDO_UNLOGGED ? "unlogged" : + slot->meta.category == UNDO_TEMP ? "temporary" : + slot->meta.category == UNDO_SHARED ? "shared" : ""); + tablespace = slot->meta.tablespace; + + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.discard)); + values[3] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert)); + values[4] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.end)); + values[5] = CStringGetTextDatum(buffer); + if (slot->meta.unlogged.xid == InvalidTransactionId) + nulls[6] = true; + else + values[6] = TransactionIdGetDatum(slot->meta.unlogged.xid); + if (slot->pid == InvalidPid) + nulls[7] = true; + else + values[7] = Int32GetDatum((int32) slot->pid); + switch (slot->meta.status) + { + case UNDO_LOG_STATUS_ACTIVE: + values[8] = CStringGetTextDatum("ACTIVE"); break; + case UNDO_LOG_STATUS_FULL: + values[8] = CStringGetTextDatum("FULL"); break; + default: + nulls[8] = true; + } + LWLockRelease(&slot->mutex); + + /* + * Deal with potentially slow tablespace name lookup without the lock. 
+ * Avoid making multiple calls to that expensive function for the + * common case of repeating tablespace. + */ + if (tablespace != last_tablespace) + { + if (tablespace_name) + pfree(tablespace_name); + tablespace_name = get_tablespace_name(tablespace); + last_tablespace = tablespace; + } + if (tablespace_name) + { + values[2] = CStringGetTextDatum(tablespace_name); + nulls[2] = false; + } + else + nulls[2] = true; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + if (tablespace_name) + pfree(tablespace_name); + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +/* + * replay the creation of a new undo log + */ +static void +undolog_xlog_create(XLogReaderState *record) +{ + xl_undolog_create *xlrec = (xl_undolog_create *) XLogRecGetData(record); + UndoLogSlot *slot; + + /* Create meta-data space in shared memory. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* TODO: assert that it doesn't exist already? */ + + slot = allocate_undo_log_slot(); + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->logno = xlrec->logno; + slot->meta.logno = xlrec->logno; + slot->meta.status = UNDO_LOG_STATUS_ACTIVE; + slot->meta.category = xlrec->category; + slot->meta.tablespace = xlrec->tablespace; + slot->meta.unlogged.insert = UndoLogBlockHeaderSize; + slot->meta.discard = UndoLogBlockHeaderSize; + UndoLogShared->next_logno = Max(xlrec->logno + 1, UndoLogShared->next_logno); + LWLockRelease(&slot->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * replay the addition of a new segment to an undo log + */ +static void +undolog_xlog_extend(XLogReaderState *record) +{ + xl_undolog_extend *xlrec = (xl_undolog_extend *) XLogRecGetData(record); + + /* Extend exactly as we would during DO phase. */ + extend_undo_log(xlrec->logno, xlrec->end); +} + +/* + * Drop all buffers for the given undo log, from the old_discard to up + * new_discard. If drop_tail is true, also drop the buffer that holds + * new_discard; this is used when discarding undo logs completely, for example + * via DROP TABLESPACE. If it is false, then the final buffer is not dropped + * because it may contain data. + * + */ +static void +forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, bool drop_tail) +{ + BlockNumber old_blockno; + BlockNumber new_blockno; + RelFileNode rnode; + + UndoRecPtrAssignRelFileNode(rnode, MakeUndoRecPtr(logno, old_discard)); + old_blockno = old_discard / BLCKSZ; + new_blockno = new_discard / BLCKSZ; + if (drop_tail) + ++new_blockno; + while (old_blockno < new_blockno) + { + ForgetBuffer(rnode, UndoLogForkNum, old_blockno); + ForgetLocalBuffer(rnode, UndoLogForkNum, old_blockno++); + } +} +/* + * replay an undo segment discard record + */ +static void +undolog_xlog_discard(XLogReaderState *record) +{ + xl_undolog_discard *xlrec = (xl_undolog_discard *) XLogRecGetData(record); + UndoLogSlot *slot; + UndoLogOffset discard; + UndoLogOffset end; + UndoLogOffset old_segment_begin; + UndoLogOffset new_segment_begin; + RelFileNode rnode = {0}; + + slot = find_undo_log_slot(xlrec->logno, false); + if (slot == NULL) + elog(ERROR, "unknown undo log %d", xlrec->logno); + + /* + * We're about to discard undologs. In Hot Standby mode, ensure that + * there's no queries running which need to get tuple from discarded undo. + * + * XXX we are passing empty rnode to the conflict function so that it can + * check conflict in all the backend regardless of which database the + * backend is connected. 
+ */ + if (InHotStandby && TransactionIdIsValid(xlrec->latestxid)) + ResolveRecoveryConflictWithSnapshot(xlrec->latestxid, rnode); + + /* + * See if we need to unlink or rename any files, but don't consider it an + * error if we find that files are missing. Since UndoLogDiscard() + * performs filesystem operations before WAL logging or updating shmem + * which could be checkpointed, a crash could have left files already + * deleted, but we could replay WAL that expects the files to be there. + */ + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + Assert(slot->logno == xlrec->logno); + discard = slot->meta.discard; + end = slot->meta.end; + LWLockRelease(&slot->mutex); + + /* Drop buffers before we remove/recycle any files. */ + forget_undo_buffers(xlrec->logno, discard, xlrec->discard, + xlrec->entirely_discarded); + + /* Rewind to the start of the segment. */ + old_segment_begin = discard - discard % UndoLogSegmentSize; + new_segment_begin = xlrec->discard - xlrec->discard % UndoLogSegmentSize; + + /* Unlink or rename segments that are no longer in range. */ + while (old_segment_begin < new_segment_begin) + { + char discard_path[MAXPGPATH]; + + /* Tell the checkpointer that the file is going away. */ + undofile_forget_sync(slot->logno, + old_segment_begin / UndoLogSegmentSize, + slot->meta.tablespace); + + UndoLogSegmentPath(xlrec->logno, old_segment_begin / UndoLogSegmentSize, + slot->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? */ + if (end < xlrec->end) + { + char recycle_path[MAXPGPATH]; + + UndoLogSegmentPath(xlrec->logno, end / UndoLogSegmentSize, + slot->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"", + discard_path, recycle_path); + end += UndoLogSegmentSize; + } + else + { + elog(LOG, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path); + else + elog(LOG, "could not unlink \"%s\": %m", discard_path); + } + old_segment_begin += UndoLogSegmentSize; + } + + /* Create any further new segments that are needed the slow way. */ + while (end < xlrec->end) + { + allocate_empty_undo_segment(xlrec->logno, slot->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + /* Flush the directory entries before next checkpoint. */ + undofile_request_sync_dir(slot->meta.tablespace); + + /* Update shmem. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.discard = xlrec->discard; + slot->meta.end = end; + LWLockRelease(&slot->mutex); + + /* If we discarded everything, the slot can be given up. */ + if (xlrec->entirely_discarded) + free_undo_log_slot(slot); +} + +/* + * replay the switch of a undo log + */ +static void +undolog_xlog_switch(XLogReaderState *record) +{ + xl_undolog_switch *xlrec = (xl_undolog_switch *) XLogRecGetData(record); + UndoLogSlot *slot; + + slot = find_undo_log_slot(xlrec->logno, false); + + /* + * Restore the log switch information in the MyUndoLogState this will be + * reset by following UndoLogAllocateDuringRecovery. 
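+ * (Note: the values are saved in the shared UndoLogSlot's meta-data,
+ * where recovery code can find them.)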
+ */ + slot->meta.unlogged.prevlog_xact_start = xlrec->prevlog_xact_start; + slot->meta.unlogged.prevlog_last_urp = xlrec->prevlog_last_urp; +} + +void +undolog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_UNDOLOG_CREATE: + undolog_xlog_create(record); + break; + case XLOG_UNDOLOG_EXTEND: + undolog_xlog_extend(record); + break; + case XLOG_UNDOLOG_DISCARD: + undolog_xlog_discard(record); + break; + case XLOG_UNDOLOG_SWITCH: + undolog_xlog_switch(record); + break; + default: + elog(PANIC, "undolog_redo: unknown op code %u", info); + } +} + +/* + * For assertions only. + */ +bool +AmAttachedToUndoLogSlot(UndoLogSlot *slot) +{ + /* + * In general, we can't access log's members without locking. But this + * function is intended only for asserting that you are attached, and + * while you're attached the slot can't be recycled, so don't bother + * locking. + */ + return CurrentSession->attached_undo_slots[slot->meta.category] == slot; +} + +/* + * For testing use only. This function is only used by the test_undo module. + */ +void +UndoLogDetachFull(void) +{ + int i; + + for (i = 0; i < UndoLogCategories; ++i) + if (CurrentSession->attached_undo_slots[i]) + detach_current_undo_log(i, true); +} diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 9238fbe98d..4671838dd2 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -20,6 +20,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/session.h" #include "access/tableam.h" #include "access/xact.h" #include "access/xlog_internal.h" @@ -519,6 +520,8 @@ BootstrapModeMain(void) InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false); + InitializeSession(); + /* Initialize stuff for bootstrap-file processing */ for (i = 0; i < MAXATTR; i++) { diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ea4c85e395..cbe04e4dcc 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1062,6 +1062,10 @@ GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublicatio ON pg_subscription TO public; +CREATE VIEW pg_stat_undo_logs AS + SELECT * + FROM pg_stat_get_undo_logs(); + -- -- We have a few function definitions in here, too. -- At some point there might be enough to justify breaking them out into diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 84efb414d8..17c28b2c66 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -55,6 +55,7 @@ #include "access/htup_details.h" #include "access/sysattr.h" #include "access/tableam.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -496,6 +497,20 @@ DropTableSpace(DropTableSpaceStmt *stmt) */ LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + /* + * Drop the undo logs in this tablespace. This will fail (without + * dropping anything) if there are undo logs that we can't afford to drop + * because they contain non-discarded data or a transaction is in + * progress. Since we hold TablespaceCreateLock, no other session will be + * able to attach to an undo log in this tablespace (or any tablespace + * except default) concurrently.
+ */ + if (!DropUndoLogsInTablespace(tablespaceoid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace \"%s\" cannot be dropped because it contains non-empty undo logs", + tablespacename))); + /* * Try to remove the physical infrastructure. */ @@ -1517,6 +1532,14 @@ tblspc_redo(XLogReaderState *record) { xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record); + /* This shouldn't be able to fail in recovery. */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + if (!DropUndoLogsInTablespace(xlrec->ts_id)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace cannot be dropped because it contains non-empty undo logs"))); + LWLockRelease(TablespaceCreateLock); + /* * If we issued a WAL record for a drop tablespace it implies that * there were no files in it at all when the DROP was done. That means diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index b4f2b28b51..c742861dae 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4070,6 +4070,27 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_UNDO_CHECKPOINT_READ: + event_name = "UndoCheckpointRead"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_WRITE: + event_name = "UndoCheckpointWrite"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_SYNC: + event_name = "UndoCheckpointSync"; + break; + case WAIT_EVENT_UNDO_FILE_READ: + event_name = "UndoFileRead"; + break; + case WAIT_EVENT_UNDO_FILE_WRITE: + event_name = "UndoFileWrite"; + break; + case WAIT_EVENT_UNDO_FILE_FLUSH: + event_name = "UndoFileFlush"; + break; + case WAIT_EVENT_UNDO_FILE_SYNC: + event_name = "UndoFileSync"; + break; case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index d5f9b617c8..abcb5e5ad2 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1368,7 +1368,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf char *page; size_t pad; PageHeader phdr; - int segmentno = 0; + BlockNumber first_blkno = 0; char *segmentpath; bool verify_checksum = false; @@ -1406,12 +1406,18 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf segmentpath = strstr(filename, "."); if (segmentpath != NULL) { - segmentno = atoi(segmentpath + 1); - if (segmentno == 0) + char *end; + if (strstr(readfilename, "undo")) + first_blkno = strtol(segmentpath + 1, &end, 16) / BLCKSZ; + else + first_blkno = strtol(segmentpath + 1, &end, 10) * RELSEG_SIZE; + if (*end != '\0') ereport(ERROR, - (errmsg("invalid segment number %d in file \"%s\"", - segmentno, filename))); + (errmsg("invalid segment number in file \"%s\"", + filename))); } + else + first_blkno = 0; } } @@ -1451,7 +1457,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf */ if (!PageIsNew(page) && PageGetLSN(page) < startptr) { - checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + checksum = pg_checksum_page((char *) page, blkno + first_blkno); phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 151c3ef882..d3a9c4d64c 100644 --- a/src/backend/replication/logical/decode.c +++ 
b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor case RM_COMMIT_TS_ID: case RM_REPLORIGIN_ID: case RM_GENERIC_ID: + case RM_UNDOLOG_ID: /* just deal with xid, and done */ ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), buf.origptr); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6f3a402854..2b1d60680e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -177,6 +177,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +static void InvalidateBuffer(BufferDesc *buf); /* * Ensure that the PrivateRefCountArray has sufficient space to store one more @@ -620,10 +621,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * - * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * In RBM_ZERO mode, if the page isn't in buffer cache already, it's * filled with zeros instead of reading it from disk. Useful when the caller * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * + * In RBM_ZERO_AND_LOCK mode, the page is zeroed and also locked. * The page is returned locked to ensure that the caller has a chance to * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's @@ -674,24 +677,20 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, /* * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require * a relcache entry for the relation. - * - * NB: At present, this function may only be used on permanent relations, which - * is OK, because we only use it during XLOG replay. If in the future we - * want to use it on temporary or unlogged relations, we could pass additional - * parameters. */ Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, - BufferAccessStrategy strategy) + BufferAccessStrategy strategy, + char relpersistence) { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); - - Assert(InRecovery); + SMgrRelation smgr = smgropen(rnode, + relpersistence == RELPERSISTENCE_TEMP + ? MyBackendId : InvalidBackendId); - return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, + return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum, mode, strategy, &hit); } @@ -885,7 +884,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO || + mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK) MemSet((char *) bufBlock, 0, BLCKSZ); else { @@ -1339,6 +1340,61 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, return buf; } +/* + * ForgetBuffer -- drop a buffer from shared buffers + * + * If the buffer isn't present in shared buffers, nothing happens. 
If it is + * present, it is discarded without making any attempt to write it back out to + * the operating system. The caller must therefore somehow be sure that the + * data won't be needed for anything now or in the future. It assumes that + * there is no concurrent access to the block, except that it might be being + * concurrently written. + */ +void +ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + BufferTag tag; /* identity of target block */ + uint32 hash; /* hash value for tag */ + LWLock *partitionLock; /* buffer partition lock for it */ + int buf_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* see if the block is in the buffer pool */ + LWLockAcquire(partitionLock, LW_SHARED); + buf_id = BufTableLookup(&tag, hash); + LWLockRelease(partitionLock); + + /* didn't find it, so nothing to do */ + if (buf_id < 0) + return; + + /* take the buffer header lock */ + bufHdr = GetBufferDescriptor(buf_id); + buf_state = LockBufHdr(bufHdr); + + /* + * The buffer might been evicted after we released the partition lock and + * before we acquired the buffer header lock. If so, the buffer we've + * locked might contain some other data which we shouldn't touch. If the + * buffer hasn't been recycled, we proceed to invalidate it. + */ + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum == blockNum && + bufHdr->tag.forkNum == forkNum) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); +} + /* * InvalidateBuffer -- mark a shared buffer invalid and return it to the * freelist. diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index f5f6a29222..a6fe3dbf82 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -272,6 +272,49 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, return bufHdr; } +/* + * ForgetLocalBuffer - drop a buffer from local buffers + * + * This is similar to bufmgr.c's ForgetBuffer, except that we do not need + * to do any locking since this is all local. As with that function, this + * must be used very carefully, since we'll cheerfully throw away dirty + * buffers without any attempt to write them. + */ +void +ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, BackendIdForTempRelations()); + BufferTag tag; /* identity of target block */ + LocalBufferLookupEnt *hresult; + BufferDesc *bufHdr; + uint32 buf_state; + + /* + * If somehow this is the first request in the session, there's nothing to + * do. (This probably shouldn't happen, though.) 
+ */ + if (LocalBufHash == NULL) + return; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* see if the block is in the local buffer pool */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, (void *) &tag, HASH_REMOVE, NULL); + + /* didn't find it, so nothing to do */ + if (!hresult) + return; + + /* mark buffer invalid */ + bufHdr = GetLocalBufferDescriptor(hresult->id); + CLEAR_BUFFERTAG(bufHdr->tag); + buf_state = pg_atomic_read_u32(&bufHdr->state); + buf_state &= ~(BM_VALID | BM_TAG_VALID | BM_DIRTY); + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); +} + /* * MarkLocalBufferDirty - * mark a local buffer dirty diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 315c74c745..3c4f0537d4 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -322,7 +322,6 @@ static void pre_sync_fname(const char *fname, bool isdir, int elevel); static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); -static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); static int fsync_parent_path(const char *fname, int elevel); @@ -3345,7 +3344,7 @@ unlink_if_exists_fname(const char *fname, bool isdir, int elevel) * * Returns 0 if the operation succeeded, -1 otherwise. */ -static int +int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) { int fd; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d7d733530f..12c324925c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -21,6 +21,7 @@ #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" +#include "access/undolog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -125,6 +126,7 @@ CreateSharedMemoryAndSemaphores(int port) size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); + size = add_size(size, UndoLogShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); @@ -213,6 +215,7 @@ CreateSharedMemoryAndSemaphores(int port) */ XLOGShmemInit(); CLOGShmemInit(); + UndoLogShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index bc1aa88322..5df658d349 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -522,6 +522,8 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join"); LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact"); + LWLockRegisterTranche(LWTRANCHE_UNDOLOG, "undo_log"); + LWLockRegisterTranche(LWTRANCHE_UNDODISCARD, "undo_discard"); /* Register named tranches. 
*/ for (i = 0; i < NamedLWLockTrancheRequests; i++) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index db47843229..4b42a1cf0b 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -49,3 +49,4 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 CLogTruncationLock 44 +UndoLogLock 45 diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index e486b7c0d1..ff2e5e2db4 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = md.o smgr.o +OBJS = md.o smgr.o undofile.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d00b275e46..93df85cba9 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -17,11 +17,13 @@ */ #include "postgres.h" +#include "catalog/database_internal.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" +#include "storage/undofile.h" #include "utils/hsearch.h" #include "utils/inval.h" @@ -81,6 +83,24 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, + }, + /* undo logs */ + { + .smgr_init = undofile_init, + .smgr_shutdown = undofile_shutdown, + .smgr_open = undofile_open, + .smgr_close = undofile_close, + .smgr_create = undofile_create, + .smgr_exists = undofile_exists, + .smgr_unlink = undofile_unlink, + .smgr_extend = undofile_extend, + .smgr_prefetch = undofile_prefetch, + .smgr_read = undofile_read, + .smgr_write = undofile_write, + .smgr_writeback = undofile_writeback, + .smgr_nblocks = undofile_nblocks, + .smgr_truncate = undofile_truncate, + .smgr_immedsync = undofile_immedsync, } }; @@ -105,6 +125,8 @@ smgrwhich(RelFileNode rnode) { switch (rnode.dbNode) { + case UndoDbOid: + return 1; /* undofile.c */ default: return 0; /* md.c */ } @@ -189,6 +211,7 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; reln->smgr_which = smgrwhich(rnode); + reln->private_data = NULL; /* implementation-specific initialization */ smgrsw[reln->smgr_which].smgr_open(reln); diff --git a/src/backend/storage/smgr/undofile.c b/src/backend/storage/smgr/undofile.c new file mode 100644 index 0000000000..04d4514e35 --- /dev/null +++ b/src/backend/storage/smgr/undofile.c @@ -0,0 +1,422 @@ +/* + * undofile.h + * + * PostgreSQL undo file manager. This module provides SMGR-compatible + * interface to the files that back undo logs on the filesystem, so that undo + * log data can use the shared buffer pool. Other aspects of undo log + * management are provided by undolog.c, so the SMGR interfaces not directly + * concerned with reading, writing and flushing data are unimplemented. 
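+ *
+ * Block numbers map onto segment files in the obvious way (see
+ * undofile_read() and undofile_write() below):
+ *
+ *     segno  = blocknum / UNDOSEG_SIZE
+ *     offset = (blocknum % UNDOSEG_SIZE) * BLCKSZ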
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/storage/smgr/undofile.c + */ + +#include "postgres.h" + +#include "access/undolog.h" +#include "access/xlog.h" +#include "catalog/database_internal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/fd.h" +#include "storage/smgr.h" +#include "storage/undofile.h" +#include "utils/memutils.h" + +/* Populate a file tag describing an undo segment file. */ +#define INIT_UNDOFILETAG(a,xx_logno,xx_tbspc,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_UNDO, \ + (a).rnode.dbNode = UndoDbOid, \ + (a).rnode.spcNode = (xx_tbspc), \ + (a).rnode.relNode = (xx_logno), \ + (a).segno = (xx_segno) \ +) + +/* + * While md.c expects random access and has a small number of huge + * segments, undofile.c manages a potentially very large number of smaller + * segments and has a less random access pattern. Therefore, instead of + * keeping a potentially huge array of vfds we'll just keep the most + * recently accessed N. + * + * For now, N == 1, so we just need to hold onto one 'File' handle. + */ +typedef struct UndoFileState +{ + int mru_segno; + File mru_file; +} UndoFileState; + +static MemoryContext UndoFileCxt; + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, + BlockNumber segno, bool missing_ok); +static File undofile_get_segment_file(SMgrRelation reln, BlockNumber segno); + +void +undofile_init(void) +{ + UndoFileCxt = AllocSetContextCreate(TopMemoryContext, + "UndoFileSmgr", + ALLOCSET_DEFAULT_SIZES); +} + +void +undofile_shutdown(void) +{ +} + +void +undofile_open(SMgrRelation reln) +{ + UndoFileState *state; + + state = MemoryContextAllocZero(UndoFileCxt, sizeof(UndoFileState)); + reln->private_data = state; +} + +void +undofile_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +void +undofile_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + /* + * File creation is managed by undolog.c, but xlogutils.c likes to call + * this just in case. Ignore. 
+ */ +} + +bool +undofile_exists(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_exists is not supported"); + + return false; /* not reached */ +} + +void +undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_unlink is not supported"); +} + +void +undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + elog(ERROR, "undofile_extend is not supported"); +} + +void +undofile_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + elog(ERROR, "undofile_prefetch is not supported"); +} + +void +undofile_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileRead(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_READ); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } +} + +void +undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileWrite(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_WRITE); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + /* + * short write: unexpected, because this should be overwriting an + * entirely pre-allocated segment file + */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } + + /* Tell checkpointer this file is dirty. */ + if (!skipFsync && !SmgrIsTemp(reln)) + { + undofile_request_sync(reln->smgr_rnode.node.relNode, + blocknum / UNDOSEG_SIZE, + reln->smgr_rnode.node.spcNode); + } +} + +void +undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + while (nblocks > 0) + { + File file; + int nflush; + + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + + /* compute number of desired writes within the current segment */ + nflush = Min(nblocks, + 1 + UNDOSEG_SIZE - (blocknum % UNDOSEG_SIZE)); + + FileWriteback(file, + (blocknum % UNDOSEG_SIZE) * BLCKSZ, + nflush * BLCKSZ, WAIT_EVENT_UNDO_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +BlockNumber +undofile_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * xlogutils.c likes to call this to decide whether to read or extend; for + * now we lie and say the relation is big as possible. 
+ */ + return UndoLogMaxSize / BLCKSZ; +} + +void +undofile_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + elog(ERROR, "undofile_truncate is not supported"); +} + +void +undofile_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_immedsync is not supported"); +} + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, + BlockNumber segno, bool missing_ok) +{ + File file; + char path[MAXPGPATH]; + + UndoLogSegmentPath(relNode, segno, spcNode, path); + file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + + if (file <= 0 && (!missing_ok || errno != ENOENT)) + elog(ERROR, "cannot open undo segment file '%s': %m", path); + + return file; +} + +/* + * Get a File for a particular segment of a SMgrRelation representing an undo + * log. + */ +static File undofile_get_segment_file(SMgrRelation reln, BlockNumber segno) +{ + UndoFileState *state = (UndoFileState *) reln->private_data; + + /* If we have a file open already, check if we need to close it. */ + if (state->mru_file > 0 && state->mru_segno != segno) + { + /* These are not the blocks we're looking for. */ + FileClose(state->mru_file); + state->mru_file = 0; + } + + /* Check if we need to open a new file. */ + if (state->mru_file <= 0) + { + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, InRecovery); + if (InRecovery && state->mru_file <= 0) + { + /* + * If in recovery, we may be trying to access a file that will + * later be unlinked. Tolerate missing files, creating a new + * zero-filled file as required. + */ + UndoLogNewSegment(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno); + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, false); + Assert(state->mru_file > 0); + } + state->mru_segno = segno; + } + + return state->mru_file; +} + +/* + * Callback to handle a queued sync request. + */ +int +undofile_syncfiletag(const FileTag *tag, char *path) +{ + SMgrRelation reln = smgropen(tag->rnode, InvalidBackendId); + File file; + + if (tag->rnode.relNode == (Oid) InvalidUndoLogNumber) + { + /* Sync parent directory for this tablespace. */ + UndoLogDirectory(tag->rnode.spcNode, path); + + /* The caller (sync.c) will do appropriate error reporting. */ + return fsync_fname_ext(path, true, false, WARNING); + } + else + { + /* Sync a segment file. */ + UndoLogSegmentPath(tag->rnode.relNode, tag->segno, tag->rnode.spcNode, + path); + + file = undofile_get_segment_file(reln, tag->segno); + if (file <= 0) + { + /* errno set by undofile_get_segment_file() */ + return -1; + } + + return FileSync(file, WAIT_EVENT_UNDO_FILE_SYNC); + } +} + +/* + * Filtering callback used by SYNC_FILTER_REQUEST to forget some requests. + */ +bool +undofile_filetagmatches(const FileTag *tag, const FileTag *candidate) +{ + /* + * We use SYNC_FILTER_REQUEST to forget requests for a given tablespace, + * before removing all undo files in the tablespace. + */ + return tag->rnode.spcNode == candidate->rnode.spcNode; +} + +/* + * Tell the checkpointer to sync a segment file. + */ +void +undofile_request_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace) +{ + char path[MAXPGPATH]; + FileTag tag; + + INIT_UNDOFILETAG(tag, logno, tablespace, segno); + + /* Try to send to the checkpointer, but if out of space, do it here. 
*/ + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + if (undofile_syncfiletag(&tag, path) < 0) + ereport(data_sync_elevel(ERROR), + (errmsg("could not fsync file \"%s\": %m", path))); + } +} + +/* + * Tell the checkpointer to forget about any sync requests for a given segment + * file, because it's about to go away. + */ +void +undofile_forget_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace) +{ + FileTag tag; + + INIT_UNDOFILETAG(tag, logno, tablespace, segno); + + /* Send, and keep retrying if out of space. */ + (void) RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); +} + +/* + * Tell the checkpointer to fsync the undo directory in a given tablespace, + * because we have created or renamed files inside it. + */ +void +undofile_request_sync_dir(Oid tablespace) +{ + char path[MAXPGPATH]; + FileTag tag; + + /* We use a special logno and segno to mean "the directory". */ + INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace, + InvalidBlockNumber); + + /* Try to send to the checkpointer, but if out of space, do it here. */ + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + if (undofile_syncfiletag(&tag, path) < 0) + ereport(data_sync_elevel(ERROR), + (errmsg("could not fsync directory \"%s\": %m", path))); + } +} + +/* + * Tell the checkpointer to forget about all sync requests for a given + * tablespace, because it's about to go away. + */ +void +undofile_forget_sync_tablespace(Oid tablespace) +{ + FileTag tag; + + INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace, + InvalidBlockNumber); + + /* + * Tell checkpointer to forget about any request for this tag, and keep + * waiting if there is not enough space. + */ + (void) RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true); +} diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index f329c3fd66..db7b417d83 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -28,6 +28,7 @@ #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" +#include "storage/undofile.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/inval.h" @@ -96,6 +97,11 @@ static const SyncOps syncsw[] = { .sync_syncfiletag = mdsyncfiletag, .sync_unlinkfiletag = mdunlinkfiletag, .sync_filetagmatches = mdfiletagmatches + }, + /* undo log segment files */ + { + .sync_syncfiletag = undofile_syncfiletag, + .sync_filetagmatches = undofile_filetagmatches } }; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 43b9f17f72..5d9af89036 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -25,6 +25,7 @@ #include "access/session.h" #include "access/sysattr.h" #include "access/tableam.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" @@ -561,6 +562,7 @@ BaseInit(void) InitSync(); smgrinit(); InitBufferPoolAccess(); + UndoLogInit(); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fc463601ff..296fb77166 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -121,6 +121,7 @@ extern int CommitDelay; extern int CommitSiblings; extern char *default_tablespace; extern char *temp_tablespaces; +extern char *undo_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; @@ -3667,6 +3668,17 @@ static struct config_string ConfigureNamesString[] = check_temp_tablespaces, assign_temp_tablespaces, NULL }, + { + {"undo_tablespaces", 
PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Sets the tablespace(s) to use for undo logs."), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &undo_tablespaces, + "", + check_undo_tablespaces, assign_undo_tablespaces, NULL + }, + { {"dynamic_library_path", PGC_SUSET, CLIENT_CONN_OTHER, gettext_noop("Sets the path for dynamically loadable modules."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 04d77ad700..bbcbdfdb87 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -210,11 +210,13 @@ static const char *const subdirs[] = { "pg_snapshots", "pg_subtrans", "pg_twophase", + "pg_undo", "pg_multixact", "pg_multixact/members", "pg_multixact/offsets", "base", "base/1", + "base/undo", "pg_replslot", "pg_tblspc", "pg_stat", diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 8c00ec9a3b..8e83be9d52 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -167,7 +167,7 @@ skipfile(const char *fn) } static void -scan_file(const char *fn, BlockNumber segmentno) +scan_file(const char *fn, BlockNumber first_blkno) { PGAlignedBlock buf; PageHeader header = (PageHeader) buf.data; @@ -208,7 +208,7 @@ scan_file(const char *fn, BlockNumber segmentno) if (PageIsNew(header)) continue; - csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE); + csum = pg_checksum_page(buf.data, blockno + first_blkno); current_size += r; if (mode == PG_MODE_CHECK) { @@ -310,7 +310,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) char fnonly[MAXPGPATH]; char *forkpath, *segmentpath; - BlockNumber segmentno = 0; + BlockNumber first_blkno; if (skipfile(de->d_name)) continue; @@ -325,15 +325,22 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) segmentpath = strchr(fnonly, '.'); if (segmentpath != NULL) { + char *end; + *segmentpath++ = '\0'; - segmentno = atoi(segmentpath); - if (segmentno == 0) + if (strstr(segmentpath, "undo")) + first_blkno = strtol(segmentpath, &end, 16) / BLCKSZ; + else + first_blkno = strtol(segmentpath, &end, 10) * RELSEG_SIZE; + if (*end != '\0') { - pg_log_error("invalid segment number %d in file name \"%s\"", - segmentno, fn); + pg_log_error("invalid segment number in file name \"%s\"", + fn); exit(1); } } + else + first_blkno = 0; forkpath = strchr(fnonly, '_'); if (forkpath != NULL) @@ -350,7 +357,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) * the items in the data folder. */ if (!sizeonly) - scan_file(fn, segmentno); + scan_file(fn, first_blkno); } #ifndef WIN32 else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index ff0f8ea5e7..ad96fe75af 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -80,6 +80,7 @@ static bool ReadControlFile(void); static void GuessControlValues(void); static void PrintControlValues(bool guessed); static void PrintNewControlValues(void); +static void AdjustRedoLocation(const char *DataDir); static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); @@ -510,6 +511,7 @@ main(int argc, char *argv[]) /* * Else, do the dirty deed. */ + AdjustRedoLocation(DataDir); RewriteControlFile(); KillExistingXLOG(); KillExistingArchiveStatus(); @@ -889,6 +891,80 @@ PrintNewControlValues(void) } +/* + * Compute the new redo, and move the pg_undo file to match if necessary. 
+ * Rather than renaming it, we'll create a new copy, so that a failure that + * occurs before the controlfile is rewritten won't be fatal. + */ +static void +AdjustRedoLocation(const char *DataDir) +{ + uint64 old_redo = ControlFile.checkPointCopy.redo; + char old_pg_undo_path[MAXPGPATH]; + char new_pg_undo_path[MAXPGPATH]; + int old_fd; + int new_fd; + ssize_t nread; + ssize_t nwritten; + char buffer[1024]; + + /* + * Adjust fields as needed to force an empty XLOG starting at + * newXlogSegNo. + */ + XLogSegNoOffsetToRecPtr(newXlogSegNo, SizeOfXLogLongPHD, WalSegSz, + ControlFile.checkPointCopy.redo); + + /* If the redo location didn't move, we don't need to do anything. */ + if (old_redo == ControlFile.checkPointCopy.redo) + return; + + /* + * Otherwise we copy the pg_undo file, because its name must match the redo + * location. + */ + snprintf(old_pg_undo_path, + sizeof(old_pg_undo_path), + "pg_undo/%016" INT64_MODIFIER "X", + old_redo); + snprintf(new_pg_undo_path, + sizeof(new_pg_undo_path), + "pg_undo/%016" INT64_MODIFIER "X", + ControlFile.checkPointCopy.redo); + old_fd = open(old_pg_undo_path, O_RDONLY, 0); + if (old_fd < 0) + { + pg_log_error("could not open \"%s\": %m", old_pg_undo_path); + exit(1); + } + new_fd = open(new_pg_undo_path, O_RDWR | O_CREAT, 0644); + if (new_fd < 0) + { + pg_log_error("could not create \"%s\": %m", new_pg_undo_path); + exit(1); + } + while ((nread = read(old_fd, buffer, sizeof(buffer))) > 0) + { + do + { + nwritten = write(new_fd, buffer, nread); + if (nwritten < 0) + { + pg_log_error("could not write to \"%s\": %m", new_pg_undo_path); + exit(1); + } + nread -= nwritten; + } while (nread > 0); + } + if (nread < 0) + { + pg_log_error("could not read from \"%s\": %m", old_pg_undo_path); + exit(1); + } + close(old_fd); + close(new_fd); +} + /* * Write out the new pg_control file. 
*/ diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 2cb829acd5..82c80336c5 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -9,7 +9,7 @@ include $(top_builddir)/src/Makefile.global OBJS = check.o controldata.o dump.o exec.o file.o function.o info.o \ option.o parallel.o pg_upgrade.o relfilenode.o server.o \ - tablespace.o util.o version.o $(WIN32RES) + tablespace.o undo.o util.o version.o $(WIN32RES) override CPPFLAGS := -DDLSUFFIX=\"$(DLSUFFIX)\" -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 617270f101..4a431dce83 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -21,6 +21,7 @@ static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb); static bool equivalent_locale(int category, const char *loca, const char *locb); static void check_is_install_user(ClusterInfo *cluster); static void check_proper_datallowconn(ClusterInfo *cluster); +static void check_for_undo_data(ClusterInfo *cluster); static void check_for_prepared_transactions(ClusterInfo *cluster); static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); static void check_for_tables_with_oids(ClusterInfo *cluster); @@ -97,6 +98,7 @@ check_and_dump_old_cluster(bool live_check) */ check_is_install_user(&old_cluster); check_proper_datallowconn(&old_cluster); + check_for_undo_data(&old_cluster); check_for_prepared_transactions(&old_cluster); check_for_reg_data_type_usage(&old_cluster); check_for_isn_and_int8_passing_mismatch(&old_cluster); @@ -171,6 +173,7 @@ check_new_cluster(void) check_is_install_user(&new_cluster); + check_for_undo_data(&new_cluster); check_for_prepared_transactions(&new_cluster); } @@ -800,6 +803,46 @@ check_for_prepared_transactions(ClusterInfo *cluster) } +/* + * check_for_live_undo_data() + * + * Make sure there are no live undo records (aborted transactions that have + * not been rolled back, or committed transactions whose undo data has not + * yet been discarded). 
+ */ +static void +check_for_undo_data(ClusterInfo *cluster) +{ + PGresult *res; + PGconn *conn; + + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1300) + return; + + conn = connectToServer(cluster, "template1"); + prep_status("Checking for undo data"); + + res = executeQueryOrDie(conn, + "SELECT * " + "FROM pg_catalog.pg_stat_undo_logs " + "WHERE discard != insert"); + + if (PQntuples(res) != 0) + { + if (cluster == &old_cluster) + pg_fatal("The source cluster contains live undo data\n"); + else + pg_fatal("The target cluster contains live undo data\n"); + } + + PQclear(res); + + PQfinish(conn); + + check_ok(); +} + + /* * check_for_isn_and_int8_passing_mismatch() * diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 38236415be..523e86eeff 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -59,6 +59,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_date_is_int = false; bool got_data_checksum_version = false; bool got_cluster_state = false; + bool got_redo_location = false; char *lc_collate = NULL; char *lc_ctype = NULL; char *lc_monetary = NULL; @@ -485,6 +486,23 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.data_checksum_version = str2uint(p); got_data_checksum_version = true; } + else if ((p = strstr(bufin, "Latest checkpoint's REDO location:")) != NULL) + { + uint32 hi; + uint32 lo; + + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + p++; /* remove ':' char */ + + if (sscanf(p, "%X/%X", &hi, &lo) != 2) + pg_fatal("%d: controldata cannot parse REDO location\n", __LINE__); + cluster->controldata.redo_location = (((uint64) hi) << 32) | lo; + got_redo_location = true; + } } pclose(output); @@ -528,6 +546,13 @@ get_control_data(ClusterInfo *cluster, bool live_check) } } + /* + * If we used pg_resetwal instead of pg_controldata, there is no REDO + * location. 
+ */ + if (!got_redo_location) + cluster->controldata.redo_location = 0; + /* verify that we got all the mandatory pg_control data */ if (!got_xid || !got_oid || !got_multi || diff --git a/src/bin/pg_upgrade/exec.c b/src/bin/pg_upgrade/exec.c index 0363309328..f23451a876 100644 --- a/src/bin/pg_upgrade/exec.c +++ b/src/bin/pg_upgrade/exec.c @@ -351,6 +351,10 @@ check_data_dir(ClusterInfo *cluster) check_single_dir(pg_data, "pg_clog"); else check_single_dir(pg_data, "pg_xact"); + + /* pg_undo is new in v13 */ + if (GET_MAJOR_VERSION(cluster->major_version) >= 1300) + check_single_dir(pg_data, "pg_undo"); } diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index d1975aab2b..09b786f472 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -139,6 +139,8 @@ main(int argc, char **argv) /* New now using xids of the old system */ + merge_undo_logs(); + /* -- NEW -- */ start_postmaster(&new_cluster, true); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 5d31750d86..d0d93c0682 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -227,6 +227,7 @@ typedef struct bool date_is_int; bool float8_pass_by_value; bool data_checksum_version; + uint64 redo_location; } ControlData; /* @@ -465,3 +466,7 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr char *old_pgdata, char *new_pgdata, char *old_tablespace); bool reap_child(bool wait_for_child); + +/* undo.c */ + +void merge_undo_logs(void); diff --git a/src/bin/pg_upgrade/undo.c b/src/bin/pg_upgrade/undo.c new file mode 100644 index 0000000000..48397b0141 --- /dev/null +++ b/src/bin/pg_upgrade/undo.c @@ -0,0 +1,292 @@ +/* + * undo.c + * + * Support for upgrading undo logs.\ + * Copyright (c) 2019, PostgreSQL Global Development Group + * src/bin/pg_upgrade/undo.c + */ + + +#include "postgres_fe.h" +#include "pg_upgrade.h" +#include "access/undolog.h" + +/* + * The relevant parts of UndoLogMetaDataData, in a version-independent format. + */ +typedef struct +{ + UndoLogNumber logno; + UndoLogOffset discard; + UndoLogStatus status; + UndoLogCategory category; + Oid tablespace; +} UndoLogInfo; + +/* + * Read the header of a pg_undo file and extract basic information. If the + * format of the header changes in later versions, this may need to change + * depending on "cluster". + */ +static void +read_pg_undo_header(int fd, ClusterInfo *cluster, UndoLogNumber *low_logno, + UndoLogNumber *next_logno, UndoLogNumber *num_logs) +{ + pg_crc32c crc; + + /* Read the header, much like StartupUndoLogs(). */ + if (read(fd, low_logno, sizeof(*low_logno)) != sizeof(*low_logno) || + read(fd, next_logno, sizeof(*next_logno)) != sizeof(*next_logno) || + read(fd, num_logs, sizeof(*num_logs)) != sizeof(*num_logs) || + read(fd, &crc, sizeof(crc)) != sizeof(crc)) + pg_fatal("pg_undo file is corrupted or cannot be read\n"); +} + +/* + * Read a single UndoLogMetaData object. If the format changes in later + * versions, this may need to change to be able to read different structs + * depending on "cluster". 
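+ *
+ * For reference, the pg_undo checkpoint file read and rewritten in this
+ * module appears to be laid out as follows (a sketch inferred from the
+ * read and write code here, which works much like StartupUndoLogs() and
+ * CheckPointUndoLogs()):
+ *
+ *     UndoLogNumber   low_logno;      lowest log number not yet entirely
+ *                                     discarded
+ *     UndoLogNumber   next_logno;     next undo log number to allocate
+ *     UndoLogNumber   num_logs;       number of UndoLogMetaData entries
+ *     pg_crc32c       crc;            CRC-32C of the three fields above
+ *     UndoLogMetaData meta[num_logs];
+ *     pg_crc32c       crc;            CRC-32C of the meta[] array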
+ */
+static void
+read_one_undo_log(int fd, ClusterInfo *cluster, UndoLogInfo *info)
+{
+ UndoLogMetaData meta_data;
+ int rc;
+
+ rc = read(fd, &meta_data, sizeof(meta_data));
+ if (rc < 0)
+ pg_fatal("could not read undo log meta-data: %m");
+ else if (rc != sizeof(meta_data))
+ pg_fatal("could not read undo log meta-data: expected %zu bytes but read only %d bytes",
+ sizeof(meta_data), rc);
+
+ info->logno = meta_data.logno;
+ info->category = meta_data.category;
+ info->tablespace = meta_data.tablespace;
+ info->discard = meta_data.discard;
+ info->status = meta_data.status;
+}
+
+static void
+merge_undo_log(UndoLogInfo *logs, UndoLogNumber *num_logs,
+ const UndoLogInfo *info)
+{
+ UndoLogNumber i;
+
+ /* Do we already have an entry for this logno? */
+ for (i = 0; i < *num_logs; ++i)
+ {
+ if (logs[i].logno == info->logno)
+ {
+ /*
+ * Take the highest discard offset, so that any pointers that
+ * originated in either cluster appear to be discarded.
+ */
+ if (logs[i].discard < info->discard)
+ logs[i].discard = info->discard;
+
+ /*
+ * Take the highest status so that entirely discarded logs trump
+ * active logs.
+ */
+ StaticAssertStmt(UNDO_LOG_STATUS_ACTIVE < UNDO_LOG_STATUS_FULL,
+ "undo log status out of order");
+ StaticAssertStmt(UNDO_LOG_STATUS_FULL < UNDO_LOG_STATUS_DISCARDED,
+ "undo log status out of order");
+ if (logs[i].status < info->status)
+ logs[i].status = info->status;
+
+ /*
+ * Take the most permanent persistence level. While we could
+ * just convert them all to permanent and it wouldn't hurt, it's
+ * probably a better idea to keep about the same number of each
+ * persistence level, so that a system that has stabilized with
+ * those numbers will continue to be stable after the upgrade (ie
+ * not suddenly need to create more undo logs of different
+ * levels). The most permanent is the best choice, because TEMP
+ * undo logs might be rewound in future.
+ */
+ StaticAssertStmt(UNDO_PERMANENT < UNDO_UNLOGGED,
+ "undo log persistence out of order");
+ StaticAssertStmt(UNDO_UNLOGGED < UNDO_TEMP,
+ "undo log persistence out of order");
+ if (logs[i].category > info->category)
+ logs[i].category = info->category;
+
+ /*
+ * Take the highest tablespace OID. The choice of 'highest' is
+ * arbitrary (we don't really expect the new cluster to have more
+ * than one log), but it seems useful to preserve the distribution
+ * of tablespaces from the old cluster for stability, as above.
+ */
+ if (logs[i].tablespace < info->tablespace)
+ logs[i].tablespace = info->tablespace;
+ return;
+ }
+ }
+
+ /* Otherwise create a new entry. */
+ logs[(*num_logs)++] = *info;
+}
+
+/*
+ * We need to merge the old undo logs and the new undo logs. We know that
+ * there is no live undo data (see check_for_undo_data()), but we need to
+ * make sure that any undo record pointers that exist in the old OR new
+ * cluster appear as discarded. That is, any log numbers that are entirely
+ * discarded in either cluster appear as entirely discarded, and we retain
+ * the higher of the discard pointers in any log that is active. This is
+ * mostly a theoretical concern for now, but perhaps a future release will be
+ * able to create higher undo record pointers during initdb than the old
+ * cluster had, so let's use an algorithm that doesn't make any assumptions
+ * about that.
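+ *
+ * As an illustrative example (made-up numbers): if the old cluster's file
+ * has log 3 with discard = 0x5000 and status ACTIVE, and the new cluster's
+ * file has log 3 with discard = 0x9000 and status DISCARDED, the merged
+ * entry keeps discard = 0x9000 and status DISCARDED, so undo record
+ * pointers into log 3 from either cluster read as discarded afterwards.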
+ */ +void +merge_undo_logs(void) +{ + char old_pg_undo_path[MAXPGPATH]; + char new_pg_undo_path[MAXPGPATH]; + UndoLogInfo *logs; + UndoLogNumber num_logs; + UndoLogNumber num_old_logs; + UndoLogNumber old_low_logno; + UndoLogNumber old_next_logno; + UndoLogNumber num_new_logs; + UndoLogNumber new_low_logno; + UndoLogNumber new_next_logno; + UndoLogNumber i; + int old_fd; + int new_fd; + pg_crc32c crc; + + /* If the old cluster has no undo logs, there is nothing to do */ + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1300) + return; + + /* + * Open the pg_undo files corresponding to the old and new redo locations. + * First, we'll reload pg_controldata output, so that we have up-to-date + * redo locations. + */ + get_control_data(&old_cluster, true); + get_control_data(&new_cluster, true); + snprintf(old_pg_undo_path, + sizeof(old_pg_undo_path), + "%s/pg_undo/%016" INT64_MODIFIER "X", + old_cluster.pgdata, + old_cluster.controldata.redo_location); + snprintf(new_pg_undo_path, + sizeof(new_pg_undo_path), + "%s/pg_undo/%016" INT64_MODIFIER "X", + new_cluster.pgdata, + new_cluster.controldata.redo_location); + old_fd = open(old_pg_undo_path, O_RDONLY, 0); + if (old_fd < 0) + pg_fatal("could not open file \"%s\": %m\n", old_pg_undo_path); + new_fd = open(new_pg_undo_path, O_RDWR, 0); + if (new_fd < 0) + pg_fatal("could not open file \"%s\": %m\n", new_pg_undo_path); + + /* Read the headers */ + read_pg_undo_header(old_fd, &old_cluster, &old_low_logno, &old_next_logno, &num_old_logs); + read_pg_undo_header(new_fd, &new_cluster, &new_low_logno, &new_next_logno, &num_new_logs); + + /* Allocate workspace that is sure to be enough for the merged set */ + logs = malloc(sizeof(*logs) * (num_old_logs + num_new_logs)); + if (logs == NULL) + { + pg_fatal("out of memory\n"); + exit(1); + } + num_logs = 0; + + /* + * Anything below the "low" logno has been entirely discarded, so we'll + * take the higher of the two values. Likewise, the "next" log number to + * allocate should be the higher of the two. 
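+ *
+ * For example (made-up values): old (low = 2, next = 7) merged with
+ * new (low = 1, next = 4) yields (low = 2, next = 7).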
+ */
+ new_low_logno = Max(old_low_logno, new_low_logno);
+ new_next_logno = Max(old_next_logno, new_next_logno);
+
+ /* Merge in the old logs */
+ while (num_old_logs > 0)
+ {
+ UndoLogInfo info;
+
+ read_one_undo_log(old_fd, &old_cluster, &info);
+ merge_undo_log(logs, &num_logs, &info);
+ --num_old_logs;
+ }
+
+ /* Merge in the new logs */
+ while (num_new_logs > 0)
+ {
+ UndoLogInfo info;
+
+ read_one_undo_log(new_fd, &new_cluster, &info);
+ merge_undo_log(logs, &num_logs, &info);
+ --num_new_logs;
+ }
+
+ close(old_fd);
+
+ /* Now write out the new file, much like CheckPointUndoLogs() */
+ if (ftruncate(new_fd, 0) < 0)
+ pg_fatal("could not truncate file \"%s\": %m", new_pg_undo_path);
+ if (lseek(new_fd, 0, SEEK_SET) < 0)
+ pg_fatal("could not seek to start of file \"%s\": %m", new_pg_undo_path);
+
+ /* Compute header checksum */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, &new_low_logno, sizeof(new_low_logno));
+ COMP_CRC32C(crc, &new_next_logno, sizeof(new_next_logno));
+ COMP_CRC32C(crc, &num_logs, sizeof(num_logs));
+ FIN_CRC32C(crc);
+
+ /* Write out the header */
+ if ((write(new_fd, &new_low_logno, sizeof(new_low_logno)) != sizeof(new_low_logno)) ||
+ (write(new_fd, &new_next_logno, sizeof(new_next_logno)) != sizeof(new_next_logno)) ||
+ (write(new_fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) ||
+ (write(new_fd, &crc, sizeof(crc)) != sizeof(crc)))
+ pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path);
+
+ /* Write out the undo logs */
+ INIT_CRC32C(crc);
+ for (i = 0; i < num_logs; ++i)
+ {
+ UndoLogMetaData meta_data;
+ UndoLogInfo *info = &logs[i];
+ UndoLogOffset end;
+
+ memset(&meta_data, 0, sizeof(meta_data));
+ meta_data.logno = info->logno;
+ meta_data.tablespace = info->tablespace;
+ meta_data.category = info->category;
+
+ /*
+ * Round the discard offset up so that it points to the first byte in
+ * a segment, and assign that to all three offsets. That means there
+ * is no logical data, and there are no physical files.
+ */
+ end = ((info->discard + UndoLogSegmentSize - 1) / UndoLogSegmentSize)
+ * UndoLogSegmentSize;
+ meta_data.unlogged.insert = meta_data.discard = meta_data.end = end;
+
+ /*
+ * We have whatever was the highest status (though it probably
+ * wouldn't hurt if we set them all to ACTIVE).
+ */ + meta_data.status = info->status; + + + if (write(new_fd, &meta_data, sizeof(meta_data)) != sizeof(meta_data)) + pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path); + + COMP_CRC32C(crc, &meta_data, sizeof(meta_data)); + } + FIN_CRC32C(crc); + + if (write(new_fd, &crc, sizeof(crc)) != sizeof(crc)) + pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path); + + close(new_fd); + free(logs); +} diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca4b1..938150dd91 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -20,6 +20,7 @@ #include "access/nbtxlog.h" #include "access/rmgr.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3c0db2ccf5..6945e3e950 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_UNDOLOG_ID, "UndoLog", undolog_redo, undolog_desc, undolog_identify, NULL, NULL, NULL) diff --git a/src/include/access/session.h b/src/include/access/session.h index 8fba568029..da0770e614 100644 --- a/src/include/access/session.h +++ b/src/include/access/session.h @@ -17,6 +17,9 @@ /* Avoid including typcache.h */ struct SharedRecordTypmodRegistry; +/* Avoid including undolog.h */ +struct UndoLogSlot; + /* * A struct encapsulating some elements of a user's session. For now this * manages state that applies to parallel query, but it principle it could @@ -27,6 +30,10 @@ typedef struct Session dsm_segment *segment; /* The session-scoped DSM segment. */ dsa_area *area; /* The session-scoped DSA area. */ + /* State managed by undolog.c. */ + struct UndoLogSlot *attached_undo_slots[4]; /* UndoLogCategories */ + bool need_to_choose_undo_tablespace; + /* State managed by typcache.c. */ struct SharedRecordTypmodRegistry *shared_typmod_registry; dshash_table *shared_record_table; diff --git a/src/include/access/undolog.h b/src/include/access/undolog.h new file mode 100644 index 0000000000..c4a6b29ef0 --- /dev/null +++ b/src/include/access/undolog.h @@ -0,0 +1,489 @@ +/*------------------------------------------------------------------------- + * + * undolog.h + * + * PostgreSQL undo log manager. This module is responsible for lifecycle + * management of undo logs and backing files, associating undo logs with + * backends, allocating and managing space within undo logs. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_H +#define UNDOLOG_H + +#include "access/transam.h" +#include "access/xlogreader.h" +#include "catalog/database_internal.h" +#include "catalog/pg_class.h" +#include "common/relpath.h" +#include "storage/bufpage.h" + +#ifndef FRONTEND +#include "storage/lwlock.h" +#endif + +/* The type used to identify an undo log and position within it. */ +typedef uint64 UndoRecPtr; + +/* The type used for undo record lengths. */ +typedef uint16 UndoRecordSize; + +/* Undo log statuses. */ +typedef enum +{ + UNDO_LOG_STATUS_UNUSED = 0, + UNDO_LOG_STATUS_ACTIVE, + UNDO_LOG_STATUS_FULL, + UNDO_LOG_STATUS_DISCARDED +} UndoLogStatus; + +/* + * Undo log categories. These correspond to the different persistence levels + * of relations so that we can discard unlogged and temporary undo data + * wholesale in some circumstance. We also have a separate category for + * 'shared' records that are not associated with a single transactions. Since + * they might live longer than the transaction that created them, and since we + * prefer to avoid interleaving records that don't belong to the same + * transaction, we keep them separate. + */ +typedef enum +{ + UNDO_PERMANENT = 0, + UNDO_UNLOGGED = 1, + UNDO_TEMP = 2, + UNDO_SHARED = 3 +} UndoLogCategory; + +#define UndoLogCategories 4 + +/* + * Convert from relpersistence ('p', 'u', 't') to an UndoLogCategory + * enumerator. + */ +#define UndoLogCategoryForRelPersistence(rp) \ + ((rp) == RELPERSISTENCE_PERMANENT ? UNDO_PERMANENT : \ + (rp) == RELPERSISTENCE_UNLOGGED ? UNDO_UNLOGGED : UNDO_TEMP) + +/* + * Convert from UndoLogCategory to a relpersistence value. There is no + * relpersistence level for UNDO_SHARED, but the only use of this macro is to + * pass a value to ReadBufferWithoutRelcache, which cares only about detecting + * RELPERSISTENCE_TEMP. XXX There must be a better way. + */ +#define RelPersistenceForUndoLogCategory(up) \ + ((up) == UNDO_PERMANENT ? RELPERSISTENCE_PERMANENT : \ + (up) == UNDO_UNLOGGED ? RELPERSISTENCE_UNLOGGED : \ + (up) == UNDO_SHARED ? RELPERSISTENCE_PERMANENT : \ + RELPERSISTENCE_TEMP) + +/* + * Get the appropriate UndoLogCategory value from a Relation. + */ +#define UndoLogCategoryForRelation(rel) \ + (UndoLogCategoryForRelPersistence((rel)->rd_rel->relpersistence)) + +/* Type for offsets within undo logs */ +typedef uint64 UndoLogOffset; + +/* printf-family format string for UndoRecPtr. */ +#define UndoRecPtrFormat "%016" INT64_MODIFIER "X" + +/* printf-family format string for UndoLogOffset. */ +#define UndoLogOffsetFormat UINT64_FORMAT + +/* Number of blocks of BLCKSZ in an undo log segment file. 128 = 1MB. */ +#define UNDOSEG_SIZE 128 + +/* Size of an undo log segment file in bytes. */ +#define UndoLogSegmentSize ((size_t) BLCKSZ * UNDOSEG_SIZE) + +/* The width of an undo log number in bits. 24 allows for 16.7m logs. */ +#define UndoLogNumberBits 24 + +/* The maximum valid undo log number. */ +#define MaxUndoLogNumber ((1 << UndoLogNumberBits) - 1) + +/* The width of an undo log offset in bits. 40 allows for 1TB per log.*/ +#define UndoLogOffsetBits (64 - UndoLogNumberBits) + +/* Special value for undo record pointer which indicates that it is invalid. 
*/ +#define InvalidUndoRecPtr ((UndoRecPtr) 0) + +/* End-of-list value when building linked lists of undo logs. */ +#define InvalidUndoLogNumber -1 + +/* + * The maximum amount of data that can be stored in an undo log. Can be set + * artificially low to test full log behavior. + */ +#define UndoLogMaxSize ((UndoLogOffset) 1 << UndoLogOffsetBits) + +/* Type for numbering undo logs. */ +typedef int UndoLogNumber; + +/* Extract the undo log number from an UndoRecPtr. */ +#define UndoRecPtrGetLogNo(urp) \ + ((urp) >> UndoLogOffsetBits) + +/* Extract the offset from an UndoRecPtr. */ +#define UndoRecPtrGetOffset(urp) \ + ((urp) & ((UINT64CONST(1) << UndoLogOffsetBits) - 1)) + +/* Make an UndoRecPtr from an log number and offset. */ +#define MakeUndoRecPtr(logno, offset) \ + (((uint64) (logno) << UndoLogOffsetBits) | (offset)) + +/* The number of unusable bytes in the header of each block. */ +#define UndoLogBlockHeaderSize SizeOfPageHeaderData + +/* The number of usable bytes we can store per block. */ +#define UndoLogUsableBytesPerPage (BLCKSZ - UndoLogBlockHeaderSize) + +/* Length of undo checkpoint filename */ +#define UNDO_CHECKPOINT_FILENAME_LENGTH 16 + +/* + * UndoRecPtrIsValid + * True iff undoRecPtr is valid. + */ +#define UndoRecPtrIsValid(undoRecPtr) \ + ((bool) ((UndoRecPtr) (undoRecPtr) != InvalidUndoRecPtr)) + +/* Extract the relnode for an undo log. */ +#define UndoRecPtrGetRelNode(urp) \ + UndoRecPtrGetLogNo(urp) + +/* The only valid fork number for undo log buffers. */ +#define UndoLogForkNum MAIN_FORKNUM + +/* Compute the block number that holds a given UndoRecPtr. */ +#define UndoRecPtrGetBlockNum(urp) \ + (UndoRecPtrGetOffset(urp) / BLCKSZ) + +/* Compute the offset of a given UndoRecPtr in the page that holds it. */ +#define UndoRecPtrGetPageOffset(urp) \ + (UndoRecPtrGetOffset(urp) % BLCKSZ) + +/* Compare two undo checkpoint files to find the oldest file. */ +#define UndoCheckPointFilenamePrecedes(file1, file2) \ + (strcmp(file1, file2) < 0) + +/* What is the offset of the i'th non-header byte? */ +#define UndoLogOffsetFromUsableByteNo(i) \ + (((i) / UndoLogUsableBytesPerPage) * BLCKSZ + \ + UndoLogBlockHeaderSize + \ + ((i) % UndoLogUsableBytesPerPage)) + +/* How many non-header bytes are there before a given offset? */ +#define UndoLogOffsetToUsableByteNo(offset) \ + (((offset) % BLCKSZ - UndoLogBlockHeaderSize) + \ + ((offset) / BLCKSZ) * UndoLogUsableBytesPerPage) + +/* Add 'n' usable bytes to offset stepping over headers to find new offset. */ +#define UndoLogOffsetPlusUsableBytes(offset, n) \ + UndoLogOffsetFromUsableByteNo(UndoLogOffsetToUsableByteNo(offset) + (n)) + +/* Populate a RelFileNode from an UndoRecPtr. */ +#define UndoRecPtrAssignRelFileNode(rfn, urp) \ + do \ + { \ + (rfn).spcNode = UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp)); \ + (rfn).dbNode = UndoDbOid; \ + (rfn).relNode = UndoRecPtrGetRelNode(urp); \ + } while (false); + +/* + * Properties of an undo log that don't have explicit WAL records logging + * their changes, to reduce WAL volume. Instead, they change incrementally + * whenever data is inserted as a result of other WAL records. Since the + * values recorded in an online checkpoint may be out of the sync (ie not the + * correct values as at the redo LSN), these are backed up in buffer data on + * first change after each checkpoint. 
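+ *
+ * To make the address arithmetic above concrete, here is a sketch assuming
+ * the default BLCKSZ of 8192 and a 24-byte page header, so that
+ * UndoLogUsableBytesPerPage is 8168 (the values below are illustrative):
+ *
+ *     UndoRecPtr p = MakeUndoRecPtr(5, 0x2040);
+ *
+ *     UndoRecPtrGetLogNo(p)       == 5
+ *     UndoRecPtrGetOffset(p)      == 0x2040
+ *     UndoRecPtrGetBlockNum(p)    == 1            (0x2040 / BLCKSZ)
+ *     UndoRecPtrGetPageOffset(p)  == 64           (0x2040 % BLCKSZ)
+ *     UndoLogOffsetPlusUsableBytes(0x2040, 8168) == 0x4040
+ *
+ * i.e. advancing by one page's worth of usable bytes lands at the same
+ * in-page position in the next block, stepping over that block's header.
+ * A reader might turn p into a buffer tag and pin the underlying block
+ * roughly like this, using the ReadBufferWithoutRelcache() variant that
+ * takes a relpersistence argument (see the bufmgr.h change in this patch);
+ * here cat stands for the log's UndoLogCategory, and the real undo record
+ * access layer sits above this:
+ *
+ *     RelFileNode rnode;
+ *
+ *     UndoRecPtrAssignRelFileNode(rnode, p);
+ *     buf = ReadBufferWithoutRelcache(rnode, UndoLogForkNum,
+ *                                     UndoRecPtrGetBlockNum(p),
+ *                                     RBM_NORMAL, NULL,
+ *                                     RelPersistenceForUndoLogCategory(cat));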
+ */ +typedef struct UndoLogUnloggedMetaData +{ + UndoLogOffset insert; /* next insertion point (head) */ + UndoLogOffset last_xact_start; /* last transaction's first byte in this log */ + UndoLogOffset this_xact_start; /* this transaction's first byte in this log */ + TransactionId xid; /* currently attached/writing xid */ + + /* + * Below two variable are used during recovery when transaction's undo + * records are split across undo logs. Replay of switch will restore + * these two undo record pointers which will be reset on next allocation + * during recovery. */ + UndoRecPtr prevlog_xact_start; /* Transaction's start undo record pointer + * in the previous log. */ + UndoRecPtr prevlog_last_urp; /* Transaction's last undo record pointer in + * the previous log. */ +} UndoLogUnloggedMetaData; + +/* + * Control metadata for an active undo log. Lives in shared memory inside an + * UndoLogSlot object, but also written to disk during checkpoints. + */ +typedef struct UndoLogMetaData +{ + /* Members that are not managed by explicit WAL logs. */ + UndoLogUnloggedMetaData unlogged; + + /* Members that are fixed for the lifetime of the undo log. */ + UndoLogNumber logno; + Oid tablespace; + UndoLogCategory category; + + /* Members that are changed by explicit WAL records. */ + UndoLogStatus status; + UndoLogOffset end; /* one past end of highest segment */ + UndoLogOffset discard; /* oldest data needed (tail) */ +} UndoLogMetaData; + +/* + * Context used to hold undo log state across all the undo log insertions + * corresponding to a single WAL record. + */ +typedef struct UndoLogAllocContext +{ + UndoLogCategory category; + UndoRecPtr try_location; + XLogReaderState *xlog_record; + UndoLogNumber recovery_logno; + uint8 recovery_block_id; + bool new_shared_record_set; + + /* + * The maximum number of undo logs that a single WAL record could insert + * into, modifying its unlogged meta data. Typically the number is 1, but + * it might touch a couple or more in rare cases where space runs out. + */ +#define MAX_META_DATA_IMAGES 4 + int num_meta_data_images; + struct + { + UndoLogNumber logno; + UndoLogUnloggedMetaData data; + } meta_data_images[MAX_META_DATA_IMAGES]; +} UndoLogAllocContext; + +#ifndef FRONTEND + +/* + * The in-memory control object for an undo log. We have a fixed-sized array + * of these. + */ +typedef struct UndoLogSlot +{ + /* + * Protected by UndoLogLock and 'mutex'. Both must be held to steal this + * slot for another undolog. Either may be held to prevent that from + * happening. + */ + UndoLogNumber logno; /* InvalidUndoLogNumber for unused slots */ + + /* Protected by UndoLogLock. */ + UndoLogNumber next_free; /* link for active unattached undo logs */ + + /* Protected by 'mutex'. */ + LWLock mutex; + UndoLogMetaData meta; /* current meta-data */ + pid_t pid; /* InvalidPid for unattached */ + + /* Protected by 'discard_lock'. State used by undo workers. 
*/ + FullTransactionId wait_fxmin; /* trigger for processing this log again */ + UndoRecPtr oldest_data; + LWLock discard_lock; /* prevents discarding while reading */ + LWLock discard_update_lock; /* block updaters during discard */ +} UndoLogSlot; + +extern UndoLogSlot *UndoLogGetSlot(UndoLogNumber logno, bool missing_ok); +extern UndoLogSlot *UndoLogNextSlot(UndoLogSlot *slot); +extern bool AmAttachedToUndoLogSlot(UndoLogSlot *slot); +extern UndoRecPtr UndoLogGetOldestRecord(UndoLogNumber logno, bool *full); + +/* + * Each backend maintains a small hash table mapping undo log numbers to + * UndoLogSlot objects in shared memory. + * + * We also cache the tablespace, category and a recently observed discard + * pointer here, since we need fast access to those. We could also reach them + * via slot->meta, but they can't be accessed without locking (since the + * UndoLogSlot object might be recycled if the log is entirely discard). + * Since tablespace and category are constant for lifetime of the undo log + * number, and the discard pointer only travels in one direction, there is no + * cache invalidation problem to worry about. + */ +typedef struct UndoLogTableEntry +{ + UndoLogNumber number; + UndoLogSlot *slot; + Oid tablespace; + UndoLogCategory category; + UndoRecPtr recent_discard; + char status; /* used by simplehash */ +} UndoLogTableEntry; + +/* + * Instantiate fast inline hash table access functions. We use an identity + * hash function for speed, since we already have integers and don't expect + * many collisions. + */ +#define SH_PREFIX undologtable +#define SH_ELEMENT_TYPE UndoLogTableEntry +#define SH_KEY_TYPE UndoLogNumber +#define SH_KEY number +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +extern PGDLLIMPORT undologtable_hash *undologtable_cache; + +/* + * Find or create an UndoLogTableGetEntry for this log number. This is used + * only for fast look-ups of tablespace and persistence. + */ +static pg_attribute_always_inline UndoLogTableEntry * +UndoLogGetTableEntry(UndoLogNumber logno) +{ + UndoLogTableEntry *entry; + + /* Fast path. */ + entry = undologtable_lookup(undologtable_cache, logno); + if (likely(entry)) + return entry; + + /* Slow path: force cache entry to be created. */ + UndoLogGetSlot(logno, false); + entry = undologtable_lookup(undologtable_cache, logno); + + return entry; +} + +/* + * Look up the tablespace for an undo log in our cache. + */ +static inline Oid +UndoLogNumberGetTablespace(UndoLogNumber logno) +{ + return UndoLogGetTableEntry(logno)->tablespace; +} + +static inline Oid +UndoRecPtrGetTablespace(UndoRecPtr urp) +{ + return UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp)); +} + +/* + * Look up the category for an undo log in our cache. + */ +static inline UndoLogCategory +UndoLogNumberGetCategory(UndoLogNumber logno) +{ + return UndoLogGetTableEntry(logno)->category; +} + +static inline UndoLogCategory +UndoRecPtrGetCategory(UndoRecPtr urp) +{ + return UndoLogNumberGetCategory(UndoRecPtrGetLogNo(urp)); +} + +#endif + +/* Space management. 
*/ +extern void UndoLogBeginInsert(UndoLogAllocContext *context, + UndoLogCategory category, + XLogReaderState *xlog_record); +extern void UndoLogRegister(UndoLogAllocContext *context, + uint8 block_id, + UndoLogNumber logno); +extern UndoRecPtr UndoLogAllocate(UndoLogAllocContext *context, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_insert_urp); +extern UndoRecPtr UndoLogAllocateInRecovery(UndoLogAllocContext *context, + TransactionId xid, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_last_urp); +extern void UndoLogAdvance(UndoLogAllocContext *context, size_t size); +extern void UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size); +extern bool UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid); +extern bool UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer); + +#ifndef FRONTEND + +/* + * Check if an undo log pointer is discarded. + */ +static inline bool +UndoRecPtrIsDiscarded(UndoRecPtr pointer) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(pointer); + UndoRecPtr recent_discard; + + /* See if we can answer the question without acquiring any locks. */ + recent_discard = UndoLogGetTableEntry(logno)->recent_discard; + if (likely(recent_discard > pointer)) + return true; + + /* + * It might be discarded or not, but we'll need to do a bit more work to + * find out. + */ + return UndoLogRecPtrIsDiscardedSlowPath(pointer); +} + +#endif + +/* Initialization interfaces. */ +extern void StartupUndoLogs(XLogRecPtr checkPointRedo); +extern void UndoLogShmemInit(void); +extern Size UndoLogShmemSize(void); +extern void UndoLogInit(void); +extern void UndoLogDirectory(Oid tablespace, char *path); +extern void UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, + char *path); +extern void ResetUndoLogs(UndoLogCategory category); + +/* Interface use by tablespace.c. */ +extern bool DropUndoLogsInTablespace(Oid tablespace); + +/* GUC interfaces. */ +extern void assign_undo_tablespaces(const char *newval, void *extra); + +/* Checkpointing interfaces. */ +extern void CheckPointUndoLogs(XLogRecPtr checkPointRedo, + XLogRecPtr priorCheckPointRedo); + +/* File sync request management. */ + + +extern UndoRecPtr UndoLogGetLastXactStartPoint(UndoLogNumber logno); +extern UndoRecPtr UndoLogGetNextInsertPtr(UndoLogNumber logno); +extern void UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno, + UndoRecPtr prevlog_last_urp, + UndoRecPtr prevlog_xact_start); +extern void UndoLogSetLSN(XLogRecPtr lsn); +void UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno); +/* Redo interface. */ +extern void undolog_redo(XLogReaderState *record); +/* Discard the undo logs for temp tables */ +extern void TempUndoDiscard(UndoLogNumber); + +/* Test-only interfacing. */ +extern void UndoLogDetachFull(void); + +#endif diff --git a/src/include/access/undolog_xlog.h b/src/include/access/undolog_xlog.h new file mode 100644 index 0000000000..aa9003ba4b --- /dev/null +++ b/src/include/access/undolog_xlog.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * undolog_xlog.h + * undo log access XLOG definitions. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_XLOG_H +#define UNDOLOG_XLOG_H + +#include "access/undolog.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +/* XLOG records */ +#define XLOG_UNDOLOG_CREATE 0x00 +#define XLOG_UNDOLOG_EXTEND 0x10 +#define XLOG_UNDOLOG_DISCARD 0x20 +#define XLOG_UNDOLOG_SWITCH 0x30 + +/* Create a new undo log. */ +typedef struct xl_undolog_create +{ + UndoLogNumber logno; + Oid tablespace; + UndoLogCategory category; +} xl_undolog_create; + +/* Extend an undo log by adding a new segment. */ +typedef struct xl_undolog_extend +{ + UndoLogNumber logno; + UndoLogOffset end; +} xl_undolog_extend; + +/* Discard space, and possibly destroy or recycle undo log segments. */ +typedef struct xl_undolog_discard +{ + UndoLogNumber logno; + UndoLogOffset discard; + UndoLogOffset end; + TransactionId latestxid; /* latest xid whose undolog are discarded. */ + bool entirely_discarded; +} xl_undolog_discard; + +/* Switch undo log. */ +typedef struct xl_undolog_switch +{ + UndoLogNumber logno; + UndoRecPtr prevlog_xact_start; + UndoRecPtr prevlog_last_urp; +} xl_undolog_switch; + +extern void undolog_desc(StringInfo buf,XLogReaderState *record); +extern const char *undolog_identify(uint8 info); + +#endif diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 4105b59904..4db68f1082 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -44,6 +44,20 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); +extern bool XLogFindBlockId(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + uint8 *block_id); + +extern XLogRedoAction XLogReadBufferForRedoBlock(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + ReadBufferMode mode, + bool get_cleanup_lock, + Buffer *buf); + extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); diff --git a/src/include/catalog/database_internal.h b/src/include/catalog/database_internal.h new file mode 100644 index 0000000000..a2b3a122ef --- /dev/null +++ b/src/include/catalog/database_internal.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * database_internal.h + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/database_internal.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATABASE_INTERNAL_H +#define DATABASE_INTERNAL_H + +/* + * We use this header to define the OIDs of pseudo-database OIDs used in + * buffer tags to hold system data. 
+ */ +#define UndoDbOid 9 + +#endif /* DATABASE_INTERNAL_H */ diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0902dce5f1..de475cb2d4 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10701,4 +10701,11 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +# undo logs +{ oid => '5032', descr => 'list undo logs', + proname => 'pg_stat_get_undo_logs', procost => '1', prorows => '10', proretset => 't', + prorettype => 'record', proargtypes => '', + proallargtypes => '{oid,text,text,text,text,text,xid,int4,text}', proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{logno,category,tablespace,discard,insert,end,xid,pid,status}', prosrc => 'pg_stat_get_undo_logs' }, + ] diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0a3ad3a188..1936c5db6f 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -934,6 +934,13 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_READ, + WAIT_EVENT_UNDO_CHECKPOINT_SYNC, + WAIT_EVENT_UNDO_CHECKPOINT_WRITE, + WAIT_EVENT_UNDO_FILE_READ, + WAIT_EVENT_UNDO_FILE_WRITE, + WAIT_EVENT_UNDO_FILE_FLUSH, + WAIT_EVENT_UNDO_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 509f4b7ef1..a04190aa92 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -37,8 +37,9 @@ typedef enum BufferAccessStrategyType typedef enum { RBM_NORMAL, /* Normal read */ - RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will - * initialize. Also locks the page. */ + RBM_ZERO, /* Don't read from disk, caller will + * initialize. */ + RBM_ZERO_AND_LOCK, /* Like RBM_ZERO, but also locks the page. 
*/ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ @@ -170,7 +171,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, - ReadBufferMode mode, BufferAccessStrategy strategy); + ReadBufferMode mode, BufferAccessStrategy strategy, + char relpersistence); +extern void ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer); @@ -227,6 +231,10 @@ extern void AtProcExit_LocalBuffers(void); extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); +/* in localbuf.c */ +extern void ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); + /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index d2a8c52044..b215201f30 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -143,6 +143,7 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern void pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(const char *fname, bool isdir); +extern int fsync_fname_ext(const char *fname, bool isdir, bool perm, int elevel); extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); extern int durable_unlink(const char *fname, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 08e0dc8144..4abb344aec 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -220,7 +220,10 @@ typedef enum BuiltinTrancheIds LWTRANCHE_TBM, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_SXACT, - LWTRANCHE_FIRST_USER_DEFINED + LWTRANCHE_UNDOLOG, + LWTRANCHE_UNDODISCARD, + LWTRANCHE_REWIND, + LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; /* diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index d286c8c7b1..0fd3b7505d 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -70,6 +70,9 @@ typedef struct SMgrRelationData int md_num_open_segs[MAX_FORKNUM + 1]; struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + /* For use by implementations. */ + void *private_data; + /* if unowned, list link in list of all unowned SMgrRelations */ dlist_node node; } SMgrRelationData; diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index 16428c5f5f..59f1da9178 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -34,7 +34,8 @@ typedef enum SyncRequestType */ typedef enum SyncRequestHandler { - SYNC_HANDLER_MD = 0 /* md smgr */ + SYNC_HANDLER_MD = 0, /* md smgr */ + SYNC_HANDLER_UNDO = 1 /* undo smgr */ } SyncRequestHandler; /* diff --git a/src/include/storage/undofile.h b/src/include/storage/undofile.h new file mode 100644 index 0000000000..a5ec30fa7a --- /dev/null +++ b/src/include/storage/undofile.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------- + * + * undofile.h + * undo storage manager public interface declarations. 
+ * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/undofile.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOFILE_H +#define UNDOFILE_H + +#include "access/undolog.h" +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/sync.h" + +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_open(SMgrRelation reln); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* Callbacks used by sync.c. */ +extern int undofile_syncfiletag(const FileTag *tag, char *path); +extern bool undofile_filetagmatches(const FileTag *tag, const FileTag *candidate); + +/* Management of checkpointer requests. 
*/ +extern void undofile_request_sync(UndoLogNumber logno, BlockNumber segno, + Oid tablespace); +extern void undofile_forget_sync(UndoLogNumber logno, BlockNumber segno, + Oid tablespace); +extern void undofile_forget_sync_tablespace(Oid tablespace); +extern void undofile_request_sync_dir(Oid tablespace); + +#endif /* UNDOFILE_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index e709177c37..84639eed22 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -431,6 +431,8 @@ extern void GUC_check_errcode(int sqlerrcode); extern bool check_default_tablespace(char **newval, void **extra, GucSource source); extern bool check_temp_tablespaces(char **newval, void **extra, GucSource source); extern void assign_temp_tablespaces(const char *newval, void *extra); +extern bool check_undo_tablespaces(char **newval, void **extra, GucSource source); +extern void assign_undo_tablespaces(const char *newval, void *extra); /* in catalog/namespace.c */ extern bool check_search_path(char **newval, void **extra, GucSource source); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 210e9cd146..7098461dde 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2010,6 +2010,16 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.autoanalyze_count FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text)); +pg_stat_undo_logs| SELECT pg_stat_get_undo_logs.logno, + pg_stat_get_undo_logs.category, + pg_stat_get_undo_logs.tablespace, + pg_stat_get_undo_logs.discard, + pg_stat_get_undo_logs.insert, + pg_stat_get_undo_logs."end", + pg_stat_get_undo_logs.xid, + pg_stat_get_undo_logs.pid, + pg_stat_get_undo_logs.status + FROM pg_stat_get_undo_logs() pg_stat_get_undo_logs(logno, category, tablespace, discard, insert, "end", xid, pid, status); pg_stat_user_functions| SELECT p.oid AS funcid, n.nspname AS schemaname, p.proname AS funcname, -- 2.39.5