From 3da534dc6c28b2b3127d7400ab524f12b9faa0ae Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Wed, 6 Mar 2019 16:46:04 +1300 Subject: [PATCH] Add undo log manager. Add a new subsystem to manage undo logs. Undo logs allow data to be appended efficiently, like logs. They also allow data to be discarded efficiently from the other end, like a queue. Thirdly, they allow efficient buffered random access, like a relation. Undo logs physically consist of a set of 1MB segment files under $PGDATA/base/undo (or per-tablespace equivalent) that are created, deleted or renamed as required, similarly to the way that WAL segments are managed. Meta-data about the set of undo logs is stored in shared memory, and written to per-checkpoint files under $PGDATA/pg_undo. Provide access to the undo files managed by undolog.c through bufmgr.c. A new SMGR implementation allows bufmgr.c to access files created by undolog.c. Author: Thomas Munro, with contributions from Dilip Kumar, Rafia Sabih, Robert Haas and Amit Kapila Reviewed-by: Discussion: https://postgr.es/m/CAEepm%3D2EqROYJ_xYz4v5kfr4b0qw_Lq_6Pe8RTEC8rx3upWsSQ%40mail.gmail.com --- src/backend/access/Makefile | 2 +- src/backend/access/rmgrdesc/Makefile | 2 +- src/backend/access/rmgrdesc/undologdesc.c | 81 + src/backend/access/transam/rmgr.c | 1 + src/backend/access/transam/xlog.c | 5 + src/backend/access/transam/xlogutils.c | 70 +- src/backend/access/undo/Makefile | 17 + src/backend/access/undo/undolog.c | 2627 +++++++++++++++++++++ src/backend/bootstrap/bootstrap.c | 3 + src/backend/catalog/system_views.sql | 4 + src/backend/commands/tablespace.c | 23 + src/backend/postmaster/pgstat.c | 21 + src/backend/replication/basebackup.c | 18 +- src/backend/replication/logical/decode.c | 1 + src/backend/storage/buffer/bufmgr.c | 80 +- src/backend/storage/buffer/localbuf.c | 43 + src/backend/storage/file/fd.c | 3 +- src/backend/storage/ipc/ipci.c | 3 + src/backend/storage/lmgr/lwlock.c | 2 + src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/storage/smgr/Makefile | 2 +- src/backend/storage/smgr/smgr.c | 23 + src/backend/storage/smgr/undofile.c | 422 ++++ src/backend/storage/sync/sync.c | 6 + src/backend/utils/init/postinit.c | 2 + src/backend/utils/misc/guc.c | 12 + src/bin/initdb/initdb.c | 2 + src/bin/pg_checksums/pg_checksums.c | 23 +- src/bin/pg_resetwal/pg_resetwal.c | 76 + src/bin/pg_upgrade/Makefile | 2 +- src/bin/pg_upgrade/check.c | 43 + src/bin/pg_upgrade/controldata.c | 25 + src/bin/pg_upgrade/exec.c | 4 + src/bin/pg_upgrade/pg_upgrade.c | 2 + src/bin/pg_upgrade/pg_upgrade.h | 5 + src/bin/pg_upgrade/undo.c | 292 +++ src/bin/pg_waldump/rmgrdesc.c | 1 + src/include/access/rmgrlist.h | 1 + src/include/access/session.h | 7 + src/include/access/undolog.h | 489 ++++ src/include/access/undolog_xlog.h | 62 + src/include/access/xlogutils.h | 14 + src/include/catalog/database_internal.h | 21 + src/include/catalog/pg_proc.dat | 7 + src/include/pgstat.h | 7 + src/include/storage/bufmgr.h | 14 +- src/include/storage/fd.h | 1 + src/include/storage/lwlock.h | 5 +- src/include/storage/smgr.h | 3 + src/include/storage/sync.h | 3 +- src/include/storage/undofile.h | 59 + src/include/utils/guc.h | 2 + src/test/regress/expected/rules.out | 10 + 53 files changed, 4613 insertions(+), 41 deletions(-) create mode 100644 src/backend/access/rmgrdesc/undologdesc.c create mode 100644 src/backend/access/undo/Makefile create mode 100644 src/backend/access/undo/undolog.c create mode 100644 src/backend/storage/smgr/undofile.c create mode 100644 src/bin/pg_upgrade/undo.c create 
mode 100644 src/include/access/undolog.h create mode 100644 src/include/access/undolog_xlog.h create mode 100644 src/include/catalog/database_internal.h create mode 100644 src/include/storage/undofile.h diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 0880e0a8bb..bf6d3fa1bd 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - table tablesample transam + table tablesample transam undo include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/Makefile b/src/backend/access/rmgrdesc/Makefile index 5514db1dda..91ad1ef8a3 100644 --- a/src/backend/access/rmgrdesc/Makefile +++ b/src/backend/access/rmgrdesc/Makefile @@ -11,6 +11,6 @@ include $(top_builddir)/src/Makefile.global OBJS = brindesc.o clogdesc.o committsdesc.o dbasedesc.o genericdesc.o \ gindesc.o gistdesc.o hashdesc.o heapdesc.o logicalmsgdesc.o \ mxactdesc.o nbtdesc.o relmapdesc.o replorigindesc.o seqdesc.o \ - smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o xactdesc.o xlogdesc.o + smgrdesc.o spgdesc.o standbydesc.o tblspcdesc.o undologdesc.o xactdesc.o xlogdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/rmgrdesc/undologdesc.c b/src/backend/access/rmgrdesc/undologdesc.c new file mode 100644 index 0000000000..f89fcb31c6 --- /dev/null +++ b/src/backend/access/rmgrdesc/undologdesc.c @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * undologdesc.c + * rmgr descriptor routines for access/undo/undolog.c + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/rmgrdesc/undologdesc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/undolog.h" +#include "access/undolog_xlog.h" + +void +undolog_desc(StringInfo buf, XLogReaderState *record) +{ + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + if (info == XLOG_UNDOLOG_CREATE) + { + xl_undolog_create *xlrec = (xl_undolog_create *) rec; + + appendStringInfo(buf, "logno %u", xlrec->logno); + } + else if (info == XLOG_UNDOLOG_EXTEND) + { + xl_undolog_extend *xlrec = (xl_undolog_extend *) rec; + + appendStringInfo(buf, "logno %u end " UndoLogOffsetFormat, + xlrec->logno, xlrec->end); + } + else if (info == XLOG_UNDOLOG_DISCARD) + { + xl_undolog_discard *xlrec = (xl_undolog_discard *) rec; + + appendStringInfo(buf, "logno %u discard " UndoLogOffsetFormat " end " + UndoLogOffsetFormat, + xlrec->logno, xlrec->discard, xlrec->end); + } + else if (info == XLOG_UNDOLOG_SWITCH) + { + xl_undolog_switch *xlrec = (xl_undolog_switch *) rec; + + appendStringInfo(buf, "logno %u start " UndoLogOffsetFormat " last " UndoLogOffsetFormat, + xlrec->logno, + xlrec->prevlog_xact_start, + xlrec->prevlog_last_urp); + } + +} + +const char * +undolog_identify(uint8 info) +{ + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_UNDOLOG_CREATE: + id = "CREATE"; + break; + case XLOG_UNDOLOG_EXTEND: + id = "EXTEND"; + break; + case XLOG_UNDOLOG_DISCARD: + id = "DISCARD"; + break; + case XLOG_UNDOLOG_SWITCH: + id = "SWITCH"; + break; + } + + return id; +} diff --git a/src/backend/access/transam/rmgr.c 
b/src/backend/access/transam/rmgr.c index 9368b56c4c..8b0537405a 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -18,6 +18,7 @@ #include "access/multixact.h" #include "access/nbtxlog.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index b6c9353cbd..5dbe485af2 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -31,6 +31,7 @@ #include "access/transam.h" #include "access/tuptoaster.h" #include "access/twophase.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "access/xloginsert.h" @@ -6710,6 +6711,9 @@ StartupXLOG(void) */ restoreTwoPhaseData(); + /* Recover undo log meta data corresponding to this checkpoint. */ + StartupUndoLogs(ControlFile->checkPointCopy.redo); + lastFullPageWrites = checkPoint.fullPageWrites; RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo; @@ -8977,6 +8981,7 @@ static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointCLOG(); + CheckPointUndoLogs(checkPointRedo, ControlFile->checkPointCopy.redo); CheckPointCommitTs(); CheckPointSUBTRANS(); CheckPointMultiXact(); diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 10a663bae6..c227c03854 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -293,6 +293,65 @@ XLogReadBufferForRedo(XLogReaderState *record, uint8 block_id, false, buf); } +/* + * Find the block ID of the first block that matches the given rnode forknum + * and blockno. If blockno is InvalidBlockNumber, then match any block + * number. Return true if found. + */ +bool +XLogFindBlockId(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + uint8 *block_id) +{ + uint8 i; + + for (i = 0; i <= record->max_block_id; ++i) + { + DecodedBkpBlock *block = &record->blocks[i]; + + if (block->in_use && + RelFileNodeEquals(block->rnode, rnode) && + block->forknum == forknum && + (block->blkno == blockno || blockno == InvalidBlockNumber)) + { + *block_id = i; + return true; + } + } + + return false; +} + +/* + * If the caller doesn't know the the block_id, but does know the RelFileNode, + * forknum and block number, then we try to find it. + */ +XLogRedoAction +XLogReadBufferForRedoBlock(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + ReadBufferMode mode, + bool get_cleanup_lock, + Buffer *buf) +{ + uint8 block_id; + + if (XLogFindBlockId(record, rnode, forknum, blockno, &block_id)) + return XLogReadBufferForRedoExtended(record, + block_id, + mode, + get_cleanup_lock, + buf); + + elog(ERROR, "failed to find block reference rel %u/%u/%u, forknum = %u, block = %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, forknum, blockno); + + return BLK_NOTFOUND; /* not reached */ +} + /* * Pin and lock a buffer referenced by a WAL record, for the purpose of * re-initializing it. @@ -346,7 +405,8 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. 
*/ - zeromode = (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK); + zeromode = (mode == RBM_ZERO || mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK); willinit = (record->blocks[block_id].flags & BKPBLOCK_WILL_INIT) != 0; if (willinit && !zeromode) elog(PANIC, "block with WILL_INIT flag in WAL record must be zeroed by redo routine"); @@ -462,7 +522,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, { /* page exists in file */ buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, RELPERSISTENCE_PERMANENT); } else { @@ -487,7 +547,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, ReleaseBuffer(buffer); } buffer = ReadBufferWithoutRelcache(rnode, forknum, - P_NEW, mode, NULL); + P_NEW, mode, NULL, + RELPERSISTENCE_PERMANENT); } while (BufferGetBlockNumber(buffer) < blkno); /* Handle the corner case that P_NEW returns non-consecutive pages */ @@ -497,7 +558,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno, - mode, NULL); + mode, NULL, + RELPERSISTENCE_PERMANENT); } } diff --git a/src/backend/access/undo/Makefile b/src/backend/access/undo/Makefile new file mode 100644 index 0000000000..219c6963cf --- /dev/null +++ b/src/backend/access/undo/Makefile @@ -0,0 +1,17 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for access/undo +# +# IDENTIFICATION +# src/backend/access/undo/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/access/undo +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = undolog.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/undo/undolog.c b/src/backend/access/undo/undolog.c new file mode 100644 index 0000000000..f2e0272ab1 --- /dev/null +++ b/src/backend/access/undo/undolog.c @@ -0,0 +1,2627 @@ +/*------------------------------------------------------------------------- + * + * undolog.c + * management of undo logs + * + * PostgreSQL undo log manager. This module is responsible for managing the + * lifecycle of undo logs and their segment files, associating undo logs with + * backends, and allocating space within undo logs. + * + * For the code that reads and writes blocks of data, see undofile.c. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/undo/undolog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/session.h" +#include "access/transam.h" +#include "access/undolog.h" +#include "access/undolog_xlog.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlogreader.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "pgstat.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/procarray.h" +#include "storage/shmem.h" +#include "storage/standby.h" +#include "storage/sync.h" +#include "storage/undofile.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/varlena.h" + +#include +#include + +/* + * Main control structure for undo log management in shared memory. + * UndoLogSlot objects are arranged in a fixed-size array, with no particular + * ordering. + */ +typedef struct UndoLogSharedData +{ + UndoLogNumber free_lists[UndoLogCategories]; + UndoLogNumber low_logno; + UndoLogNumber next_logno; + UndoLogNumber nslots; + UndoLogSlot slots[FLEXIBLE_ARRAY_MEMBER]; +} UndoLogSharedData; + +/* The shared memory region that all backends are attach to. */ +UndoLogSharedData *UndoLogShared; + +undologtable_hash *undologtable_cache; + +/* GUC variables */ +char *undo_tablespaces = NULL; + +static UndoLogSlot *find_undo_log_slot(UndoLogNumber logno, bool locked); +static UndoLogSlot *allocate_undo_log_slot(void); +static void free_undo_log_slot(UndoLogSlot *log); +static void attach_undo_log(UndoLogCategory category, Oid tablespace); +static void detach_current_undo_log(UndoLogCategory category, bool full); +static void extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end); +static void undo_log_before_exit(int code, Datum value); +static void forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, + bool drop_tail); +static bool choose_undo_tablespace(bool force_detach, Oid *oid); + +PG_FUNCTION_INFO_V1(pg_stat_get_undo_logs); + +/* + * How many undo logs can be active at a time? This creates a theoretical + * maximum amount of undo data that can exist, but if we set it to a multiple + * of the maximum number of backends it will be a very high limit. + * Alternative designs involving demand paging or dynamic shared memory could + * remove this limit but would be complicated. + */ +static inline size_t +UndoLogNumSlots(void) +{ + return MaxBackends * 4; +} + +/* + * Return the amount of traditional shmem required for undo log management. + */ +Size +UndoLogShmemSize(void) +{ + return sizeof(UndoLogSharedData) + + UndoLogNumSlots() * sizeof(UndoLogSlot); +} + +/* + * Initialize the undo log subsystem. Called in each backend. + */ +void +UndoLogShmemInit(void) +{ + bool found; + + UndoLogShared = (UndoLogSharedData *) + ShmemInitStruct("UndoLogShared", UndoLogShmemSize(), &found); + + /* The postmaster initialized the shared memory state. */ + if (!IsUnderPostmaster) + { + int i; + + Assert(!found); + + /* + * We start with no active undo logs. 
StartUpUndoLogs() will recreate + * the undo logs that were known at the last checkpoint. + */ + memset(UndoLogShared, 0, sizeof(*UndoLogShared)); + UndoLogShared->nslots = UndoLogNumSlots(); + for (i = 0; i < UndoLogCategories; ++i) + UndoLogShared->free_lists[i] = InvalidUndoLogNumber; + for (i = 0; i < UndoLogShared->nslots; ++i) + { + memset(&UndoLogShared->slots[i], 0, sizeof(UndoLogShared->slots[i])); + UndoLogShared->slots[i].logno = InvalidUndoLogNumber; + LWLockInitialize(&UndoLogShared->slots[i].mutex, + LWTRANCHE_UNDOLOG); + LWLockInitialize(&UndoLogShared->slots[i].discard_lock, + LWTRANCHE_UNDODISCARD); + } + } + else + Assert(found); + + /* All backends prepare their per-backend lookup table. */ + undologtable_cache = undologtable_create(TopMemoryContext, + UndoLogNumSlots(), + NULL); +} + +void +UndoLogInit(void) +{ + before_shmem_exit(undo_log_before_exit, 0); +} + +/* + * Figure out which directory holds an undo log based on tablespace. + */ +void +UndoLogDirectory(Oid tablespace, char *dir) +{ + if (tablespace == DEFAULTTABLESPACE_OID || + tablespace == InvalidOid) + snprintf(dir, MAXPGPATH, "base/undo"); + else + snprintf(dir, MAXPGPATH, "pg_tblspc/%u/%s/undo", + tablespace, TABLESPACE_VERSION_DIRECTORY); +} + +/* + * Compute the pathname to use for an undo log segment file. + */ +void +UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, char *path) +{ + char dir[MAXPGPATH]; + + /* Figure out which directory holds the segment, based on tablespace. */ + UndoLogDirectory(tablespace, dir); + + /* + * Build the path from log number and offset. The pathname is the + * UndoRecPtr of the first byte in the segment in hexadecimal, with a + * period inserted between the components. + */ + snprintf(path, MAXPGPATH, "%s/%06X.%010zX", dir, logno, + segno * UndoLogSegmentSize); +} + +/* + * Iterate through the set of currently active logs. Pass in NULL to get the + * first undo log. NULL indicates the end of the set of logs. The caller + * must lock the returned log before accessing its members, and must skip if + * logno is not valid. + */ +UndoLogSlot * +UndoLogNextSlot(UndoLogSlot *slot) +{ + LWLockAcquire(UndoLogLock, LW_SHARED); + for (;;) + { + /* Advance to the next log. */ + if (slot == NULL) + { + /* Start at the beginning. */ + slot = &UndoLogShared->slots[0]; + } + else if (++slot == &UndoLogShared->slots[UndoLogShared->nslots]) + { + /* Past the end. */ + slot = NULL; + break; + } + /* Have we found a slot with a valid log? */ + if (slot->logno != InvalidUndoLogNumber) + break; + } + LWLockRelease(UndoLogLock); + + /* XXX: erm, which lock should the caller hold!? */ + return slot; +} + +/* + * Check if an undo log position has been discarded. 'pointer' must be an + * undo log pointer that was allocated at some point in the past, otherwise + * the result is undefined. + */ +bool +UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(pointer); + UndoLogSlot *slot; + UndoRecPtr discard; + + slot = find_undo_log_slot(logno, false); + + if (slot == NULL) + { + /* + * If we couldn't find the undo log number, then it must be entirely + * discarded. Set this backend's recent_discard value to the highest + * possible value, so that all records appear to be discarded to the + * fast-path code. Technically this value is too low by 1, but + * assuming only pointers to records are tested, and no record can + * have size 1, this value suffices. 
+ */ + discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1); + } + else + { + LWLockAcquire(&slot->mutex, LW_SHARED); + if (unlikely(logno != slot->logno)) + { + /* + * The undo log has been entirely discarded since we looked it up + * above, and the UndoLogSlot is now unused or being used for some + * other undo log. This is the same as not finding it. + */ + discard = MakeUndoRecPtr(logno, UndoLogMaxSize - 1); + } + else + discard = MakeUndoRecPtr(logno, slot->meta.discard); + LWLockRelease(&slot->mutex); + } + + /* + * Remember this discard pointer in this backend so that future lookups + * via UndoLogRecPtrIsDiscarded() have a chance of avoiding the slow path. + */ + UndoLogGetTableEntry(logno)->recent_discard = discard; + + return pointer < discard; +} + +/* + * Fetch the previous transaction's start undo record point. + */ +UndoRecPtr +UndoLogGetLastXactStartPoint(UndoLogNumber logno) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + uint64 last_xact_start = 0; + + if (unlikely(slot == NULL)) + return InvalidUndoRecPtr; + + LWLockAcquire(&slot->mutex, LW_SHARED); + /* TODO: review */ + last_xact_start = slot->meta.unlogged.last_xact_start; + LWLockRelease(&slot->mutex); + + if (last_xact_start == 0) + return InvalidUndoRecPtr; + + return MakeUndoRecPtr(logno, last_xact_start); +} + +/* + * Detach from the undo log we are currently attached to, returning it to the + * appropriate free list if it still has space. + */ +static void +detach_current_undo_log(UndoLogCategory category, bool full) +{ + UndoLogSlot *slot; + + slot = CurrentSession->attached_undo_slots[category]; + + Assert(slot != NULL); + + CurrentSession->attached_undo_slots[category] = NULL; + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = InvalidPid; + slot->meta.unlogged.xid = InvalidTransactionId; + if (full) + slot->meta.status = UNDO_LOG_STATUS_FULL; + LWLockRelease(&slot->mutex); + + /* Push back onto the appropriate free list, unless it's full. */ + if (!full) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[category]; + UndoLogShared->free_lists[category] = slot->logno; + LWLockRelease(UndoLogLock); + } +} + +/* + * Exit handler, detaching from all undo logs. + */ +static void +undo_log_before_exit(int code, Datum arg) +{ + int i; + + if (!CurrentSession) + return; + + for (i = 0; i < UndoLogCategories; ++i) + { + if (CurrentSession->attached_undo_slots[i] != NULL) + detach_current_undo_log(i, false); + } +} + +/* + * Create a new empty segment file on disk for the byte starting at 'end'. + */ +static void +allocate_empty_undo_segment(UndoLogNumber logno, Oid tablespace, + UndoLogOffset end) +{ + struct stat stat_buffer; + off_t size; + char path[MAXPGPATH]; + void *zeroes; + size_t nzeroes = 8192; + int fd; + + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, tablespace, path); + + /* + * Create and fully allocate a new file. If we crashed and recovered + * then the file might already exist, so use flags that tolerate that. + * It's also possible that it exists but is too short, in which case + * we'll write the rest. We don't really care what's in the file, we + * just want to make sure that the filesystem has allocated physical + * blocks for it, so that non-COW filesystems will report ENOSPC now + * rather than later when the space is needed and we'll avoid creating + * files with holes. 
+ */ + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0 && tablespace != 0) + { + char undo_path[MAXPGPATH]; + + /* Try creating the undo directory for this tablespace. */ + UndoLogDirectory(tablespace, undo_path); + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + { + char *parentdir; + + if (errno != ENOENT || !InRecovery) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + + /* + * In recovery, it's possible that the tablespace directory + * doesn't exist because a later WAL record removed the whole + * tablespace. In that case we create a regular directory to + * stand in for it. This is similar to the logic in + * TablespaceCreateDbspace(). + */ + + /* create two parents up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? */ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + /* create one parent up if not exist */ + parentdir = pstrdup(undo_path); + get_parent_directory(parentdir); + /* Can't create parent and it doesn't already exist? */ + if (mkdir(parentdir, S_IRWXU) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + parentdir))); + pfree(parentdir); + + if (mkdir(undo_path, S_IRWXU) != 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", + undo_path))); + } + + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + } + if (fd < 0) + elog(ERROR, "could not create new file \"%s\": %m", path); + if (fstat(fd, &stat_buffer) < 0) + elog(ERROR, "could not stat \"%s\": %m", path); + size = stat_buffer.st_size; + + /* A buffer full of zeroes we'll use to fill up new segment files. */ + zeroes = palloc0(nzeroes); + + while (size < UndoLogSegmentSize) + { + ssize_t written; + + written = write(fd, zeroes, Min(nzeroes, UndoLogSegmentSize - size)); + if (written < 0) + elog(ERROR, "cannot initialize undo log segment file \"%s\": %m", + path); + size += written; + } + + /* Flush the contents of the file to disk before the next checkpoint. */ + undofile_request_sync(logno, end / UndoLogSegmentSize, tablespace); + + CloseTransientFile(fd); + + pfree(zeroes); + + elog(DEBUG1, "created undo segment \"%s\"", path); +} + +/* + * Create a new undo segment, when it is unexpectedly not present. + */ +void +UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno) +{ + Assert(InRecovery); + allocate_empty_undo_segment(logno, tablespace, segno * UndoLogSegmentSize); +} + +/* + * Create and zero-fill a new segment for a given undo log number. + */ +static void +extend_undo_log(UndoLogNumber logno, UndoLogOffset new_end) +{ + UndoLogSlot *slot; + size_t end; + + slot = find_undo_log_slot(logno, false); + + /* TODO review interlocking */ + + Assert(slot != NULL); + Assert(slot->meta.end % UndoLogSegmentSize == 0); + Assert(new_end % UndoLogSegmentSize == 0); + Assert(InRecovery || + CurrentSession->attached_undo_slots[slot->meta.category] == slot); + + /* + * Create all the segments needed to increase 'end' to the requested + * size. This is quite expensive, so we will try to avoid it completely + * by renaming files into place in UndoLogDiscard() instead. 
+ */ + end = slot->meta.end; + while (end < new_end) + { + allocate_empty_undo_segment(logno, slot->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + /* Flush the directory entries before next checkpoint. */ + undofile_request_sync_dir(slot->meta.tablespace); + + /* + * If we're not in recovery, we need to WAL-log the creation of the new + * file(s). We do that after the above filesystem modifications, in + * violation of the data-before-WAL rule as exempted by + * src/backend/access/transam/README. This means that it's possible for + * us to crash having made some or all of the filesystem changes but + * before WAL logging, but in that case we'll eventually try to create the + * same segment(s) again, which is tolerated. + */ + if (!InRecovery) + { + xl_undolog_extend xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.end = end; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_EXTEND); + XLogFlush(ptr); + } + + /* + * We didn't need to acquire the mutex to read 'end' above because only + * we write to it. But we need the mutex to update it, because the + * checkpointer might read it concurrently. + * + * XXX It's possible for meta.end to be higher already during + * recovery, because of the timing of a checkpoint; in that case we did + * nothing above and we shouldn't update shmem here. That interaction + * needs more analysis. + */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (slot->meta.end < end) + slot->meta.end = end; + LWLockRelease(&slot->mutex); +} + +/* + * This function must be called before all of the undo log activity that will + * be covered by a single WAL record. + */ +void +UndoLogBeginInsert(UndoLogAllocContext *context, + UndoLogCategory category, + XLogReaderState *xlog_record) +{ + context->try_location = InvalidUndoRecPtr; + context->category = category; + + /* + * Tell UndoLogAllocate() to capture undo log meta-data before-change + * images, so that UndoLogRegister() can find them and they can be written + * to the WAL once per checkpoint. + */ + context->num_meta_data_images = 0; + + /* + * Tell UndoLogAllocateInRecovery() that we don't know which undo log to + * allocate in yet, and to start its search for registered blocks at + * the lowest-numbered block_id. + */ + context->xlog_record = xlog_record; + context->recovery_logno = InvalidUndoLogNumber; + context->recovery_block_id = 0; + + /* + * For UNDO_SHARED, this always denotes the beginning of a new record set. + * For other categories, the boundaries are detected by transaction ID + * changes. + */ + context->new_shared_record_set = category == UNDO_SHARED; +} + +/* + * Get an insertion point that is guaranteed to be backed by enough space to + * hold 'size' bytes of data. To actually write into the undo log, client + * code should call this first and then use bufmgr routines to access buffers + * and provide WAL logs and redo handlers. In other words, while this module + * looks after making sure the undo log has sufficient space and the undo meta + * data is crash safe, the *contents* of the undo log and (indirectly) the + * insertion point are the responsibility of client code. + * + * A suggested insertion point can optionally be passed in as 'try_location', + * and will be returned if possible. If not InvalidUndoRecPtr, it must fall + * with, or exactly one byte after, the most recent allocation for the same + * persistence level. 
This interface allows for a series of allocations to be + * made without committing to using the space yet; call UndoLogAdvance() to + * actually advance the insert pointer. + * + * Return an undo log insertion point that can be converted to a buffer tag + * and an insertion point within a buffer page. + */ +UndoRecPtr +UndoLogAllocate(UndoLogAllocContext *context, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_insert_urp) +{ + Session *session = CurrentSession; + UndoLogSlot *slot; + UndoLogOffset new_insert; + TransactionId logxid; + + slot = CurrentSession->attached_undo_slots[context->category]; + + /* + * We may need to attach to an undo log, either because this is the first + * time this backend has needed to write to an undo log at all or because + * the undo_tablespaces GUC was changed. When doing that, we'll need + * interlocking against tablespaces being concurrently dropped. + */ + + retry: + /* See if we need to check the undo_tablespaces GUC. */ + if (unlikely(session->need_to_choose_undo_tablespace || slot == NULL)) + { + Oid tablespace; + bool need_to_unlock; + + need_to_unlock = + choose_undo_tablespace(session->need_to_choose_undo_tablespace, + &tablespace); + attach_undo_log(context->category, tablespace); + if (need_to_unlock) + LWLockRelease(TablespaceCreateLock); + slot = CurrentSession->attached_undo_slots[context->category]; + session->need_to_choose_undo_tablespace = false; + } + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + logxid = slot->meta.unlogged.xid; + + if (logxid != GetTopTransactionId()) + { + /* + * While we have the lock, check if we have been forcibly detached by + * DROP TABLESPACE. That can only happen between transactions (see + * DropUndoLogsInsTablespace()). + */ + if (slot->pid == InvalidPid) + { + LWLockRelease(&slot->mutex); + slot = NULL; + goto retry; + } + /* Record that we are attached to this log. */ + slot->meta.unlogged.xid = GetTopTransactionId(); + /* + * Maintain our tracking of the current and previous transaction start + * locations. + */ + if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert) + { + slot->meta.unlogged.last_xact_start = + slot->meta.unlogged.this_xact_start; + slot->meta.unlogged.this_xact_start = slot->meta.unlogged.insert; + } + } + LWLockRelease(&slot->mutex); + + /* + * 'size' is expressed in usable non-header bytes. Figure out how far we + * have to move insert to create space for 'size' usable bytes, stepping + * over any intervening headers. + */ + Assert(slot->meta.unlogged.insert % BLCKSZ >= UndoLogBlockHeaderSize); + if (context->try_location != InvalidUndoRecPtr) + { + /* + * The try location must be in the log we're attached to, at most one + * byte past the end of space backed by files. + */ + UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location); + + Assert(UndoRecPtrGetLogNo(context->try_location) == slot->logno); + Assert(try_offset <= slot->meta.end); + new_insert = UndoLogOffsetPlusUsableBytes(try_offset, size); + } + else + { + new_insert = UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, + size); + } + Assert(new_insert % BLCKSZ >= UndoLogBlockHeaderSize); + + /* + * We don't need to acquire log->mutex to read log->meta.insert and + * log->meta.end, because this backend is the only one that can + * modify them. + */ + if (unlikely(new_insert > slot->meta.end)) + { + if (new_insert > UndoLogMaxSize) + { + /* This undo log is entirely full. Get a new one.
*/ + if (logxid == GetTopTransactionId()) + { + /* + * If the same transaction is split over two undo logs then + * store the previous log number in new log. See detailed + * comments in undorecord.c file header. + */ + *prevlog_xact_start = + MakeUndoRecPtr(slot->logno, + slot->meta.unlogged.this_xact_start); + *prevlog_insert_urp = + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + } + elog(DEBUG1, "undo log %u is full, switching to a new one", slot->logno); + slot = NULL; + detach_current_undo_log(context->category, true); + context->try_location = InvalidUndoRecPtr; + goto retry; + } + /* + * Extend the end of this undo log to cover new_insert (in other words + * round up to the segment size). + */ + extend_undo_log(slot->logno, + new_insert + UndoLogSegmentSize - + new_insert % UndoLogSegmentSize); + Assert(new_insert <= slot->meta.end); + } + + /* + * Create a back-up image of the unlogged part of the undo log's + * meta-data, if we haven't already done so since UndoLogBeginInsert() (ie + * for the WAL record that this undo allocation will be replayed by). + */ + if (context->num_meta_data_images == 0 || + context->meta_data_images[context->num_meta_data_images - 1].logno != slot->logno) + { + if (context->num_meta_data_images >= MAX_META_DATA_IMAGES) + elog(ERROR, "too many undo log meta data images"); + context->meta_data_images[context->num_meta_data_images].logno = slot->logno; + context->meta_data_images[context->num_meta_data_images++].data = slot->meta.unlogged; + } + + /* + * If no try_location was passed in, or if we switched logs, then we'll + * return the current insertion point. + */ + if (context->try_location == InvalidUndoRecPtr) + context->try_location = MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + + /* + * Is this location the first in this undo log for a transaction or a + * shared record set? + */ + if (context->new_shared_record_set) + { + context->new_shared_record_set = false; + *need_xact_header = true; + } + else + { + *need_xact_header = + UndoRecPtrGetOffset(context->try_location) == + slot->meta.unlogged.this_xact_start; + } + *last_xact_start = + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.last_xact_start); + + return context->try_location; +} + +void +UndoLogRegister(UndoLogAllocContext *context, uint8 block_id, UndoLogNumber logno) +{ + int i; + + for (i = 0; i < context->num_meta_data_images; ++i) + { + if (context->meta_data_images[i].logno == logno) + { + XLogRegisterBufData(block_id, + (char *) &context->meta_data_images[i].data, + sizeof(context->meta_data_images[i].data)); + return; + } + } +} + +/* + * In recovery, we expect exactly the same sequence of allocation sizes, but + * we also need the WAL record that is being replayed so we can figure out + * where the undo space was allocated. + */ +UndoRecPtr +UndoLogAllocateInRecovery(UndoLogAllocContext *context, + TransactionId xid, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_last_urp) +{ + UndoLogSlot *slot; + + Assert(InRecovery); + + /* + * Just as in UndoLogAllocate(), the caller may be extending an existing + * allocation before committing with UndoLogAdvance(). + */ + if (context->try_location != InvalidUndoRecPtr) + { + /* + * The try location must be in the log we're attached to, at most one + * byte past the end of space backed by files. 
+ */ + UndoLogOffset try_offset = UndoRecPtrGetOffset(context->try_location); + UndoLogNumber logno = UndoRecPtrGetLogNo(context->try_location); + + /* + * You can only have a try_location on your second or later allocation + * for a given WAL record. It had better be in the same log as the + * previous allocation for this WAL record (though it may not turn out + * to have enough space, below). + */ + Assert(logno == context->recovery_logno); + + /* + * Any log extension triggered by UndoLogAllocate() must have been + * replayed by now, so we can just check if this log has enough space, + * and if so, return. + */ + slot = find_undo_log_slot(logno, false); + if (UndoLogOffsetPlusUsableBytes(try_offset, size) <= slot->meta.end) + { + *need_xact_header = false; + return try_offset; + } + + /* Full. Ignore try_location and find the next log that was used. */ + Assert(slot->meta.status == UNDO_LOG_STATUS_FULL); + } + else + { + /* + * For now we only support one allocation per WAL record that doesn't + * have a try_location (ie the first one). We'll have to find out + * which log was used first. + */ + Assert(context->recovery_logno == InvalidUndoLogNumber); + } + + /* + * In order to find the undo log that was used by UndoLogAllocate(), we + * consult the list of registered blocks to figure out which undo logs + * should be written to by this WAL record. + */ + while (context->recovery_block_id <= context->xlog_record->max_block_id) + { + DecodedBkpBlock *block; + + /* We're looking for the first block referencing a new undo log. */ + block = &context->xlog_record->blocks[context->recovery_block_id]; + if (block->rnode.dbNode == UndoDbOid && + block->rnode.relNode != context->recovery_logno) + { + UndoLogNumber logno = block->rnode.relNode; + const void *backup; + size_t backup_size; + + /* We found a reference to a different (or first) undo log. */ + slot = find_undo_log_slot(logno, false); + + /* + * Since on-line checkpoints capture an inconsistent snapshot of + * undo log meta-data, we'll restore the unlogged part of the + * meta-data image if one was attached to the WAL record (that is, + * the members that don't have WAL records for every change + * already). + */ + backup = + XLogRecGetBlockData(context->xlog_record, + context->recovery_block_id, + &backup_size); + if (unlikely(backup)) + { + Assert(backup_size == sizeof(UndoLogUnloggedMetaData)); + + /* Restore the unlogged members from the backup-imaged. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + memcpy(&slot->meta.unlogged, backup, sizeof(UndoLogUnloggedMetaData)); + LWLockRelease(&slot->mutex); + } + else + { + /* + * Otherwise we need to do our own transaction tracking + * whenever we see a new xid, to match the logic in + * UndoLogAllocate(). + */ + if (xid != slot->meta.unlogged.xid) + { + slot->meta.unlogged.xid = xid; + if (slot->meta.unlogged.this_xact_start != slot->meta.unlogged.insert) + slot->meta.unlogged.last_xact_start = + slot->meta.unlogged.this_xact_start; + slot->meta.unlogged.this_xact_start = + slot->meta.unlogged.insert; + } + } + + /* TODO: check locking against undo log slot recycling? */ + + /* + * At this stage we should have an undo log that can handle this + * allocation. If we don't, something is screwed up. 
+ */ + if (UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size) > slot->meta.end) + elog(ERROR, + "cannot allocate %d bytes in undo log %d", + (int) size, slot->logno); + + *need_xact_header = + context->try_location == InvalidUndoRecPtr && + slot->meta.unlogged.insert == slot->meta.unlogged.this_xact_start; + *last_xact_start = slot->meta.unlogged.last_xact_start; + context->recovery_logno = slot->logno; + + /* Read log switch information from meta and reset it. */ + *prevlog_xact_start = slot->meta.unlogged.prevlog_xact_start; + *prevlog_last_urp = slot->meta.unlogged.prevlog_last_urp; + + slot->meta.unlogged.prevlog_xact_start = InvalidUndoRecPtr; + slot->meta.unlogged.prevlog_last_urp = InvalidUndoRecPtr; + + return MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert); + } + ++context->recovery_block_id; + } + + /* + * If we've run out of blocks to inspect, then we must have replayed a + * different sequence of allocation sizes, or screwed up the + * XLOG_UNDOLOG_EXTEND records, indicating a bug somewhere. + */ + elog(ERROR, "cannot determine undo log to allocate from"); + + return 0; /* not reached */ +} + +/* + * Advance the insertion pointer in this context by 'size' usable (non-header) + * bytes. This is the next place we'll try to allocate a record, if it fits. + * This is not committed to shared memory until after we've WAL-logged the + * record and UndoLogAdvanceFinal() is called. + */ +void +UndoLogAdvance(UndoLogAllocContext *context, size_t size) +{ + context->try_location = UndoLogOffsetPlusUsableBytes(context->try_location, + size); +} + +/* + * Advance the insertion pointer to 'size' usable (non-header) bytes past + * insertion_point. + */ +void +UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size) +{ + UndoLogSlot *slot = NULL; + UndoLogNumber logno = UndoRecPtrGetLogNo(insertion_point) ; + + slot = find_undo_log_slot(logno, false); + + /* + * Either we're in recovery, or is a log we are currently attached to, or + * recently detached from because it was full. + */ + Assert(InRecovery || + AmAttachedToUndoLogSlot(slot) || + slot->meta.status == UNDO_LOG_STATUS_FULL); + + /* + * The caller has the current insertion point, as returned by + * UndoLogAllocate[InRecovery](). + */ + Assert(UndoRecPtrGetOffset(insertion_point) == slot->meta.unlogged.insert); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.unlogged.insert = + UndoLogOffsetPlusUsableBytes(slot->meta.unlogged.insert, size); + LWLockRelease(&slot->mutex); +} + +/* + * Advance the discard pointer in one undo log, discarding all undo data + * relating to one or more whole transactions. The passed in undo pointer is + * the address of the oldest data that the called would like to keep, and the + * affected undo log is implied by this pointer, ie + * UndoRecPtrGetLogNo(discard_pointer). + * + * The caller asserts that there will be no attempts to access the undo log + * region being discarded after this moment. This operation will cause the + * relevant buffers to be dropped immediately, without writing any data out to + * disk. Any attempt to read the buffers (except a partial buffer at the end + * of this range which will remain) may result in IO errors, because the + * underlying segment file may have been physically removed. + * + * Return true if the discard point was updated, and false if nothing was done + * because the log precending the given point was already discarded. 
+ * + * TODO: The return value is not yet reliable and the code still doesn't work + * correctly if called for the same undo log in two backends; more + * interlocking work required here. + */ +bool +UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(discard_point); + UndoLogOffset discard = UndoRecPtrGetOffset(discard_point); + UndoLogOffset old_discard; + UndoLogOffset end; + UndoLogSlot *slot; + int segno; + int new_segno; + bool need_to_flush_wal = false; + bool entirely_discarded = false; + + slot = find_undo_log_slot(logno, false); + if (unlikely(slot == NULL)) + { + /* Already discarded (entirely). */ + return false; + } + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (unlikely(slot->logno != logno || discard <= slot->meta.discard)) + { + /* + * Already discarded entirely and the slot has been recycled, or up + * to this point). + */ + LWLockRelease(&slot->mutex); + return false; + } + if (discard > slot->meta.unlogged.insert) + elog(ERROR, "cannot move discard point past insert point"); + old_discard = slot->meta.discard; + end = slot->meta.end; + /* Are we discarding the last remaining data in a log marked as full? */ + if (slot->meta.status == UNDO_LOG_STATUS_FULL && + discard == slot->meta.unlogged.insert) + { + /* + * Adjust the discard and insert pointers so that the final segment is + * deleted from disk, and remember not to recycle it. + */ + entirely_discarded = true; + /* TODO: Check if the following line is replayed correctly */ + slot->meta.unlogged.insert = slot->meta.end; + discard = slot->meta.end; + } + LWLockRelease(&slot->mutex); + + /* + * TODO: I think we need a new lock just for this phase, so that buffer + * dropping and IO are done by only one backend if a superuser command and + * a discard worker both run this! + */ + + /* + * Drop all buffers holding this undo data out of the buffer pool (except + * the last one, if the new location is in the middle of it somewhere), so + * that the contained data doesn't ever touch the disk. The caller + * promises that this data will not be needed again. We have to drop the + * buffers from the buffer pool before removing files, otherwise a + * concurrent session might try to write the block to evict the buffer. + */ + forget_undo_buffers(logno, old_discard, discard, entirely_discarded); + + /* + * Check if we crossed a segment boundary and need to do some synchronous + * filesystem operations. + */ + segno = old_discard / UndoLogSegmentSize; + new_segno = discard / UndoLogSegmentSize; + if (segno < new_segno) + { + int recycle; + UndoLogOffset pointer; + + /* + * We always WAL-log discards, but we only need to flush the WAL if we + * have performed a filesystem operation. + */ + need_to_flush_wal = true; + + /* + * XXX When we rename or unlink a file, it's possible that some + * backend still has it open because it has recently read a page from + * it. smgr/undofile.c in any such backend will eventually close it, + * because it considers that fd to belong to the file with the name + * that we're unlinking or renaming and it doesn't like to keep more + * than one open at a time. No backend should ever try to read from + * such a file descriptor; that is what it means when we say that the + * caller of UndoLogDiscard() asserts that there will be no attempts + * to access the discarded range of undo log. In the case of a + * rename, if a backend were to attempt to read undo data in the range + * being discarded, it would read entirely the wrong data. 
+ */ + + /* + * How many segments should we recycle (= rename from tail position to + * head position)? For now it's always 1 unless there is already a + * spare one, but we could have an adaptive algorithm that recycles + * multiple segments at a time and pays just one fsync(). + */ + LWLockAcquire(&slot->mutex, LW_SHARED); + if ((slot->meta.end - slot->meta.unlogged.insert) < UndoLogSegmentSize && + slot->meta.status == UNDO_LOG_STATUS_ACTIVE) + recycle = 1; + else + recycle = 0; + LWLockRelease(&slot->mutex); + + /* Rewind to the start of the segment. */ + pointer = segno * UndoLogSegmentSize; + + while (pointer < new_segno * UndoLogSegmentSize) + { + char discard_path[MAXPGPATH]; + + /* Tell the checkpointer that the file is going away. */ + undofile_forget_sync(logno, pointer / UndoLogSegmentSize, + slot->meta.tablespace); + + UndoLogSegmentPath(logno, pointer / UndoLogSegmentSize, + slot->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? */ + if (recycle > 0) + { + char recycle_path[MAXPGPATH]; + + /* + * End points one byte past the end of the current undo space, + * ie to the first byte of the segment file we want to create. + */ + UndoLogSegmentPath(logno, end / UndoLogSegmentSize, + slot->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"", + discard_path, recycle_path); + end += UndoLogSegmentSize; + --recycle; + } + else + { + elog(ERROR, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path); + else + elog(ERROR, "could not unlink \"%s\": %m", discard_path); + } + pointer += UndoLogSegmentSize; + } + } + + /* WAL log the discard. */ + { + xl_undolog_discard xlrec; + XLogRecPtr ptr; + + xlrec.logno = logno; + xlrec.discard = discard; + xlrec.end = end; + xlrec.latestxid = xid; + xlrec.entirely_discarded = entirely_discarded; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + ptr = XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_DISCARD); + + if (need_to_flush_wal) + XLogFlush(ptr); + } + + /* Update shmem to show the new discard and end pointers. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.discard = discard; + slot->meta.end = end; + LWLockRelease(&slot->mutex); + + /* If we discarded everything, the slot can be given up. */ + if (entirely_discarded) + free_undo_log_slot(slot); + + return true; +} + +/* + * Return an UndoRecPtr to the oldest valid data in an undo log, or + * InvalidUndoRecPtr if it is empty. + */ +UndoRecPtr +UndoLogGetOldestRecord(UndoLogNumber logno, bool *full) +{ + UndoLogSlot *slot; + UndoRecPtr result; + + /* Try to find the slot for this undo log number. */ + slot = find_undo_log_slot(logno, false); + if (slot == NULL) + { + /* It's unknown to us, so we assume it's been entirely discarded. */ + if (full) + *full = true; + return InvalidUndoRecPtr; + } + + LWLockAcquire(&slot->mutex, LW_SHARED); + if (slot->logno != logno) + { + /* It's been recycled. SO it must have been entirely discarded. */ + result = InvalidUndoRecPtr; + if (full) + *full = true; + } + else if (slot->meta.discard == slot->meta.unlogged.insert) + { + /* It's empty, so there is no oldest record pointer to return. */ + result = InvalidUndoRecPtr; + if (full) + *full = slot->meta.status == UNDO_LOG_STATUS_FULL; + } + else + { + /* There is a record here! 
*/ + result = MakeUndoRecPtr(slot->logno, slot->meta.discard); + if (full) + *full = slot->meta.status == UNDO_LOG_STATUS_FULL; + } + LWLockRelease(&slot->mutex); + + return result; +} + +/* + * UndoLogSwitchSetPrevLogInfo - Store previous log info on the log switch and + * WAL-log the same. + */ +void +UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno, UndoRecPtr prevlog_xact_start, + UndoRecPtr prevlog_last_urp) +{ + UndoLogSlot *slot; + + slot = find_undo_log_slot(logno, false); + + /* + * Either we're in recovery, or this is a log we are currently attached + * to, or one we recently detached from because it was full. + */ + Assert(AmAttachedToUndoLogSlot(slot)); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.unlogged.prevlog_xact_start = prevlog_xact_start; + slot->meta.unlogged.prevlog_last_urp = prevlog_last_urp; + LWLockRelease(&slot->mutex); + + /* WAL-log the log switch. */ + { + xl_undolog_switch xlrec; + + xlrec.logno = logno; + xlrec.prevlog_xact_start = prevlog_xact_start; + xlrec.prevlog_last_urp = prevlog_last_urp; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_SWITCH); + } +} + +/* + * Return the next insert location. + */ +UndoRecPtr +UndoLogGetNextInsertPtr(UndoLogNumber logno) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + UndoRecPtr insert; + + LWLockAcquire(&slot->mutex, LW_SHARED); + /* TODO: what if the slot has been recycled? */ + insert = slot->meta.unlogged.insert; + LWLockRelease(&slot->mutex); + + return MakeUndoRecPtr(logno, insert); +} + +/* + * Delete unreachable files under pg_undo. Any files corresponding to LSN + * positions before the previous checkpoint are no longer needed. + */ +static void +CleanUpUndoCheckPointFiles(XLogRecPtr checkPointRedo) +{ + DIR *dir; + struct dirent *de; + char path[MAXPGPATH]; + char oldest_path[MAXPGPATH]; + + /* + * If a base backup is in progress, we can't delete any checkpoint + * snapshot files because one of them corresponds to the backup label but + * there could be any number of checkpoints during the backup. + */ + if (BackupInProgress()) + return; + + /* Otherwise keep only those >= the previous checkpoint's redo point. */ + snprintf(oldest_path, MAXPGPATH, "%016" INT64_MODIFIER "X", + checkPointRedo); + dir = AllocateDir("pg_undo"); + while ((de = ReadDir(dir, "pg_undo")) != NULL) + { + /* + * Assume that fixed width uppercase hex strings sort the same way as + * the values they represent, so we can use strcmp to identify undo + * log snapshot files corresponding to checkpoints that we don't need + * anymore. This assumption holds for ASCII. + */ + if (!(strlen(de->d_name) == UNDO_CHECKPOINT_FILENAME_LENGTH)) + continue; + + if (UndoCheckPointFilenamePrecedes(de->d_name, oldest_path)) + { + snprintf(path, MAXPGPATH, "pg_undo/%s", de->d_name); + if (unlink(path) != 0) + elog(ERROR, "could not unlink file \"%s\": %m", path); + } + } + FreeDir(dir); +} + +/* + * Write out the undo log meta data to the pg_undo directory. The actual + * contents of undo logs are in shared buffers and therefore handled by + * CheckPointBuffers(), but here we record the table of undo logs and their + * properties.
+ */ +void +CheckPointUndoLogs(XLogRecPtr checkPointRedo, XLogRecPtr priorCheckPointRedo) +{ + UndoLogMetaData *serialized = NULL; + size_t serialized_size = 0; + char *data; + char path[MAXPGPATH]; + UndoLogNumber num_logs; + int fd; + int i; + pg_crc32c crc; + + /* + * We acquire UndoLogLock to prevent any undo logs from being created or + * discarded while we build a snapshot of them. This isn't expected to + * take long on a healthy system because the number of active logs should + * be around the number of backends. Holding this lock won't prevent + * concurrent access to the undo log, except when segments need to be + * added or removed. + */ + LWLockAcquire(UndoLogLock, LW_SHARED); + + /* + * Rather than doing the file IO while we hold locks, we'll copy the + * meta-data into a palloc'd buffer. + */ + serialized_size = sizeof(UndoLogMetaData) * UndoLogNumSlots(); + serialized = (UndoLogMetaData *) palloc0(serialized_size); + + /* Scan through all slots looking for non-empty ones. */ + num_logs = 0; + for (i = 0; i < UndoLogNumSlots(); ++i) + { + UndoLogSlot *slot = &UndoLogShared->slots[i]; + + /* Skip empty slots. */ + if (slot->logno == InvalidUndoLogNumber) + continue; + + /* Capture snapshot while holding each mutex. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + serialized[num_logs++] = slot->meta; + LWLockRelease(&slot->mutex); + } + + LWLockRelease(UndoLogLock); + + /* Dump into a file under pg_undo. */ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_WRITE); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + if (fd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + + /* Compute header checksum. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)); + COMP_CRC32C(crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)); + COMP_CRC32C(crc, &num_logs, sizeof(num_logs)); + FIN_CRC32C(crc); + + /* Write out the number of active logs + crc. */ + if ((write(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)) != sizeof(UndoLogShared->low_logno)) || + (write(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)) != sizeof(UndoLogShared->next_logno)) || + (write(fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) || + (write(fd, &crc, sizeof(crc)) != sizeof(crc))) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + /* Write out the meta data for all active undo logs. */ + data = (char *) serialized; + INIT_CRC32C(crc); + serialized_size = num_logs * sizeof(UndoLogMetaData); + while (serialized_size > 0) + { + ssize_t written; + + written = write(fd, data, serialized_size); + if (written < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + COMP_CRC32C(crc, data, written); + serialized_size -= written; + data += written; + } + FIN_CRC32C(crc); + + if (write(fd, &crc, sizeof(crc)) != sizeof(crc)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", path))); + + + /* Flush file and directory entry. 
*/ + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_SYNC); + pg_fsync(fd); + if (CloseTransientFile(fd) < 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", path))); + fsync_fname("pg_undo", true); + pgstat_report_wait_end(); + + if (serialized) + pfree(serialized); + + CleanUpUndoCheckPointFiles(priorCheckPointRedo); +} + +void +StartupUndoLogs(XLogRecPtr checkPointRedo) +{ + char path[MAXPGPATH]; + int i; + int fd; + int nlogs; + pg_crc32c crc; + pg_crc32c new_crc; + + /* If initdb is calling, there is no file to read yet. */ + if (IsBootstrapProcessingMode()) + return; + + /* Open the pg_undo file corresponding to the given checkpoint. */ + snprintf(path, MAXPGPATH, "pg_undo/%016" INT64_MODIFIER "X", + checkPointRedo); + pgstat_report_wait_start(WAIT_EVENT_UNDO_CHECKPOINT_READ); + fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); + if (fd < 0) + elog(ERROR, "cannot open undo checkpoint snapshot \"%s\": %m", path); + + /* Read the active log number range. */ + if ((read(fd, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)) + != sizeof(UndoLogShared->low_logno)) || + (read(fd, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)) + != sizeof(UndoLogShared->next_logno)) || + (read(fd, &nlogs, sizeof(nlogs)) != sizeof(nlogs)) || + (read(fd, &crc, sizeof(crc)) != sizeof(crc))) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + + /* Verify the header checksum. */ + INIT_CRC32C(new_crc); + COMP_CRC32C(new_crc, &UndoLogShared->low_logno, sizeof(UndoLogShared->low_logno)); + COMP_CRC32C(new_crc, &UndoLogShared->next_logno, sizeof(UndoLogShared->next_logno)); + COMP_CRC32C(new_crc, &nlogs, sizeof(UndoLogShared->next_logno)); + FIN_CRC32C(new_crc); + + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + /* + * We'll acquire UndoLogLock just because allocate_undo_log() asserts we + * hold it (we don't actually expect concurrent access yet). + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* Initialize all the logs and set up the freelist. */ + INIT_CRC32C(new_crc); + for (i = 0; i < nlogs; ++i) + { + ssize_t size; + UndoLogSlot *slot; + + /* + * Get a new UndoLogSlot. If this checkpoint was created on a system + * with a higher max_connections setting, it's theoretically possible + * that we don't have enough space and cannot start up. + */ + slot = allocate_undo_log_slot(); + if (!slot) + ereport(ERROR, + (errmsg("not enough undo log slots to recover from checkpoint: need at least %d, have %zu", + nlogs, UndoLogNumSlots()), + errhint("Consider increasing max_connections"))); + + /* Read in the meta data for this undo log. */ + if ((size = read(fd, &slot->meta, sizeof(slot->meta))) != sizeof(slot->meta)) + elog(ERROR, "short read of pg_undo meta data in file \"%s\": %m (got %zu, wanted %zu)", + path, size, sizeof(slot->meta)); + COMP_CRC32C(new_crc, &slot->meta, sizeof(slot->meta)); + + /* + * At normal start-up, or during recovery, all active undo logs start + * out on the appropriate free list. + */ + slot->logno = slot->meta.logno; + slot->pid = InvalidPid; + slot->oldest_data = MakeUndoRecPtr(slot->logno, slot->meta.discard); + if (slot->meta.status == UNDO_LOG_STATUS_ACTIVE) + { + slot->next_free = UndoLogShared->free_lists[slot->meta.category]; + UndoLogShared->free_lists[slot->meta.category] = slot->logno; + } + } + FIN_CRC32C(new_crc); + + LWLockRelease(UndoLogLock); + + /* Verify body checksum. 
*/ + if (read(fd, &crc, sizeof(crc)) != sizeof(crc)) + elog(ERROR, "pg_undo file \"%s\" is corrupted", path); + if (crc != new_crc) + elog(ERROR, + "pg_undo file \"%s\" has incorrect checksum", path); + + CloseTransientFile(fd); + pgstat_report_wait_end(); +} + +/* + * Allocate a new UndoLogSlot object. + */ +static UndoLogSlot * +allocate_undo_log_slot(void) +{ + UndoLogSlot *slot; + UndoLogNumber i; + + Assert(LWLockHeldByMeInMode(UndoLogLock, LW_EXCLUSIVE)); + + for (i = 0; i < UndoLogNumSlots(); ++i) + { + slot = &UndoLogShared->slots[i]; + if (slot->logno == InvalidUndoLogNumber) + { + memset(&slot->meta, 0, sizeof(slot->meta)); + slot->pid = 0; + slot->wait_fxmin = InvalidFullTransactionId; + slot->oldest_data =0; + slot->next_free = -1; + slot->logno = -1; + return slot; + } + } + + return NULL; +} + +/* + * Free an UndoLogSlot object in shared memory, so that it can be reused. + * This is a rare event, and has complications for all code paths that access + * slots. Unless the current session is attached to the slot, it must be + * prepared for it to be freed and then potentially recycled for use by + * another log. See UndoLogGetSlot(). + */ +static void +free_undo_log_slot(UndoLogSlot *slot) +{ + /* + * When removing an undo log from a slot in shared memory, we acquire + * UndoLogLock, log->mutex and log->discard_lock, so that other code can + * hold any one of those locks to prevent the slot from being recycled. + */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + Assert(slot->logno != InvalidUndoLogNumber); + slot->logno = InvalidUndoLogNumber; + memset(&slot->meta, 0, sizeof(slot->meta)); + LWLockRelease(&slot->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * Find the UndoLogSlot object for a given log number. + * + * The caller may or may not already hold UndoLogLock, and should indicate + * this by passing 'locked'. We'll acquire it in the slow path if necessary. + * If it is not held by the caller, the caller must deal with the possibility + * that the returned UndoLogSlot no longer contains the requested logno by the + * time it is accessed. + * + * To do that, one of the following approaches must be taken by the calling + * code: + * + * 1. If the calling code knows that it is attached to this lock or is the + * recovery process, then there is no way for the slot to be recycled, so it's + * not necessary to check that the log number hasn't changed. The slot cannot + * be recycled while a backend is attached. It should probably assert that it + * is attached, however. + * + * 2. All other code should acquire log->mutex before accessing any members, + * and after doing so, check that the logno hasn't moved. If it is not, the + * entire undo log must be assumed to be discarded (as if this function + * returned NULL) and the caller must behave accordingly. + * + * Return NULL if the undo log has been entirely discarded. It is an error to + * ask for undo logs that have never been created. + */ +static UndoLogSlot * +find_undo_log_slot(UndoLogNumber logno, bool locked) +{ + UndoLogSlot *result = NULL; + UndoLogTableEntry *entry; + bool found; + + Assert(locked == LWLockHeldByMe(UndoLogLock)); + + /* First see if we already have it in our cache. */ + entry = undologtable_lookup(undologtable_cache, logno); + if (likely(entry)) + result = entry->slot; + else + { + UndoLogNumber i; + + /* Nope. Linear search for the slot in shared memory. 
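+ * The number of slots is bounded by UndoLogNumSlots(), and hits are
+ * remembered in the backend-local undologtable_cache above, so this
+ * linear search should be both short and rare.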
*/ + if (!locked) + LWLockAcquire(UndoLogLock, LW_SHARED); + for (i = 0; i < UndoLogNumSlots(); ++i) + { + if (UndoLogShared->slots[i].logno == logno) + { + /* Found it. */ + + /* + * TODO: Should this function be usable in a critical section? + * Would it make sense to detect that we are in a critical + * section and just return the pointer to the log without + * updating the cache, to avoid any chance of allocating + * memory? + */ + + entry = undologtable_insert(undologtable_cache, logno, &found); + entry->number = logno; + entry->slot = &UndoLogShared->slots[i]; + entry->tablespace = entry->slot->meta.tablespace; + entry->category = entry->slot->meta.category; + entry->recent_discard = + MakeUndoRecPtr(logno, entry->slot->meta.discard); + result = entry->slot; + break; + } + } + + /* + * If we didn't find it, then it must already have been entirely + * discarded. We create a negative cache entry so that we can answer + * this question quickly next time. + * + * TODO: We could track the lowest known undo log number, to reduce + * the negative cache entry bloat. + */ + if (result == NULL) + { + /* + * Sanity check: the caller should not be asking about undo logs + * that have never existed. + */ + if (logno >= UndoLogShared->next_logno) + elog(ERROR, "undo log %u hasn't been created yet", logno); + entry = undologtable_insert(undologtable_cache, logno, &found); + entry->number = logno; + entry->slot = NULL; + entry->tablespace = 0; + } + if (!locked) + LWLockRelease(UndoLogLock); + } + + return result; +} + +/* + * Get a pointer to an UndoLogSlot object corresponding to a given logno. + * + * In general, the caller must acquire the UndoLogSlot's mutex to access + * the contents, and at that time must consider that the logno might have + * changed because the undo log it contained has been entirely discarded. + * + * If the calling backend is currently attached to the undo log, that is not + * possible, because logs can only reach UNDO_LOG_STATUS_DISCARDED after first + * reaching UNDO_LOG_STATUS_FULL, and that only happens while detaching. + */ +UndoLogSlot * +UndoLogGetSlot(UndoLogNumber logno, bool missing_ok) +{ + UndoLogSlot *slot = find_undo_log_slot(logno, false); + + if (slot == NULL && !missing_ok) + elog(ERROR, "unknown undo log number %d", logno); + + return slot; +} + +/* + * Attach to a free undo log, creating a new one if required. + */ +static void +attach_undo_log(UndoLogCategory category, Oid tablespace) +{ + UndoLogSlot *slot = NULL; + UndoLogNumber logno; + UndoLogNumber *place; + + Assert(!InRecovery); + Assert(CurrentSession->attached_undo_slots[category] == NULL); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* + * For now we have a simple linked list of unattached undo logs for each + * persistence level. We'll grovel though it to find something for the + * tablespace you asked for. If you're not using multiple tablespaces + * it'll be able to pop one off the front. We might need a hash table + * keyed by tablespace if this simple scheme turns out to be too slow when + * using many tablespaces and many undo logs, but that seems like an + * unusual use case not worth optimizing for. + */ + place = &UndoLogShared->free_lists[category]; + while (*place != InvalidUndoLogNumber) + { + UndoLogSlot *candidate = find_undo_log_slot(*place, true); + + /* + * There should never be an undo log on the freelist that has been + * entirely discarded, or hasn't been created yet. The persistence + * level should match the freelist. 
+ */ + if (unlikely(candidate == NULL)) + elog(ERROR, + "corrupted undo log freelist, no such undo log %u", *place); + if (unlikely(candidate->meta.category != category)) + elog(ERROR, + "corrupted undo log freelist, undo log %u with persistence %d found on freelist %d", + *place, candidate->meta.category, category); + + if (candidate->meta.tablespace == tablespace) + { + logno = *place; + slot = candidate; + *place = candidate->next_free; + break; + } + place = &candidate->next_free; + } + + /* + * If all existing undo logs for this tablespace and persistence level are + * busy, we'll have to create a new one. + */ + if (slot == NULL) + { + if (UndoLogShared->next_logno > MaxUndoLogNumber) + { + /* + * You've used up all 16 exabytes of undo log addressing space. + * This is a difficult state to reach using only 16 exabytes of + * WAL. + */ + elog(ERROR, "undo log address space exhausted"); + } + + /* Allocate a slot from the UndoLogSlot pool. */ + slot = allocate_undo_log_slot(); + if (unlikely(!slot)) + ereport(ERROR, + (errmsg("could not create new undo log"), + errdetail("The maximum number of active undo logs is %zu.", + UndoLogNumSlots()), + errhint("Consider increasing max_connections."))); + slot->logno = logno = UndoLogShared->next_logno; + + /* + * The insert and discard pointers start after the first block's + * header. XXX That means that insert is > end for a short time in a + * newly created undo log. Is there any problem with that? + */ + slot->meta.unlogged.insert = UndoLogBlockHeaderSize; + slot->meta.discard = UndoLogBlockHeaderSize; + + slot->meta.logno = logno; + slot->meta.tablespace = tablespace; + slot->meta.category = category; + slot->meta.status = UNDO_LOG_STATUS_ACTIVE; + + /* Move the high log number pointer past this one. */ + ++UndoLogShared->next_logno; + + /* WAL-log the creation of this new undo log. */ + { + xl_undolog_create xlrec; + + xlrec.logno = logno; + xlrec.tablespace = slot->meta.tablespace; + xlrec.category = slot->meta.category; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_UNDOLOG_ID, XLOG_UNDOLOG_CREATE); + } + + /* + * This undo log has no segments. UndoLogAllocate will create the + * first one on demand. + */ + } + LWLockRelease(UndoLogLock); + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = MyProcPid; + LWLockRelease(&slot->mutex); + + CurrentSession->attached_undo_slots[category] = slot; +} + +/* check_hook: validate new undo_tablespaces */ +bool +check_undo_tablespaces(char **newval, void **extra, GucSource source) +{ + char *rawname; + List *namelist; + + /* Need a modifiable copy of string */ + rawname = pstrdup(*newval); + + /* + * Parse string into list of identifiers, just to check for + * well-formedness (unfortunateley we can't validate the names in the + * catalog yet). + */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + { + /* syntax error in name list */ + GUC_check_errdetail("List syntax is invalid."); + pfree(rawname); + list_free(namelist); + return false; + } + + /* + * Make sure we aren't already in a transaction that has been assigned an + * XID. This ensures we don't detach from an undo log that we might have + * started writing undo data into for this transaction. 
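+ *
+ * For example (hypothetical tablespace names), a session might run
+ * SET undo_tablespaces = 'undo_ts1, undo_ts2'; that is rejected here if
+ * the current transaction already has an xid, and otherwise only affects
+ * undo logs chosen after this point.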
+ */ + if (GetTopTransactionIdIfAny() != InvalidTransactionId) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + (errmsg("undo_tablespaces cannot be changed while a transaction is in progress")))); + list_free(namelist); + + return true; +} + +/* assign_hook: do extra actions as needed */ +void +assign_undo_tablespaces(const char *newval, void *extra) +{ + /* + * This is normally called only when GetTopTransactionIdIfAny() == + * InvalidTransactionId (because you can't change undo_tablespaces in the + * middle of a transaction that's been assigned an xid), but we can't + * assert that because it's also called at the end of a transaction that's + * rolling back, to reset the GUC if it was set inside the transaction. + */ + + /* Tell UndoLogAllocate() to reexamine undo_tablespaces. */ + if (CurrentSession) + CurrentSession->need_to_choose_undo_tablespace = true; +} + +static bool +choose_undo_tablespace(bool force_detach, Oid *tablespace) +{ + char *rawname; + List *namelist; + bool need_to_unlock; + int length; + int i; + + /* We need a modifiable copy of string. */ + rawname = pstrdup(undo_tablespaces); + + /* Break string into list of identifiers. */ + if (!SplitIdentifierString(rawname, ',', &namelist)) + elog(ERROR, "undo_tablespaces is unexpectedly malformed"); + + length = list_length(namelist); + if (length == 0 || + (length == 1 && ((char *) linitial(namelist))[0] == '\0')) + { + /* + * If it's an empty string, then we'll use the default tablespace. No + * locking is required because it can't be dropped. + */ + *tablespace = DEFAULTTABLESPACE_OID; + need_to_unlock = false; + } + else + { + /* + * Choose an OID using our pid, so that if several backends have the + * same multi-tablespace setting they'll spread out. We could easily + * do better than this if more serious load balancing is judged + * useful. + */ + int index = MyProcPid % length; + int first_index = index; + Oid oid = InvalidOid; + + /* + * Take the tablespace create/drop lock while we look the name up. + * This prevents the tablespace from being dropped while we're trying + * to resolve the name, or while the caller is trying to create an + * undo log in it. The caller will have to release this lock. + */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + for (;;) + { + const char *name = list_nth(namelist, index); + + oid = get_tablespace_oid(name, true); + if (oid == InvalidOid) + { + /* Unknown tablespace, try the next one. */ + index = (index + 1) % length; + /* + * But if we've tried them all, it's time to complain. We'll + * arbitrarily complain about the last one we tried in the + * error message. + */ + if (index == first_index) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("tablespace \"%s\" does not exist", name), + errhint("Create the tablespace or set undo_tablespaces to a valid or empty list."))); + continue; + } + if (oid == GLOBALTABLESPACE_OID) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("undo logs cannot be placed in pg_global tablespace"))); + /* If we got here we succeeded in finding one. */ + break; + } + + Assert(oid != InvalidOid); + *tablespace = oid; + need_to_unlock = true; + } + + /* + * If we came here because the user changed undo_tablespaces, then detach + * from any undo logs we happen to be attached to.
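+ * Detaching just clears our pid and xid in each attached slot and pushes
+ * the log back onto its category's freelist; the next undo allocation
+ * will then attach to a log in one of the newly configured tablespaces.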
+ */ + if (force_detach) + { + for (i = 0; i < UndoLogCategories; ++i) + { + UndoLogSlot *slot = CurrentSession->attached_undo_slots[i]; + + if (slot != NULL) + { + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->pid = InvalidPid; + slot->meta.unlogged.xid = InvalidTransactionId; + LWLockRelease(&slot->mutex); + + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[i]; + UndoLogShared->free_lists[i] = slot->logno; + LWLockRelease(UndoLogLock); + + CurrentSession->attached_undo_slots[i] = NULL; + } + } + } + + return need_to_unlock; +} + +bool +DropUndoLogsInTablespace(Oid tablespace) +{ + DIR *dir; + char undo_path[MAXPGPATH]; + UndoLogSlot *slot = NULL; + int i; + + Assert(LWLockHeldByMe(TablespaceCreateLock)); + Assert(tablespace != DEFAULTTABLESPACE_OID); + + /* First, try to kick everyone off any undo logs in this tablespace. */ + while ((slot = UndoLogNextSlot(slot))) + { + bool ok; + bool return_to_freelist = false; + + /* Skip undo logs in other tablespaces. */ + if (slot->meta.tablespace != tablespace) + continue; + + /* Check if this undo log can be forcibly detached. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + if (slot->meta.discard == slot->meta.unlogged.insert && + (slot->meta.unlogged.xid == InvalidTransactionId || + !TransactionIdIsInProgress(slot->meta.unlogged.xid))) + { + slot->meta.unlogged.xid = InvalidTransactionId; + if (slot->pid != InvalidPid) + { + slot->pid = InvalidPid; + return_to_freelist = true; + } + ok = true; + } + else + { + /* + * There is data we need in this undo log. We can't force it to + * be detached. + */ + ok = false; + } + LWLockRelease(&slot->mutex); + + /* If we failed, then give up now and report failure. */ + if (!ok) + return false; + + /* + * Put this undo log back on the appropriate free-list. No one can + * attach to it while we hold TablespaceCreateLock, but if we return + * earlier in a future go around this loop, we need the undo log to + * remain usable. We'll remove all appropriate logs from the + * free-lists in a separate step below. + */ + if (return_to_freelist) + { + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + slot->next_free = UndoLogShared->free_lists[slot->meta.category]; + UndoLogShared->free_lists[slot->meta.category] = slot->logno; + LWLockRelease(UndoLogLock); + } + } + + /* + * We detached all backends from undo logs in this tablespace, and no one + * can attach to any non-default-tablespace undo logs while we hold + * TablespaceCreateLock. We can now drop the undo logs. + */ + slot = NULL; + while ((slot = UndoLogNextSlot(slot))) + { + /* Skip undo logs in other tablespaces. */ + if (slot->meta.tablespace != tablespace) + continue; + + /* + * Make sure no buffers remain. When that is done by + * UndoLogDiscard(), the final page is left in shared_buffers because + * it may contain data, or at least be needed again very soon. Here + * we need to drop even that page from the buffer pool. + */ + forget_undo_buffers(slot->logno, slot->meta.discard, slot->meta.discard, true); + + /* + * TODO: For now we drop the undo log, meaning that it will never be + * used again. That wastes the rest of its address space. Instead, + * we should put it onto a special list of 'offline' undo logs, ready + * to be reactivated in some other tablespace. Then we can keep the + * unused portion of its address space. 
+ */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.status = UNDO_LOG_STATUS_DISCARDED; + LWLockRelease(&slot->mutex); + } + + /* Forget about all sync requests relating to this tablespace. */ + undofile_forget_sync_tablespace(tablespace); + + /* Unlink all undo segment files in this tablespace. */ + UndoLogDirectory(tablespace, undo_path); + + dir = AllocateDir(undo_path); + if (dir != NULL) + { + struct dirent *de; + + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + } + + /* Remove all dropped undo logs from the free-lists. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + for (i = 0; i < UndoLogCategories; ++i) + { + UndoLogSlot *slot; + UndoLogNumber *place; + + place = &UndoLogShared->free_lists[i]; + while (*place != InvalidUndoLogNumber) + { + slot = find_undo_log_slot(*place, true); + if (!slot) + elog(ERROR, + "corrupted undo log freelist, unknown log %u", *place); + if (slot->meta.status == UNDO_LOG_STATUS_DISCARDED) + *place = slot->next_free; + else + place = &slot->next_free; + } + } + LWLockRelease(UndoLogLock); + + return true; +} + +void +ResetUndoLogs(UndoLogCategory category) +{ + UndoLogSlot *slot = NULL; + + while ((slot = UndoLogNextSlot(slot))) + { + DIR *dir; + struct dirent *de; + char undo_path[MAXPGPATH]; + char segment_prefix[MAXPGPATH]; + size_t segment_prefix_size; + + if (slot->meta.category != category) + continue; + + /* Scan the directory for files belonging to this undo log. */ + snprintf(segment_prefix, sizeof(segment_prefix), "%06X.", slot->logno); + segment_prefix_size = strlen(segment_prefix); + UndoLogDirectory(slot->meta.tablespace, undo_path); + dir = AllocateDir(undo_path); + if (dir == NULL) + continue; + while ((de = ReadDirExtended(dir, undo_path, LOG)) != NULL) + { + char segment_path[MAXPGPATH]; + + if (strncmp(de->d_name, segment_prefix, segment_prefix_size) != 0) + continue; + snprintf(segment_path, sizeof(segment_path), "%s/%s", + undo_path, de->d_name); + elog(DEBUG1, "unlinked undo segment \"%s\"", segment_path); + if (unlink(segment_path) < 0) + elog(LOG, "couldn't unlink file \"%s\": %m", segment_path); + } + FreeDir(dir); + + /* + * We have no segment files. Set the pointers to indicate that there + * is no data. The discard and insert pointers point to the first + * usable byte in the segment we will create when we next try to + * allocate. This is a bit strange, because it means that they are + * past the end pointer. That's the same as when new undo logs are + * created. + * + * TODO: Should we rewind to zero instead, so we can reuse that (now) + * unreferenced address space? 
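+ *
+ * For example (illustrative numbers): if meta.end is 0x200000 because two
+ * 1MB segments had been created, discard and insert both become
+ * 0x200000 + UndoLogBlockHeaderSize, an offset in a segment that does not
+ * exist yet.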
+ */ + slot->meta.unlogged.insert = slot->meta.discard = slot->meta.end + + UndoLogBlockHeaderSize; + } +} + +Datum +pg_stat_get_undo_logs(PG_FUNCTION_ARGS) +{ +#define PG_STAT_GET_UNDO_LOGS_COLS 9 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + char *tablespace_name = NULL; + Oid last_tablespace = InvalidOid; + int i; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not " \ + "allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + /* Scan all undo logs to build the results. */ + for (i = 0; i < UndoLogShared->nslots; ++i) + { + UndoLogSlot *slot = &UndoLogShared->slots[i]; + char buffer[17]; + Datum values[PG_STAT_GET_UNDO_LOGS_COLS]; + bool nulls[PG_STAT_GET_UNDO_LOGS_COLS] = { false }; + Oid tablespace; + + /* + * This won't be a consistent result overall, but the values for each + * log will be consistent because we'll take the per-log lock while + * copying them. + */ + LWLockAcquire(&slot->mutex, LW_SHARED); + + /* Skip unused slots and entirely discarded undo logs. */ + if (slot->logno == InvalidUndoLogNumber || + slot->meta.status == UNDO_LOG_STATUS_DISCARDED) + { + LWLockRelease(&slot->mutex); + continue; + } + + values[0] = ObjectIdGetDatum((Oid) slot->logno); + values[1] = CStringGetTextDatum( + slot->meta.category == UNDO_PERMANENT ? "permanent" : + slot->meta.category == UNDO_UNLOGGED ? "unlogged" : + slot->meta.category == UNDO_TEMP ? "temporary" : + slot->meta.category == UNDO_SHARED ? "shared" : ""); + tablespace = slot->meta.tablespace; + + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.discard)); + values[3] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.unlogged.insert)); + values[4] = CStringGetTextDatum(buffer); + snprintf(buffer, sizeof(buffer), UndoRecPtrFormat, + MakeUndoRecPtr(slot->logno, slot->meta.end)); + values[5] = CStringGetTextDatum(buffer); + if (slot->meta.unlogged.xid == InvalidTransactionId) + nulls[6] = true; + else + values[6] = TransactionIdGetDatum(slot->meta.unlogged.xid); + if (slot->pid == InvalidPid) + nulls[7] = true; + else + values[7] = Int32GetDatum((int32) slot->pid); + switch (slot->meta.status) + { + case UNDO_LOG_STATUS_ACTIVE: + values[8] = CStringGetTextDatum("ACTIVE"); break; + case UNDO_LOG_STATUS_FULL: + values[8] = CStringGetTextDatum("FULL"); break; + default: + nulls[8] = true; + } + LWLockRelease(&slot->mutex); + + /* + * Deal with potentially slow tablespace name lookup without the lock. 
+ * Avoid making multiple calls to that expensive function for the + * common case of repeating tablespace. + */ + if (tablespace != last_tablespace) + { + if (tablespace_name) + pfree(tablespace_name); + tablespace_name = get_tablespace_name(tablespace); + last_tablespace = tablespace; + } + if (tablespace_name) + { + values[2] = CStringGetTextDatum(tablespace_name); + nulls[2] = false; + } + else + nulls[2] = true; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + if (tablespace_name) + pfree(tablespace_name); + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} + +/* + * replay the creation of a new undo log + */ +static void +undolog_xlog_create(XLogReaderState *record) +{ + xl_undolog_create *xlrec = (xl_undolog_create *) XLogRecGetData(record); + UndoLogSlot *slot; + + /* Create meta-data space in shared memory. */ + LWLockAcquire(UndoLogLock, LW_EXCLUSIVE); + + /* TODO: assert that it doesn't exist already? */ + + slot = allocate_undo_log_slot(); + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->logno = xlrec->logno; + slot->meta.logno = xlrec->logno; + slot->meta.status = UNDO_LOG_STATUS_ACTIVE; + slot->meta.category = xlrec->category; + slot->meta.tablespace = xlrec->tablespace; + slot->meta.unlogged.insert = UndoLogBlockHeaderSize; + slot->meta.discard = UndoLogBlockHeaderSize; + UndoLogShared->next_logno = Max(xlrec->logno + 1, UndoLogShared->next_logno); + LWLockRelease(&slot->mutex); + LWLockRelease(UndoLogLock); +} + +/* + * replay the addition of a new segment to an undo log + */ +static void +undolog_xlog_extend(XLogReaderState *record) +{ + xl_undolog_extend *xlrec = (xl_undolog_extend *) XLogRecGetData(record); + + /* Extend exactly as we would during DO phase. */ + extend_undo_log(xlrec->logno, xlrec->end); +} + +/* + * Drop all buffers for the given undo log, from the old_discard to up + * new_discard. If drop_tail is true, also drop the buffer that holds + * new_discard; this is used when discarding undo logs completely, for example + * via DROP TABLESPACE. If it is false, then the final buffer is not dropped + * because it may contain data. + * + */ +static void +forget_undo_buffers(int logno, UndoLogOffset old_discard, + UndoLogOffset new_discard, bool drop_tail) +{ + BlockNumber old_blockno; + BlockNumber new_blockno; + RelFileNode rnode; + + UndoRecPtrAssignRelFileNode(rnode, MakeUndoRecPtr(logno, old_discard)); + old_blockno = old_discard / BLCKSZ; + new_blockno = new_discard / BLCKSZ; + if (drop_tail) + ++new_blockno; + while (old_blockno < new_blockno) + { + ForgetBuffer(rnode, UndoLogForkNum, old_blockno); + ForgetLocalBuffer(rnode, UndoLogForkNum, old_blockno++); + } +} +/* + * replay an undo segment discard record + */ +static void +undolog_xlog_discard(XLogReaderState *record) +{ + xl_undolog_discard *xlrec = (xl_undolog_discard *) XLogRecGetData(record); + UndoLogSlot *slot; + UndoLogOffset discard; + UndoLogOffset end; + UndoLogOffset old_segment_begin; + UndoLogOffset new_segment_begin; + RelFileNode rnode = {0}; + + slot = find_undo_log_slot(xlrec->logno, false); + if (slot == NULL) + elog(ERROR, "unknown undo log %d", xlrec->logno); + + /* + * We're about to discard undologs. In Hot Standby mode, ensure that + * there's no queries running which need to get tuple from discarded undo. + * + * XXX we are passing empty rnode to the conflict function so that it can + * check conflict in all the backend regardless of which database the + * backend is connected. 
+ */ + if (InHotStandby && TransactionIdIsValid(xlrec->latestxid)) + ResolveRecoveryConflictWithSnapshot(xlrec->latestxid, rnode); + + /* + * See if we need to unlink or rename any files, but don't consider it an + * error if we find that files are missing. Since UndoLogDiscard() + * performs filesystem operations before WAL logging or updating shmem + * which could be checkpointed, a crash could have left files already + * deleted, but we could replay WAL that expects the files to be there. + */ + + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + Assert(slot->logno == xlrec->logno); + discard = slot->meta.discard; + end = slot->meta.end; + LWLockRelease(&slot->mutex); + + /* Drop buffers before we remove/recycle any files. */ + forget_undo_buffers(xlrec->logno, discard, xlrec->discard, + xlrec->entirely_discarded); + + /* Rewind to the start of the segment. */ + old_segment_begin = discard - discard % UndoLogSegmentSize; + new_segment_begin = xlrec->discard - xlrec->discard % UndoLogSegmentSize; + + /* Unlink or rename segments that are no longer in range. */ + while (old_segment_begin < new_segment_begin) + { + char discard_path[MAXPGPATH]; + + /* Tell the checkpointer that the file is going away. */ + undofile_forget_sync(slot->logno, + old_segment_begin / UndoLogSegmentSize, + slot->meta.tablespace); + + UndoLogSegmentPath(xlrec->logno, old_segment_begin / UndoLogSegmentSize, + slot->meta.tablespace, discard_path); + + /* Can we recycle the oldest segment? */ + if (end < xlrec->end) + { + char recycle_path[MAXPGPATH]; + + UndoLogSegmentPath(xlrec->logno, end / UndoLogSegmentSize, + slot->meta.tablespace, recycle_path); + if (rename(discard_path, recycle_path) == 0) + { + elog(DEBUG1, "recycled undo segment \"%s\" -> \"%s\"", + discard_path, recycle_path); + end += UndoLogSegmentSize; + } + else + { + elog(LOG, "could not rename \"%s\" to \"%s\": %m", + discard_path, recycle_path); + } + } + else + { + if (unlink(discard_path) == 0) + elog(DEBUG1, "unlinked undo segment \"%s\"", discard_path); + else + elog(LOG, "could not unlink \"%s\": %m", discard_path); + } + old_segment_begin += UndoLogSegmentSize; + } + + /* Create any further new segments that are needed the slow way. */ + while (end < xlrec->end) + { + allocate_empty_undo_segment(xlrec->logno, slot->meta.tablespace, end); + end += UndoLogSegmentSize; + } + + /* Flush the directory entries before next checkpoint. */ + undofile_request_sync_dir(slot->meta.tablespace); + + /* Update shmem. */ + LWLockAcquire(&slot->mutex, LW_EXCLUSIVE); + slot->meta.discard = xlrec->discard; + slot->meta.end = end; + LWLockRelease(&slot->mutex); + + /* If we discarded everything, the slot can be given up. */ + if (xlrec->entirely_discarded) + free_undo_log_slot(slot); +} + +/* + * replay the switch of a undo log + */ +static void +undolog_xlog_switch(XLogReaderState *record) +{ + xl_undolog_switch *xlrec = (xl_undolog_switch *) XLogRecGetData(record); + UndoLogSlot *slot; + + slot = find_undo_log_slot(xlrec->logno, false); + + /* + * Restore the log switch information in the MyUndoLogState this will be + * reset by following UndoLogAllocateDuringRecovery. 
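+ * (Note: the values are saved in the shared UndoLogSlot's meta-data,
+ * where recovery code can find them.)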
+ */ + slot->meta.unlogged.prevlog_xact_start = xlrec->prevlog_xact_start; + slot->meta.unlogged.prevlog_last_urp = xlrec->prevlog_last_urp; +} + +void +undolog_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_UNDOLOG_CREATE: + undolog_xlog_create(record); + break; + case XLOG_UNDOLOG_EXTEND: + undolog_xlog_extend(record); + break; + case XLOG_UNDOLOG_DISCARD: + undolog_xlog_discard(record); + break; + case XLOG_UNDOLOG_SWITCH: + undolog_xlog_switch(record); + break; + default: + elog(PANIC, "undolog_redo: unknown op code %u", info); + } +} + +/* + * For assertions only. + */ +bool +AmAttachedToUndoLogSlot(UndoLogSlot *slot) +{ + /* + * In general, we can't access log's members without locking. But this + * function is intended only for asserting that you are attached, and + * while you're attached the slot can't be recycled, so don't bother + * locking. + */ + return CurrentSession->attached_undo_slots[slot->meta.category] == slot; +} + +/* + * For testing use only. This function is only used by the test_undo module. + */ +void +UndoLogDetachFull(void) +{ + int i; + + for (i = 0; i < UndoLogCategories; ++i) + if (CurrentSession->attached_undo_slots[i]) + detach_current_undo_log(i, true); +} diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 9238fbe98d..4671838dd2 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -20,6 +20,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/session.h" #include "access/tableam.h" #include "access/xact.h" #include "access/xlog_internal.h" @@ -519,6 +520,8 @@ BootstrapModeMain(void) InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false); + InitializeSession(); + /* Initialize stuff for bootstrap-file processing */ for (i = 0; i < MAXATTR; i++) { diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index ea4c85e395..cbe04e4dcc 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1062,6 +1062,10 @@ GRANT SELECT (subdbid, subname, subowner, subenabled, subslotname, subpublicatio ON pg_subscription TO public; +CREATE VIEW pg_stat_undo_logs AS + SELECT * + FROM pg_stat_get_undo_logs(); + -- -- We have a few function definitions in here, too. -- At some point there might be enough to justify breaking them out into diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 84efb414d8..17c28b2c66 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -55,6 +55,7 @@ #include "access/htup_details.h" #include "access/sysattr.h" #include "access/tableam.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -496,6 +497,20 @@ DropTableSpace(DropTableSpaceStmt *stmt) */ LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + /* + * Drop the undo logs in this tablespace. This will fail (without + * dropping anything) if there are undo logs that we can't afford to drop + * because they contain non-discarded data or a transaction is in + * progress. Since we hold TablespaceCreateLock, no other session will be + * able to attach to an undo log in this tablespace (or any tablespace + * except default) concurrently.
+ */ + if (!DropUndoLogsInTablespace(tablespaceoid)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace \"%s\" cannot be dropped because it contains non-empty undo logs", + tablespacename))); + /* * Try to remove the physical infrastructure. */ @@ -1517,6 +1532,14 @@ tblspc_redo(XLogReaderState *record) { xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record); + /* This shouldn't be able to fail in recovery. */ + LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE); + if (!DropUndoLogsInTablespace(xlrec->ts_id)) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("tablespace cannot be dropped because it contains non-empty undo logs"))); + LWLockRelease(TablespaceCreateLock); + /* * If we issued a WAL record for a drop tablespace it implies that * there were no files in it at all when the DROP was done. That means diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index b4f2b28b51..c742861dae 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4070,6 +4070,27 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_TWOPHASE_FILE_WRITE: event_name = "TwophaseFileWrite"; break; + case WAIT_EVENT_UNDO_CHECKPOINT_READ: + event_name = "UndoCheckpointRead"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_WRITE: + event_name = "UndoCheckpointWrite"; + break; + case WAIT_EVENT_UNDO_CHECKPOINT_SYNC: + event_name = "UndoCheckpointSync"; + break; + case WAIT_EVENT_UNDO_FILE_READ: + event_name = "UndoFileRead"; + break; + case WAIT_EVENT_UNDO_FILE_WRITE: + event_name = "UndoFileWrite"; + break; + case WAIT_EVENT_UNDO_FILE_FLUSH: + event_name = "UndoFileFlush"; + break; + case WAIT_EVENT_UNDO_FILE_SYNC: + event_name = "UndoFileSync"; + break; case WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ: event_name = "WALSenderTimelineHistoryRead"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index d5f9b617c8..abcb5e5ad2 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1368,7 +1368,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf char *page; size_t pad; PageHeader phdr; - int segmentno = 0; + BlockNumber first_blkno = 0; char *segmentpath; bool verify_checksum = false; @@ -1406,12 +1406,18 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf segmentpath = strstr(filename, "."); if (segmentpath != NULL) { - segmentno = atoi(segmentpath + 1); - if (segmentno == 0) + char *end; + if (strstr(readfilename, "undo")) + first_blkno = strtol(segmentpath + 1, &end, 16) / BLCKSZ; + else + first_blkno = strtol(segmentpath + 1, &end, 10) * RELSEG_SIZE; + if (*end != '\0') ereport(ERROR, - (errmsg("invalid segment number %d in file \"%s\"", - segmentno, filename))); + (errmsg("invalid segment number in file \"%s\"", + filename))); } + else + first_blkno = 0; } } @@ -1451,7 +1457,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf */ if (!PageIsNew(page) && PageGetLSN(page) < startptr) { - checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE); + checksum = pg_checksum_page((char *) page, blkno + first_blkno); phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 151c3ef882..d3a9c4d64c 100644 --- a/src/backend/replication/logical/decode.c +++ 
b/src/backend/replication/logical/decode.c @@ -154,6 +154,7 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor case RM_COMMIT_TS_ID: case RM_REPLORIGIN_ID: case RM_GENERIC_ID: + case RM_UNDOLOG_ID: /* just deal with xid, and done */ ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(record), buf.origptr); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6f3a402854..2b1d60680e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -177,6 +177,7 @@ static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); +static void InvalidateBuffer(BufferDesc *buf); /* * Ensure that the PrivateRefCountArray has sufficient space to store one more @@ -620,10 +621,12 @@ ReadBuffer(Relation reln, BlockNumber blockNum) * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * - * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * In RBM_ZERO mode, if the page isn't in buffer cache already, it's * filled with zeros instead of reading it from disk. Useful when the caller * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. + * + * In RBM_ZERO_AND_LOCK mode, the page is zeroed and also locked. * The page is returned locked to ensure that the caller has a chance to * initialize the page before it's made visible to others. * Caution: do not use this mode to read a page that is beyond the relation's @@ -674,24 +677,20 @@ ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, /* * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require * a relcache entry for the relation. - * - * NB: At present, this function may only be used on permanent relations, which - * is OK, because we only use it during XLOG replay. If in the future we - * want to use it on temporary or unlogged relations, we could pass additional - * parameters. */ Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, - BufferAccessStrategy strategy) + BufferAccessStrategy strategy, + char relpersistence) { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); - - Assert(InRecovery); + SMgrRelation smgr = smgropen(rnode, + relpersistence == RELPERSISTENCE_TEMP + ? MyBackendId : InvalidBackendId); - return ReadBuffer_common(smgr, RELPERSISTENCE_PERMANENT, forkNum, blockNum, + return ReadBuffer_common(smgr, relpersistence, forkNum, blockNum, mode, strategy, &hit); } @@ -885,7 +884,9 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Read in the page, unless the caller intends to overwrite it and * just wants us to allocate a buffer. */ - if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + if (mode == RBM_ZERO || + mode == RBM_ZERO_AND_LOCK || + mode == RBM_ZERO_AND_CLEANUP_LOCK) MemSet((char *) bufBlock, 0, BLCKSZ); else { @@ -1339,6 +1340,61 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, return buf; } +/* + * ForgetBuffer -- drop a buffer from shared buffers + * + * If the buffer isn't present in shared buffers, nothing happens. 
If it is + * present, it is discarded without making any attempt to write it back out to + * the operating system. The caller must therefore somehow be sure that the + * data won't be needed for anything now or in the future. It assumes that + * there is no concurrent access to the block, except that it might be being + * concurrently written. + */ +void +ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + BufferTag tag; /* identity of target block */ + uint32 hash; /* hash value for tag */ + LWLock *partitionLock; /* buffer partition lock for it */ + int buf_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + hash = BufTableHashCode(&tag); + partitionLock = BufMappingPartitionLock(hash); + + /* see if the block is in the buffer pool */ + LWLockAcquire(partitionLock, LW_SHARED); + buf_id = BufTableLookup(&tag, hash); + LWLockRelease(partitionLock); + + /* didn't find it, so nothing to do */ + if (buf_id < 0) + return; + + /* take the buffer header lock */ + bufHdr = GetBufferDescriptor(buf_id); + buf_state = LockBufHdr(bufHdr); + + /* + * The buffer might been evicted after we released the partition lock and + * before we acquired the buffer header lock. If so, the buffer we've + * locked might contain some other data which we shouldn't touch. If the + * buffer hasn't been recycled, we proceed to invalidate it. + */ + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum == blockNum && + bufHdr->tag.forkNum == forkNum) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); +} + /* * InvalidateBuffer -- mark a shared buffer invalid and return it to the * freelist. diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index f5f6a29222..a6fe3dbf82 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -272,6 +272,49 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, return bufHdr; } +/* + * ForgetLocalBuffer - drop a buffer from local buffers + * + * This is similar to bufmgr.c's ForgetBuffer, except that we do not need + * to do any locking since this is all local. As with that function, this + * must be used very carefully, since we'll cheerfully throw away dirty + * buffers without any attempt to write them. + */ +void +ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum) +{ + SMgrRelation smgr = smgropen(rnode, BackendIdForTempRelations()); + BufferTag tag; /* identity of target block */ + LocalBufferLookupEnt *hresult; + BufferDesc *bufHdr; + uint32 buf_state; + + /* + * If somehow this is the first request in the session, there's nothing to + * do. (This probably shouldn't happen, though.) 
+ */ + if (LocalBufHash == NULL) + return; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(tag, smgr->smgr_rnode.node, forkNum, blockNum); + + /* see if the block is in the local buffer pool */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, (void *) &tag, HASH_REMOVE, NULL); + + /* didn't find it, so nothing to do */ + if (!hresult) + return; + + /* mark buffer invalid */ + bufHdr = GetLocalBufferDescriptor(hresult->id); + CLEAR_BUFFERTAG(bufHdr->tag); + buf_state = pg_atomic_read_u32(&bufHdr->state); + buf_state &= ~(BM_VALID | BM_TAG_VALID | BM_DIRTY); + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); +} + /* * MarkLocalBufferDirty - * mark a local buffer dirty diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 315c74c745..3c4f0537d4 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -322,7 +322,6 @@ static void pre_sync_fname(const char *fname, bool isdir, int elevel); static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); -static int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); static int fsync_parent_path(const char *fname, int elevel); @@ -3345,7 +3344,7 @@ unlink_if_exists_fname(const char *fname, bool isdir, int elevel) * * Returns 0 if the operation succeeded, -1 otherwise. */ -static int +int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) { int fd; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d7d733530f..12c324925c 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -21,6 +21,7 @@ #include "access/nbtree.h" #include "access/subtrans.h" #include "access/twophase.h" +#include "access/undolog.h" #include "commands/async.h" #include "miscadmin.h" #include "pgstat.h" @@ -125,6 +126,7 @@ CreateSharedMemoryAndSemaphores(int port) size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); + size = add_size(size, UndoLogShmemSize()); size = add_size(size, CommitTsShmemSize()); size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); @@ -213,6 +215,7 @@ CreateSharedMemoryAndSemaphores(int port) */ XLOGShmemInit(); CLOGShmemInit(); + UndoLogShmemInit(); CommitTsShmemInit(); SUBTRANSShmemInit(); MultiXactShmemInit(); diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index bc1aa88322..5df658d349 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -522,6 +522,8 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_PARALLEL_APPEND, "parallel_append"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_HASH_JOIN, "parallel_hash_join"); LWLockRegisterTranche(LWTRANCHE_SXACT, "serializable_xact"); + LWLockRegisterTranche(LWTRANCHE_UNDOLOG, "undo_log"); + LWLockRegisterTranche(LWTRANCHE_UNDODISCARD, "undo_discard"); /* Register named tranches. 
*/ for (i = 0; i < NamedLWLockTrancheRequests; i++) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index db47843229..4b42a1cf0b 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -49,3 +49,4 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 CLogTruncationLock 44 +UndoLogLock 45 diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile index e486b7c0d1..ff2e5e2db4 100644 --- a/src/backend/storage/smgr/Makefile +++ b/src/backend/storage/smgr/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = md.o smgr.o +OBJS = md.o smgr.o undofile.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d00b275e46..93df85cba9 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -17,11 +17,13 @@ */ #include "postgres.h" +#include "catalog/database_internal.h" #include "lib/ilist.h" #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" +#include "storage/undofile.h" #include "utils/hsearch.h" #include "utils/inval.h" @@ -81,6 +83,24 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, + }, + /* undo logs */ + { + .smgr_init = undofile_init, + .smgr_shutdown = undofile_shutdown, + .smgr_open = undofile_open, + .smgr_close = undofile_close, + .smgr_create = undofile_create, + .smgr_exists = undofile_exists, + .smgr_unlink = undofile_unlink, + .smgr_extend = undofile_extend, + .smgr_prefetch = undofile_prefetch, + .smgr_read = undofile_read, + .smgr_write = undofile_write, + .smgr_writeback = undofile_writeback, + .smgr_nblocks = undofile_nblocks, + .smgr_truncate = undofile_truncate, + .smgr_immedsync = undofile_immedsync, } }; @@ -105,6 +125,8 @@ smgrwhich(RelFileNode rnode) { switch (rnode.dbNode) { + case UndoDbOid: + return 1; /* undofile.c */ default: return 0; /* md.c */ } @@ -189,6 +211,7 @@ smgropen(RelFileNode rnode, BackendId backend) reln->smgr_fsm_nblocks = InvalidBlockNumber; reln->smgr_vm_nblocks = InvalidBlockNumber; reln->smgr_which = smgrwhich(rnode); + reln->private_data = NULL; /* implementation-specific initialization */ smgrsw[reln->smgr_which].smgr_open(reln); diff --git a/src/backend/storage/smgr/undofile.c b/src/backend/storage/smgr/undofile.c new file mode 100644 index 0000000000..04d4514e35 --- /dev/null +++ b/src/backend/storage/smgr/undofile.c @@ -0,0 +1,422 @@ +/* + * undofile.h + * + * PostgreSQL undo file manager. This module provides SMGR-compatible + * interface to the files that back undo logs on the filesystem, so that undo + * log data can use the shared buffer pool. Other aspects of undo log + * management are provided by undolog.c, so the SMGR interfaces not directly + * concerned with reading, writing and flushing data are unimplemented. 
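+ *
+ * Block numbers map onto segment files in the obvious way (see
+ * undofile_read() and undofile_write() below):
+ *
+ *     segno  = blocknum / UNDOSEG_SIZE
+ *     offset = (blocknum % UNDOSEG_SIZE) * BLCKSZ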
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/storage/smgr/undofile.c + */ + +#include "postgres.h" + +#include "access/undolog.h" +#include "access/xlog.h" +#include "catalog/database_internal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/fd.h" +#include "storage/smgr.h" +#include "storage/undofile.h" +#include "utils/memutils.h" + +/* Populate a file tag describing an undo segment file. */ +#define INIT_UNDOFILETAG(a,xx_logno,xx_tbspc,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_UNDO, \ + (a).rnode.dbNode = UndoDbOid, \ + (a).rnode.spcNode = (xx_tbspc), \ + (a).rnode.relNode = (xx_logno), \ + (a).segno = (xx_segno) \ +) + +/* + * While md.c expects random access and has a small number of huge + * segments, undofile.c manages a potentially very large number of smaller + * segments and has a less random access pattern. Therefore, instead of + * keeping a potentially huge array of vfds we'll just keep the most + * recently accessed N. + * + * For now, N == 1, so we just need to hold onto one 'File' handle. + */ +typedef struct UndoFileState +{ + int mru_segno; + File mru_file; +} UndoFileState; + +static MemoryContext UndoFileCxt; + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, + BlockNumber segno, bool missing_ok); +static File undofile_get_segment_file(SMgrRelation reln, BlockNumber segno); + +void +undofile_init(void) +{ + UndoFileCxt = AllocSetContextCreate(TopMemoryContext, + "UndoFileSmgr", + ALLOCSET_DEFAULT_SIZES); +} + +void +undofile_shutdown(void) +{ +} + +void +undofile_open(SMgrRelation reln) +{ + UndoFileState *state; + + state = MemoryContextAllocZero(UndoFileCxt, sizeof(UndoFileState)); + reln->private_data = state; +} + +void +undofile_close(SMgrRelation reln, ForkNumber forknum) +{ +} + +void +undofile_create(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + /* + * File creation is managed by undolog.c, but xlogutils.c likes to call + * this just in case. Ignore. 
+ */ +} + +bool +undofile_exists(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_exists is not supported"); + + return false; /* not reached */ +} + +void +undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, bool isRedo) +{ + elog(ERROR, "undofile_unlink is not supported"); +} + +void +undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + elog(ERROR, "undofile_extend is not supported"); +} + +void +undofile_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + elog(ERROR, "undofile_prefetch is not supported"); +} + +void +undofile_read(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + char *buffer) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileRead(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_READ); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } +} + +void +undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, + bool skipFsync) +{ + File file; + off_t seekpos; + int nbytes; + + Assert(forknum == MAIN_FORKNUM); + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) UNDOSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * UNDOSEG_SIZE); + nbytes = FileWrite(file, buffer, BLCKSZ, seekpos, WAIT_EVENT_UNDO_FILE_WRITE); + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(file)))); + /* + * short write: unexpected, because this should be overwriting an + * entirely pre-allocated segment file + */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, FilePathName(file), + nbytes, BLCKSZ))); + } + + /* Tell checkpointer this file is dirty. */ + if (!skipFsync && !SmgrIsTemp(reln)) + { + undofile_request_sync(reln->smgr_rnode.node.relNode, + blocknum / UNDOSEG_SIZE, + reln->smgr_rnode.node.spcNode); + } +} + +void +undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + while (nblocks > 0) + { + File file; + int nflush; + + file = undofile_get_segment_file(reln, blocknum / UNDOSEG_SIZE); + + /* compute number of desired writes within the current segment */ + nflush = Min(nblocks, + 1 + UNDOSEG_SIZE - (blocknum % UNDOSEG_SIZE)); + + FileWriteback(file, + (blocknum % UNDOSEG_SIZE) * BLCKSZ, + nflush * BLCKSZ, WAIT_EVENT_UNDO_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +BlockNumber +undofile_nblocks(SMgrRelation reln, ForkNumber forknum) +{ + /* + * xlogutils.c likes to call this to decide whether to read or extend; for + * now we lie and say the relation is big as possible. 
+ */ + return UndoLogMaxSize / BLCKSZ; +} + +void +undofile_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + elog(ERROR, "undofile_truncate is not supported"); +} + +void +undofile_immedsync(SMgrRelation reln, ForkNumber forknum) +{ + elog(ERROR, "undofile_immedsync is not supported"); +} + +static File undofile_open_segment_file(Oid relNode, Oid spcNode, + BlockNumber segno, bool missing_ok) +{ + File file; + char path[MAXPGPATH]; + + UndoLogSegmentPath(relNode, segno, spcNode, path); + file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + + if (file <= 0 && (!missing_ok || errno != ENOENT)) + elog(ERROR, "cannot open undo segment file '%s': %m", path); + + return file; +} + +/* + * Get a File for a particular segment of a SMgrRelation representing an undo + * log. + */ +static File undofile_get_segment_file(SMgrRelation reln, BlockNumber segno) +{ + UndoFileState *state = (UndoFileState *) reln->private_data; + + /* If we have a file open already, check if we need to close it. */ + if (state->mru_file > 0 && state->mru_segno != segno) + { + /* These are not the blocks we're looking for. */ + FileClose(state->mru_file); + state->mru_file = 0; + } + + /* Check if we need to open a new file. */ + if (state->mru_file <= 0) + { + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, InRecovery); + if (InRecovery && state->mru_file <= 0) + { + /* + * If in recovery, we may be trying to access a file that will + * later be unlinked. Tolerate missing files, creating a new + * zero-filled file as required. + */ + UndoLogNewSegment(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno); + state->mru_file = + undofile_open_segment_file(reln->smgr_rnode.node.relNode, + reln->smgr_rnode.node.spcNode, + segno, false); + Assert(state->mru_file > 0); + } + state->mru_segno = segno; + } + + return state->mru_file; +} + +/* + * Callback to handle a queued sync request. + */ +int +undofile_syncfiletag(const FileTag *tag, char *path) +{ + SMgrRelation reln = smgropen(tag->rnode, InvalidBackendId); + File file; + + if (tag->rnode.relNode == (Oid) InvalidUndoLogNumber) + { + /* Sync parent directory for this tablespace. */ + UndoLogDirectory(tag->rnode.spcNode, path); + + /* The caller (sync.c) will do appropriate error reporting. */ + return fsync_fname_ext(path, true, false, WARNING); + } + else + { + /* Sync a segment file. */ + UndoLogSegmentPath(tag->rnode.relNode, tag->segno, tag->rnode.spcNode, + path); + + file = undofile_get_segment_file(reln, tag->segno); + if (file <= 0) + { + /* errno set by undofile_get_segment_file() */ + return -1; + } + + return FileSync(file, WAIT_EVENT_UNDO_FILE_SYNC); + } +} + +/* + * Filtering callback used by SYNC_FILTER_REQUEST to forget some requests. + */ +bool +undofile_filetagmatches(const FileTag *tag, const FileTag *candidate) +{ + /* + * We use SYNC_FILTER_REQUEST to forget requests for a given tablespace, + * before removing all undo files in the tablespace. + */ + return tag->rnode.spcNode == candidate->rnode.spcNode; +} + +/* + * Tell the checkpointer to sync a segment file. + */ +void +undofile_request_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace) +{ + char path[MAXPGPATH]; + FileTag tag; + + INIT_UNDOFILETAG(tag, logno, tablespace, segno); + + /* Try to send to the checkpointer, but if out of space, do it here. 
*/ + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + if (undofile_syncfiletag(&tag, path) < 0) + ereport(data_sync_elevel(ERROR), + (errmsg("could not fsync file \"%s\": %m", path))); + } +} + +/* + * Tell the checkpointer to forget about any sync requests for a given segment + * file, because it's about to go away. + */ +void +undofile_forget_sync(UndoLogNumber logno, BlockNumber segno, Oid tablespace) +{ + FileTag tag; + + INIT_UNDOFILETAG(tag, logno, tablespace, segno); + + /* Send, and keep retrying if out of space. */ + (void) RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true); +} + +/* + * Tell the checkpointer to fsync the undo directory in a given tablespace, + * because we have created or renamed files inside it. + */ +void +undofile_request_sync_dir(Oid tablespace) +{ + char path[MAXPGPATH]; + FileTag tag; + + /* We use a special logno and segno to mean "the directory". */ + INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace, + InvalidBlockNumber); + + /* Try to send to the checkpointer, but if out of space, do it here. */ + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false)) + { + if (undofile_syncfiletag(&tag, path) < 0) + ereport(data_sync_elevel(ERROR), + (errmsg("could not fsync directory \"%s\": %m", path))); + } +} + +/* + * Tell the checkpointer to forget about all sync requests for a given + * tablespace, because it's about to go away. + */ +void +undofile_forget_sync_tablespace(Oid tablespace) +{ + FileTag tag; + + INIT_UNDOFILETAG(tag, (Oid) InvalidUndoLogNumber, tablespace, + InvalidBlockNumber); + + /* + * Tell checkpointer to forget about any request for this tag, and keep + * waiting if there is not enough space. + */ + (void) RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true); +} diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c index f329c3fd66..db7b417d83 100644 --- a/src/backend/storage/sync/sync.c +++ b/src/backend/storage/sync/sync.c @@ -28,6 +28,7 @@ #include "storage/bufmgr.h" #include "storage/ipc.h" #include "storage/md.h" +#include "storage/undofile.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/inval.h" @@ -96,6 +97,11 @@ static const SyncOps syncsw[] = { .sync_syncfiletag = mdsyncfiletag, .sync_unlinkfiletag = mdunlinkfiletag, .sync_filetagmatches = mdfiletagmatches + }, + /* undo log segment files */ + { + .sync_syncfiletag = undofile_syncfiletag, + .sync_filetagmatches = undofile_filetagmatches } }; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 43b9f17f72..5d9af89036 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -25,6 +25,7 @@ #include "access/session.h" #include "access/sysattr.h" #include "access/tableam.h" +#include "access/undolog.h" #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" @@ -561,6 +562,7 @@ BaseInit(void) InitSync(); smgrinit(); InitBufferPoolAccess(); + UndoLogInit(); } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index fc463601ff..296fb77166 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -121,6 +121,7 @@ extern int CommitDelay; extern int CommitSiblings; extern char *default_tablespace; extern char *temp_tablespaces; +extern char *undo_tablespaces; extern bool ignore_checksum_failure; extern bool synchronize_seqscans; @@ -3667,6 +3668,17 @@ static struct config_string ConfigureNamesString[] = check_temp_tablespaces, assign_temp_tablespaces, NULL }, + { + {"undo_tablespaces", 
PGC_USERSET, CLIENT_CONN_STATEMENT, + gettext_noop("Sets the tablespace(s) to use for undo logs."), + NULL, + GUC_LIST_INPUT | GUC_LIST_QUOTE + }, + &undo_tablespaces, + "", + check_undo_tablespaces, assign_undo_tablespaces, NULL + }, + { {"dynamic_library_path", PGC_SUSET, CLIENT_CONN_OTHER, gettext_noop("Sets the path for dynamically loadable modules."), diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 04d77ad700..bbcbdfdb87 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -210,11 +210,13 @@ static const char *const subdirs[] = { "pg_snapshots", "pg_subtrans", "pg_twophase", + "pg_undo", "pg_multixact", "pg_multixact/members", "pg_multixact/offsets", "base", "base/1", + "base/undo", "pg_replslot", "pg_tblspc", "pg_stat", diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 8c00ec9a3b..8e83be9d52 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -167,7 +167,7 @@ skipfile(const char *fn) } static void -scan_file(const char *fn, BlockNumber segmentno) +scan_file(const char *fn, BlockNumber first_blkno) { PGAlignedBlock buf; PageHeader header = (PageHeader) buf.data; @@ -208,7 +208,7 @@ scan_file(const char *fn, BlockNumber segmentno) if (PageIsNew(header)) continue; - csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE); + csum = pg_checksum_page(buf.data, blockno + first_blkno); current_size += r; if (mode == PG_MODE_CHECK) { @@ -310,7 +310,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) char fnonly[MAXPGPATH]; char *forkpath, *segmentpath; - BlockNumber segmentno = 0; + BlockNumber first_blkno; if (skipfile(de->d_name)) continue; @@ -325,15 +325,22 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) segmentpath = strchr(fnonly, '.'); if (segmentpath != NULL) { + char *end; + *segmentpath++ = '\0'; - segmentno = atoi(segmentpath); - if (segmentno == 0) + if (strstr(segmentpath, "undo")) + first_blkno = strtol(segmentpath, &end, 16) / BLCKSZ; + else + first_blkno = strtol(segmentpath, &end, 10) * RELSEG_SIZE; + if (*end != '\0') { - pg_log_error("invalid segment number %d in file name \"%s\"", - segmentno, fn); + pg_log_error("invalid segment number in file name \"%s\"", + fn); exit(1); } } + else + first_blkno = 0; forkpath = strchr(fnonly, '_'); if (forkpath != NULL) @@ -350,7 +357,7 @@ scan_directory(const char *basedir, const char *subdir, bool sizeonly) * the items in the data folder. */ if (!sizeonly) - scan_file(fn, segmentno); + scan_file(fn, first_blkno); } #ifndef WIN32 else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode)) diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index ff0f8ea5e7..ad96fe75af 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -80,6 +80,7 @@ static bool ReadControlFile(void); static void GuessControlValues(void); static void PrintControlValues(bool guessed); static void PrintNewControlValues(void); +static void AdjustRedoLocation(const char *DataDir); static void RewriteControlFile(void); static void FindEndOfXLOG(void); static void KillExistingXLOG(void); @@ -510,6 +511,7 @@ main(int argc, char *argv[]) /* * Else, do the dirty deed. */ + AdjustRedoLocation(DataDir); RewriteControlFile(); KillExistingXLOG(); KillExistingArchiveStatus(); @@ -889,6 +891,80 @@ PrintNewControlValues(void) } +/* + * Compute the new redo, and move the pg_undo file to match if necessary. 
+ * Rather than renaming it, we'll create a new copy, so that a failure that + * occurs before the controlfile is rewritten won't be fatal. + */ +static void +AdjustRedoLocation(const char *DataDir) +{ + uint64 old_redo = ControlFile.checkPointCopy.redo; + char old_pg_undo_path[MAXPGPATH]; + char new_pg_undo_path[MAXPGPATH]; + int old_fd; + int new_fd; + ssize_t nread; + ssize_t nwritten; + char buffer[1024]; + + /* + * Adjust fields as needed to force an empty XLOG starting at + * newXlogSegNo. + */ + XLogSegNoOffsetToRecPtr(newXlogSegNo, SizeOfXLogLongPHD, WalSegSz, + ControlFile.checkPointCopy.redo); + + /* If the redo location didn't move, we don't need to do anything. */ + if (old_redo == ControlFile.checkPointCopy.redo) + return; + + /* + * Otherwise we copy the pg_undo file, because its name must match the redo + * location. + */ + snprintf(old_pg_undo_path, + sizeof(old_pg_undo_path), + "pg_undo/%016" INT64_MODIFIER "X", + old_redo); + snprintf(new_pg_undo_path, + sizeof(new_pg_undo_path), + "pg_undo/%016" INT64_MODIFIER "X", + ControlFile.checkPointCopy.redo); + old_fd = open(old_pg_undo_path, O_RDONLY, 0); + if (old_fd < 0) + { + pg_log_error("could not open \"%s\": %m", old_pg_undo_path); + exit(1); + } + new_fd = open(new_pg_undo_path, O_RDWR | O_CREAT, 0644); + if (new_fd < 0) + { + pg_log_error("could not create \"%s\": %m", new_pg_undo_path); + exit(1); + } + while ((nread = read(old_fd, buffer, sizeof(buffer))) > 0) + { + do + { + nwritten = write(new_fd, buffer, nread); + if (nwritten < 0) + { + pg_log_error("could not write to \"%s\": %m", new_pg_undo_path); + exit(1); + } + nread -= nwritten; + } while (nread > 0); + } + if (nread < 0) + { + pg_log_error("could not read from \"%s\": %m", old_pg_undo_path); + exit(1); + } + close(old_fd); + close(new_fd); +} + /* * Write out the new pg_control file. 
*/ diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile index 2cb829acd5..82c80336c5 100644 --- a/src/bin/pg_upgrade/Makefile +++ b/src/bin/pg_upgrade/Makefile @@ -9,7 +9,7 @@ include $(top_builddir)/src/Makefile.global OBJS = check.o controldata.o dump.o exec.o file.o function.o info.o \ option.o parallel.o pg_upgrade.o relfilenode.o server.o \ - tablespace.o util.o version.o $(WIN32RES) + tablespace.o undo.o util.o version.o $(WIN32RES) override CPPFLAGS := -DDLSUFFIX=\"$(DLSUFFIX)\" -I$(srcdir) -I$(libpq_srcdir) $(CPPFLAGS) LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport) diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 617270f101..4a431dce83 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -21,6 +21,7 @@ static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb); static bool equivalent_locale(int category, const char *loca, const char *locb); static void check_is_install_user(ClusterInfo *cluster); static void check_proper_datallowconn(ClusterInfo *cluster); +static void check_for_undo_data(ClusterInfo *cluster); static void check_for_prepared_transactions(ClusterInfo *cluster); static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster); static void check_for_tables_with_oids(ClusterInfo *cluster); @@ -97,6 +98,7 @@ check_and_dump_old_cluster(bool live_check) */ check_is_install_user(&old_cluster); check_proper_datallowconn(&old_cluster); + check_for_undo_data(&old_cluster); check_for_prepared_transactions(&old_cluster); check_for_reg_data_type_usage(&old_cluster); check_for_isn_and_int8_passing_mismatch(&old_cluster); @@ -171,6 +173,7 @@ check_new_cluster(void) check_is_install_user(&new_cluster); + check_for_undo_data(&new_cluster); check_for_prepared_transactions(&new_cluster); } @@ -800,6 +803,46 @@ check_for_prepared_transactions(ClusterInfo *cluster) } +/* + * check_for_live_undo_data() + * + * Make sure there are no live undo records (aborted transactions that have + * not been rolled back, or committed transactions whose undo data has not + * yet been discarded). 
+ */ +static void +check_for_undo_data(ClusterInfo *cluster) +{ + PGresult *res; + PGconn *conn; + + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1300) + return; + + conn = connectToServer(cluster, "template1"); + prep_status("Checking for undo data"); + + res = executeQueryOrDie(conn, + "SELECT * " + "FROM pg_catalog.pg_stat_undo_logs " + "WHERE discard != insert"); + + if (PQntuples(res) != 0) + { + if (cluster == &old_cluster) + pg_fatal("The source cluster contains live undo data\n"); + else + pg_fatal("The target cluster contains live undo data\n"); + } + + PQclear(res); + + PQfinish(conn); + + check_ok(); +} + + /* * check_for_isn_and_int8_passing_mismatch() * diff --git a/src/bin/pg_upgrade/controldata.c b/src/bin/pg_upgrade/controldata.c index 38236415be..523e86eeff 100644 --- a/src/bin/pg_upgrade/controldata.c +++ b/src/bin/pg_upgrade/controldata.c @@ -59,6 +59,7 @@ get_control_data(ClusterInfo *cluster, bool live_check) bool got_date_is_int = false; bool got_data_checksum_version = false; bool got_cluster_state = false; + bool got_redo_location = false; char *lc_collate = NULL; char *lc_ctype = NULL; char *lc_monetary = NULL; @@ -485,6 +486,23 @@ get_control_data(ClusterInfo *cluster, bool live_check) cluster->controldata.data_checksum_version = str2uint(p); got_data_checksum_version = true; } + else if ((p = strstr(bufin, "Latest checkpoint's REDO location:")) != NULL) + { + uint32 hi; + uint32 lo; + + p = strchr(p, ':'); + + if (p == NULL || strlen(p) <= 1) + pg_fatal("%d: controldata retrieval problem\n", __LINE__); + + p++; /* remove ':' char */ + + if (sscanf(p, "%X/%X", &hi, &lo) != 2) + pg_fatal("%d: controldata cannot parse REDO location\n", __LINE__); + cluster->controldata.redo_location = (((uint64) hi) << 32) | lo; + got_redo_location = true; + } } pclose(output); @@ -528,6 +546,13 @@ get_control_data(ClusterInfo *cluster, bool live_check) } } + /* + * If we used pg_resetwal instead of pg_controldata, there is no REDO + * location. 
+ */ + if (!got_redo_location) + cluster->controldata.redo_location = 0; + /* verify that we got all the mandatory pg_control data */ if (!got_xid || !got_oid || !got_multi || diff --git a/src/bin/pg_upgrade/exec.c b/src/bin/pg_upgrade/exec.c index 0363309328..f23451a876 100644 --- a/src/bin/pg_upgrade/exec.c +++ b/src/bin/pg_upgrade/exec.c @@ -351,6 +351,10 @@ check_data_dir(ClusterInfo *cluster) check_single_dir(pg_data, "pg_clog"); else check_single_dir(pg_data, "pg_xact"); + + /* pg_undo is new in v13 */ + if (GET_MAJOR_VERSION(cluster->major_version) >= 1300) + check_single_dir(pg_data, "pg_undo"); } diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c index d1975aab2b..09b786f472 100644 --- a/src/bin/pg_upgrade/pg_upgrade.c +++ b/src/bin/pg_upgrade/pg_upgrade.c @@ -139,6 +139,8 @@ main(int argc, char **argv) /* New now using xids of the old system */ + merge_undo_logs(); + /* -- NEW -- */ start_postmaster(&new_cluster, true); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index 5d31750d86..d0d93c0682 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -227,6 +227,7 @@ typedef struct bool date_is_int; bool float8_pass_by_value; bool data_checksum_version; + uint64 redo_location; } ControlData; /* @@ -465,3 +466,7 @@ void parallel_transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr char *old_pgdata, char *new_pgdata, char *old_tablespace); bool reap_child(bool wait_for_child); + +/* undo.c */ + +void merge_undo_logs(void); diff --git a/src/bin/pg_upgrade/undo.c b/src/bin/pg_upgrade/undo.c new file mode 100644 index 0000000000..48397b0141 --- /dev/null +++ b/src/bin/pg_upgrade/undo.c @@ -0,0 +1,292 @@ +/* + * undo.c + * + * Support for upgrading undo logs.\ + * Copyright (c) 2019, PostgreSQL Global Development Group + * src/bin/pg_upgrade/undo.c + */ + + +#include "postgres_fe.h" +#include "pg_upgrade.h" +#include "access/undolog.h" + +/* + * The relevant parts of UndoLogMetaDataData, in a version-independent format. + */ +typedef struct +{ + UndoLogNumber logno; + UndoLogOffset discard; + UndoLogStatus status; + UndoLogCategory category; + Oid tablespace; +} UndoLogInfo; + +/* + * Read the header of a pg_undo file and extract basic information. If the + * format of the header changes in later versions, this may need to change + * depending on "cluster". + */ +static void +read_pg_undo_header(int fd, ClusterInfo *cluster, UndoLogNumber *low_logno, + UndoLogNumber *next_logno, UndoLogNumber *num_logs) +{ + pg_crc32c crc; + + /* Read the header, much like StartupUndoLogs(). */ + if (read(fd, low_logno, sizeof(*low_logno)) != sizeof(*low_logno) || + read(fd, next_logno, sizeof(*next_logno)) != sizeof(*next_logno) || + read(fd, num_logs, sizeof(*num_logs)) != sizeof(*num_logs) || + read(fd, &crc, sizeof(crc)) != sizeof(crc)) + pg_fatal("pg_undo file is corrupted or cannot be read\n"); +} + +/* + * Read a single UndoLogMetaData object. If the format changes in later + * versions, this may need to change to be able to read different structs + * depending on "cluster". 
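+ *
+ * For reference, the pg_undo checkpoint file read and rewritten in this
+ * module appears to be laid out as follows (a sketch inferred from the
+ * read and write code here, which works much like StartupUndoLogs() and
+ * CheckPointUndoLogs()):
+ *
+ *     UndoLogNumber   low_logno;      lowest log number not yet entirely
+ *                                     discarded
+ *     UndoLogNumber   next_logno;     next undo log number to allocate
+ *     UndoLogNumber   num_logs;       number of UndoLogMetaData entries
+ *     pg_crc32c       crc;            CRC-32C of the three fields above
+ *     UndoLogMetaData meta[num_logs];
+ *     pg_crc32c       crc;            CRC-32C of the meta[] array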
+ */
+static void
+read_one_undo_log(int fd, ClusterInfo *cluster, UndoLogInfo *info)
+{
+ UndoLogMetaData meta_data;
+ int rc;
+
+ rc = read(fd, &meta_data, sizeof(meta_data));
+ if (rc < 0)
+ pg_fatal("could not read undo log meta-data: %m");
+ else if (rc != sizeof(meta_data))
+ pg_fatal("could not read undo log meta-data: expected %zu bytes but read only %d bytes",
+ sizeof(meta_data), rc);
+
+ info->logno = meta_data.logno;
+ info->category = meta_data.category;
+ info->tablespace = meta_data.tablespace;
+ info->discard = meta_data.discard;
+ info->status = meta_data.status;
+}
+
+static void
+merge_undo_log(UndoLogInfo *logs, UndoLogNumber *num_logs,
+ const UndoLogInfo *info)
+{
+ UndoLogNumber i;
+
+ /* Do we already have an entry for this logno? */
+ for (i = 0; i < *num_logs; ++i)
+ {
+ if (logs[i].logno == info->logno)
+ {
+ /*
+ * Take the highest discard offset, so that any pointers that
+ * originated in either cluster appear to be discarded.
+ */
+ if (logs[i].discard < info->discard)
+ logs[i].discard = info->discard;
+
+ /*
+ * Take the highest status so that entirely discarded logs trump
+ * active logs.
+ */
+ StaticAssertStmt(UNDO_LOG_STATUS_ACTIVE < UNDO_LOG_STATUS_FULL,
+ "undo log status out of order");
+ StaticAssertStmt(UNDO_LOG_STATUS_FULL < UNDO_LOG_STATUS_DISCARDED,
+ "undo log status out of order");
+ if (logs[i].status < info->status)
+ logs[i].status = info->status;
+
+ /*
+ * Take the most permanent persistence level. While we could
+ * just convert them all to permanent and it wouldn't hurt, it's
+ * probably a better idea to keep about the same number of each
+ * persistence level, so that a system that has stabilized with
+ * those numbers will continue to be stable after the upgrade (ie
+ * not suddenly need to create more undo logs of different
+ * levels). The most permanent is the best choice, because TEMP
+ * undo logs might be rewound in future.
+ */
+ StaticAssertStmt(UNDO_PERMANENT < UNDO_UNLOGGED,
+ "undo log persistence out of order");
+ StaticAssertStmt(UNDO_UNLOGGED < UNDO_TEMP,
+ "undo log persistence out of order");
+ if (logs[i].category > info->category)
+ logs[i].category = info->category;
+
+ /*
+ * Take the highest tablespace OID. The choice of 'highest' is
+ * arbitrary (we don't really expect the new cluster to have more
+ * than one log), but it seems useful to preserve the distribution
+ * of tablespaces from the old cluster for stability, as above.
+ */
+ if (logs[i].tablespace < info->tablespace)
+ logs[i].tablespace = info->tablespace;
+ return;
+ }
+ }
+
+ /* Otherwise create a new entry. */
+ logs[(*num_logs)++] = *info;
+}
+
+/*
+ * We need to merge the old undo logs and the new undo logs. We know that
+ * there is no live undo data (see check_for_undo_data()), but we need to
+ * make sure that any undo record pointers that exist in the old OR new
+ * cluster appear as discarded. That is, any log numbers that are entirely
+ * discarded in either cluster appear as entirely discarded, and we retain
+ * the higher of the discard pointers in any log that is active. This is
+ * mostly a theoretical concern for now, but perhaps a future release will be
+ * able to create higher undo record pointers during initdb than the old
+ * cluster had, so let's use an algorithm that doesn't make any assumptions
+ * about that.
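+ *
+ * As an illustrative example (made-up numbers): if the old cluster's file
+ * has log 3 with discard = 0x5000 and status ACTIVE, and the new cluster's
+ * file has log 3 with discard = 0x9000 and status DISCARDED, the merged
+ * entry keeps discard = 0x9000 and status DISCARDED, so undo record
+ * pointers into log 3 from either cluster read as discarded afterwards.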
+ */ +void +merge_undo_logs(void) +{ + char old_pg_undo_path[MAXPGPATH]; + char new_pg_undo_path[MAXPGPATH]; + UndoLogInfo *logs; + UndoLogNumber num_logs; + UndoLogNumber num_old_logs; + UndoLogNumber old_low_logno; + UndoLogNumber old_next_logno; + UndoLogNumber num_new_logs; + UndoLogNumber new_low_logno; + UndoLogNumber new_next_logno; + UndoLogNumber i; + int old_fd; + int new_fd; + pg_crc32c crc; + + /* If the old cluster has no undo logs, there is nothing to do */ + if (GET_MAJOR_VERSION(old_cluster.major_version) < 1300) + return; + + /* + * Open the pg_undo files corresponding to the old and new redo locations. + * First, we'll reload pg_controldata output, so that we have up-to-date + * redo locations. + */ + get_control_data(&old_cluster, true); + get_control_data(&new_cluster, true); + snprintf(old_pg_undo_path, + sizeof(old_pg_undo_path), + "%s/pg_undo/%016" INT64_MODIFIER "X", + old_cluster.pgdata, + old_cluster.controldata.redo_location); + snprintf(new_pg_undo_path, + sizeof(new_pg_undo_path), + "%s/pg_undo/%016" INT64_MODIFIER "X", + new_cluster.pgdata, + new_cluster.controldata.redo_location); + old_fd = open(old_pg_undo_path, O_RDONLY, 0); + if (old_fd < 0) + pg_fatal("could not open file \"%s\": %m\n", old_pg_undo_path); + new_fd = open(new_pg_undo_path, O_RDWR, 0); + if (new_fd < 0) + pg_fatal("could not open file \"%s\": %m\n", new_pg_undo_path); + + /* Read the headers */ + read_pg_undo_header(old_fd, &old_cluster, &old_low_logno, &old_next_logno, &num_old_logs); + read_pg_undo_header(new_fd, &new_cluster, &new_low_logno, &new_next_logno, &num_new_logs); + + /* Allocate workspace that is sure to be enough for the merged set */ + logs = malloc(sizeof(*logs) * (num_old_logs + num_new_logs)); + if (logs == NULL) + { + pg_fatal("out of memory\n"); + exit(1); + } + num_logs = 0; + + /* + * Anything below the "low" logno has been entirely discarded, so we'll + * take the higher of the two values. Likewise, the "next" log number to + * allocate should be the higher of the two. 
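+ *
+ * For example (made-up values): old (low = 2, next = 7) merged with
+ * new (low = 1, next = 4) yields (low = 2, next = 7).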
+ */
+ new_low_logno = Max(old_low_logno, new_low_logno);
+ new_next_logno = Max(old_next_logno, new_next_logno);
+
+ /* Merge in the old logs */
+ while (num_old_logs > 0)
+ {
+ UndoLogInfo info;
+
+ read_one_undo_log(old_fd, &old_cluster, &info);
+ merge_undo_log(logs, &num_logs, &info);
+ --num_old_logs;
+ }
+
+ /* Merge in the new logs */
+ while (num_new_logs > 0)
+ {
+ UndoLogInfo info;
+
+ read_one_undo_log(new_fd, &new_cluster, &info);
+ merge_undo_log(logs, &num_logs, &info);
+ --num_new_logs;
+ }
+
+ close(old_fd);
+
+ /* Now write out the new file, much like CheckPointUndoLogs() */
+ if (ftruncate(new_fd, 0) < 0)
+ pg_fatal("could not truncate file \"%s\": %m", new_pg_undo_path);
+ if (lseek(new_fd, 0, SEEK_SET) < 0)
+ pg_fatal("could not seek to start of file \"%s\": %m", new_pg_undo_path);
+
+ /* Compute header checksum */
+ INIT_CRC32C(crc);
+ COMP_CRC32C(crc, &new_low_logno, sizeof(new_low_logno));
+ COMP_CRC32C(crc, &new_next_logno, sizeof(new_next_logno));
+ COMP_CRC32C(crc, &num_logs, sizeof(num_logs));
+ FIN_CRC32C(crc);
+
+ /* Write out the header */
+ if ((write(new_fd, &new_low_logno, sizeof(new_low_logno)) != sizeof(new_low_logno)) ||
+ (write(new_fd, &new_next_logno, sizeof(new_next_logno)) != sizeof(new_next_logno)) ||
+ (write(new_fd, &num_logs, sizeof(num_logs)) != sizeof(num_logs)) ||
+ (write(new_fd, &crc, sizeof(crc)) != sizeof(crc)))
+ pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path);
+
+ /* Write out the undo logs */
+ INIT_CRC32C(crc);
+ for (i = 0; i < num_logs; ++i)
+ {
+ UndoLogMetaData meta_data;
+ UndoLogInfo *info = &logs[i];
+ UndoLogOffset end;
+
+ memset(&meta_data, 0, sizeof(meta_data));
+ meta_data.logno = info->logno;
+ meta_data.tablespace = info->tablespace;
+ meta_data.category = info->category;
+
+ /*
+ * Round the discard offset up so that it points to the first byte in
+ * a segment, and assign that to all three offsets. That means there
+ * is no logical data, and there are no physical files.
+ */
+ end = ((info->discard + UndoLogSegmentSize - 1) / UndoLogSegmentSize)
+ * UndoLogSegmentSize;
+ meta_data.unlogged.insert = meta_data.discard = meta_data.end = end;
+
+ /*
+ * We have whatever was the highest status (though it probably
+ * wouldn't hurt if we set them all to ACTIVE).
+ */ + meta_data.status = info->status; + + + if (write(new_fd, &meta_data, sizeof(meta_data)) != sizeof(meta_data)) + pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path); + + COMP_CRC32C(crc, &meta_data, sizeof(meta_data)); + } + FIN_CRC32C(crc); + + if (write(new_fd, &crc, sizeof(crc)) != sizeof(crc)) + pg_fatal("could not write to file \"%s\": %m", new_pg_undo_path); + + close(new_fd); + free(logs); +} diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 852d8ca4b1..938150dd91 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -20,6 +20,7 @@ #include "access/nbtxlog.h" #include "access/rmgr.h" #include "access/spgxlog.h" +#include "access/undolog_xlog.h" #include "access/xact.h" #include "access/xlog_internal.h" #include "catalog/storage_xlog.h" diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 3c0db2ccf5..6945e3e950 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -47,3 +47,4 @@ PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_i PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) +PG_RMGR(RM_UNDOLOG_ID, "UndoLog", undolog_redo, undolog_desc, undolog_identify, NULL, NULL, NULL) diff --git a/src/include/access/session.h b/src/include/access/session.h index 8fba568029..da0770e614 100644 --- a/src/include/access/session.h +++ b/src/include/access/session.h @@ -17,6 +17,9 @@ /* Avoid including typcache.h */ struct SharedRecordTypmodRegistry; +/* Avoid including undolog.h */ +struct UndoLogSlot; + /* * A struct encapsulating some elements of a user's session. For now this * manages state that applies to parallel query, but it principle it could @@ -27,6 +30,10 @@ typedef struct Session dsm_segment *segment; /* The session-scoped DSM segment. */ dsa_area *area; /* The session-scoped DSA area. */ + /* State managed by undolog.c. */ + struct UndoLogSlot *attached_undo_slots[4]; /* UndoLogCategories */ + bool need_to_choose_undo_tablespace; + /* State managed by typcache.c. */ struct SharedRecordTypmodRegistry *shared_typmod_registry; dshash_table *shared_record_table; diff --git a/src/include/access/undolog.h b/src/include/access/undolog.h new file mode 100644 index 0000000000..c4a6b29ef0 --- /dev/null +++ b/src/include/access/undolog.h @@ -0,0 +1,489 @@ +/*------------------------------------------------------------------------- + * + * undolog.h + * + * PostgreSQL undo log manager. This module is responsible for lifecycle + * management of undo logs and backing files, associating undo logs with + * backends, allocating and managing space within undo logs. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_H +#define UNDOLOG_H + +#include "access/transam.h" +#include "access/xlogreader.h" +#include "catalog/database_internal.h" +#include "catalog/pg_class.h" +#include "common/relpath.h" +#include "storage/bufpage.h" + +#ifndef FRONTEND +#include "storage/lwlock.h" +#endif + +/* The type used to identify an undo log and position within it. */ +typedef uint64 UndoRecPtr; + +/* The type used for undo record lengths. */ +typedef uint16 UndoRecordSize; + +/* Undo log statuses. */ +typedef enum +{ + UNDO_LOG_STATUS_UNUSED = 0, + UNDO_LOG_STATUS_ACTIVE, + UNDO_LOG_STATUS_FULL, + UNDO_LOG_STATUS_DISCARDED +} UndoLogStatus; + +/* + * Undo log categories. These correspond to the different persistence levels + * of relations so that we can discard unlogged and temporary undo data + * wholesale in some circumstance. We also have a separate category for + * 'shared' records that are not associated with a single transactions. Since + * they might live longer than the transaction that created them, and since we + * prefer to avoid interleaving records that don't belong to the same + * transaction, we keep them separate. + */ +typedef enum +{ + UNDO_PERMANENT = 0, + UNDO_UNLOGGED = 1, + UNDO_TEMP = 2, + UNDO_SHARED = 3 +} UndoLogCategory; + +#define UndoLogCategories 4 + +/* + * Convert from relpersistence ('p', 'u', 't') to an UndoLogCategory + * enumerator. + */ +#define UndoLogCategoryForRelPersistence(rp) \ + ((rp) == RELPERSISTENCE_PERMANENT ? UNDO_PERMANENT : \ + (rp) == RELPERSISTENCE_UNLOGGED ? UNDO_UNLOGGED : UNDO_TEMP) + +/* + * Convert from UndoLogCategory to a relpersistence value. There is no + * relpersistence level for UNDO_SHARED, but the only use of this macro is to + * pass a value to ReadBufferWithoutRelcache, which cares only about detecting + * RELPERSISTENCE_TEMP. XXX There must be a better way. + */ +#define RelPersistenceForUndoLogCategory(up) \ + ((up) == UNDO_PERMANENT ? RELPERSISTENCE_PERMANENT : \ + (up) == UNDO_UNLOGGED ? RELPERSISTENCE_UNLOGGED : \ + (up) == UNDO_SHARED ? RELPERSISTENCE_PERMANENT : \ + RELPERSISTENCE_TEMP) + +/* + * Get the appropriate UndoLogCategory value from a Relation. + */ +#define UndoLogCategoryForRelation(rel) \ + (UndoLogCategoryForRelPersistence((rel)->rd_rel->relpersistence)) + +/* Type for offsets within undo logs */ +typedef uint64 UndoLogOffset; + +/* printf-family format string for UndoRecPtr. */ +#define UndoRecPtrFormat "%016" INT64_MODIFIER "X" + +/* printf-family format string for UndoLogOffset. */ +#define UndoLogOffsetFormat UINT64_FORMAT + +/* Number of blocks of BLCKSZ in an undo log segment file. 128 = 1MB. */ +#define UNDOSEG_SIZE 128 + +/* Size of an undo log segment file in bytes. */ +#define UndoLogSegmentSize ((size_t) BLCKSZ * UNDOSEG_SIZE) + +/* The width of an undo log number in bits. 24 allows for 16.7m logs. */ +#define UndoLogNumberBits 24 + +/* The maximum valid undo log number. */ +#define MaxUndoLogNumber ((1 << UndoLogNumberBits) - 1) + +/* The width of an undo log offset in bits. 40 allows for 1TB per log.*/ +#define UndoLogOffsetBits (64 - UndoLogNumberBits) + +/* Special value for undo record pointer which indicates that it is invalid. 
*/ +#define InvalidUndoRecPtr ((UndoRecPtr) 0) + +/* End-of-list value when building linked lists of undo logs. */ +#define InvalidUndoLogNumber -1 + +/* + * The maximum amount of data that can be stored in an undo log. Can be set + * artificially low to test full log behavior. + */ +#define UndoLogMaxSize ((UndoLogOffset) 1 << UndoLogOffsetBits) + +/* Type for numbering undo logs. */ +typedef int UndoLogNumber; + +/* Extract the undo log number from an UndoRecPtr. */ +#define UndoRecPtrGetLogNo(urp) \ + ((urp) >> UndoLogOffsetBits) + +/* Extract the offset from an UndoRecPtr. */ +#define UndoRecPtrGetOffset(urp) \ + ((urp) & ((UINT64CONST(1) << UndoLogOffsetBits) - 1)) + +/* Make an UndoRecPtr from an log number and offset. */ +#define MakeUndoRecPtr(logno, offset) \ + (((uint64) (logno) << UndoLogOffsetBits) | (offset)) + +/* The number of unusable bytes in the header of each block. */ +#define UndoLogBlockHeaderSize SizeOfPageHeaderData + +/* The number of usable bytes we can store per block. */ +#define UndoLogUsableBytesPerPage (BLCKSZ - UndoLogBlockHeaderSize) + +/* Length of undo checkpoint filename */ +#define UNDO_CHECKPOINT_FILENAME_LENGTH 16 + +/* + * UndoRecPtrIsValid + * True iff undoRecPtr is valid. + */ +#define UndoRecPtrIsValid(undoRecPtr) \ + ((bool) ((UndoRecPtr) (undoRecPtr) != InvalidUndoRecPtr)) + +/* Extract the relnode for an undo log. */ +#define UndoRecPtrGetRelNode(urp) \ + UndoRecPtrGetLogNo(urp) + +/* The only valid fork number for undo log buffers. */ +#define UndoLogForkNum MAIN_FORKNUM + +/* Compute the block number that holds a given UndoRecPtr. */ +#define UndoRecPtrGetBlockNum(urp) \ + (UndoRecPtrGetOffset(urp) / BLCKSZ) + +/* Compute the offset of a given UndoRecPtr in the page that holds it. */ +#define UndoRecPtrGetPageOffset(urp) \ + (UndoRecPtrGetOffset(urp) % BLCKSZ) + +/* Compare two undo checkpoint files to find the oldest file. */ +#define UndoCheckPointFilenamePrecedes(file1, file2) \ + (strcmp(file1, file2) < 0) + +/* What is the offset of the i'th non-header byte? */ +#define UndoLogOffsetFromUsableByteNo(i) \ + (((i) / UndoLogUsableBytesPerPage) * BLCKSZ + \ + UndoLogBlockHeaderSize + \ + ((i) % UndoLogUsableBytesPerPage)) + +/* How many non-header bytes are there before a given offset? */ +#define UndoLogOffsetToUsableByteNo(offset) \ + (((offset) % BLCKSZ - UndoLogBlockHeaderSize) + \ + ((offset) / BLCKSZ) * UndoLogUsableBytesPerPage) + +/* Add 'n' usable bytes to offset stepping over headers to find new offset. */ +#define UndoLogOffsetPlusUsableBytes(offset, n) \ + UndoLogOffsetFromUsableByteNo(UndoLogOffsetToUsableByteNo(offset) + (n)) + +/* Populate a RelFileNode from an UndoRecPtr. */ +#define UndoRecPtrAssignRelFileNode(rfn, urp) \ + do \ + { \ + (rfn).spcNode = UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp)); \ + (rfn).dbNode = UndoDbOid; \ + (rfn).relNode = UndoRecPtrGetRelNode(urp); \ + } while (false); + +/* + * Properties of an undo log that don't have explicit WAL records logging + * their changes, to reduce WAL volume. Instead, they change incrementally + * whenever data is inserted as a result of other WAL records. Since the + * values recorded in an online checkpoint may be out of the sync (ie not the + * correct values as at the redo LSN), these are backed up in buffer data on + * first change after each checkpoint. 
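+ *
+ * To make the address arithmetic above concrete, here is a sketch assuming
+ * the default BLCKSZ of 8192 and a 24-byte page header, so that
+ * UndoLogUsableBytesPerPage is 8168 (the values below are illustrative):
+ *
+ *     UndoRecPtr p = MakeUndoRecPtr(5, 0x2040);
+ *
+ *     UndoRecPtrGetLogNo(p)       == 5
+ *     UndoRecPtrGetOffset(p)      == 0x2040
+ *     UndoRecPtrGetBlockNum(p)    == 1            (0x2040 / BLCKSZ)
+ *     UndoRecPtrGetPageOffset(p)  == 64           (0x2040 % BLCKSZ)
+ *     UndoLogOffsetPlusUsableBytes(0x2040, 8168) == 0x4040
+ *
+ * i.e. advancing by one page's worth of usable bytes lands at the same
+ * in-page position in the next block, stepping over that block's header.
+ * A reader might turn p into a buffer tag and pin the underlying block
+ * roughly like this, using the ReadBufferWithoutRelcache() variant that
+ * takes a relpersistence argument (see the bufmgr.h change in this patch);
+ * here cat stands for the log's UndoLogCategory, and the real undo record
+ * access layer sits above this:
+ *
+ *     RelFileNode rnode;
+ *
+ *     UndoRecPtrAssignRelFileNode(rnode, p);
+ *     buf = ReadBufferWithoutRelcache(rnode, UndoLogForkNum,
+ *                                     UndoRecPtrGetBlockNum(p),
+ *                                     RBM_NORMAL, NULL,
+ *                                     RelPersistenceForUndoLogCategory(cat));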
+ */ +typedef struct UndoLogUnloggedMetaData +{ + UndoLogOffset insert; /* next insertion point (head) */ + UndoLogOffset last_xact_start; /* last transaction's first byte in this log */ + UndoLogOffset this_xact_start; /* this transaction's first byte in this log */ + TransactionId xid; /* currently attached/writing xid */ + + /* + * Below two variable are used during recovery when transaction's undo + * records are split across undo logs. Replay of switch will restore + * these two undo record pointers which will be reset on next allocation + * during recovery. */ + UndoRecPtr prevlog_xact_start; /* Transaction's start undo record pointer + * in the previous log. */ + UndoRecPtr prevlog_last_urp; /* Transaction's last undo record pointer in + * the previous log. */ +} UndoLogUnloggedMetaData; + +/* + * Control metadata for an active undo log. Lives in shared memory inside an + * UndoLogSlot object, but also written to disk during checkpoints. + */ +typedef struct UndoLogMetaData +{ + /* Members that are not managed by explicit WAL logs. */ + UndoLogUnloggedMetaData unlogged; + + /* Members that are fixed for the lifetime of the undo log. */ + UndoLogNumber logno; + Oid tablespace; + UndoLogCategory category; + + /* Members that are changed by explicit WAL records. */ + UndoLogStatus status; + UndoLogOffset end; /* one past end of highest segment */ + UndoLogOffset discard; /* oldest data needed (tail) */ +} UndoLogMetaData; + +/* + * Context used to hold undo log state across all the undo log insertions + * corresponding to a single WAL record. + */ +typedef struct UndoLogAllocContext +{ + UndoLogCategory category; + UndoRecPtr try_location; + XLogReaderState *xlog_record; + UndoLogNumber recovery_logno; + uint8 recovery_block_id; + bool new_shared_record_set; + + /* + * The maximum number of undo logs that a single WAL record could insert + * into, modifying its unlogged meta data. Typically the number is 1, but + * it might touch a couple or more in rare cases where space runs out. + */ +#define MAX_META_DATA_IMAGES 4 + int num_meta_data_images; + struct + { + UndoLogNumber logno; + UndoLogUnloggedMetaData data; + } meta_data_images[MAX_META_DATA_IMAGES]; +} UndoLogAllocContext; + +#ifndef FRONTEND + +/* + * The in-memory control object for an undo log. We have a fixed-sized array + * of these. + */ +typedef struct UndoLogSlot +{ + /* + * Protected by UndoLogLock and 'mutex'. Both must be held to steal this + * slot for another undolog. Either may be held to prevent that from + * happening. + */ + UndoLogNumber logno; /* InvalidUndoLogNumber for unused slots */ + + /* Protected by UndoLogLock. */ + UndoLogNumber next_free; /* link for active unattached undo logs */ + + /* Protected by 'mutex'. */ + LWLock mutex; + UndoLogMetaData meta; /* current meta-data */ + pid_t pid; /* InvalidPid for unattached */ + + /* Protected by 'discard_lock'. State used by undo workers. 
*/ + FullTransactionId wait_fxmin; /* trigger for processing this log again */ + UndoRecPtr oldest_data; + LWLock discard_lock; /* prevents discarding while reading */ + LWLock discard_update_lock; /* block updaters during discard */ +} UndoLogSlot; + +extern UndoLogSlot *UndoLogGetSlot(UndoLogNumber logno, bool missing_ok); +extern UndoLogSlot *UndoLogNextSlot(UndoLogSlot *slot); +extern bool AmAttachedToUndoLogSlot(UndoLogSlot *slot); +extern UndoRecPtr UndoLogGetOldestRecord(UndoLogNumber logno, bool *full); + +/* + * Each backend maintains a small hash table mapping undo log numbers to + * UndoLogSlot objects in shared memory. + * + * We also cache the tablespace, category and a recently observed discard + * pointer here, since we need fast access to those. We could also reach them + * via slot->meta, but they can't be accessed without locking (since the + * UndoLogSlot object might be recycled if the log is entirely discard). + * Since tablespace and category are constant for lifetime of the undo log + * number, and the discard pointer only travels in one direction, there is no + * cache invalidation problem to worry about. + */ +typedef struct UndoLogTableEntry +{ + UndoLogNumber number; + UndoLogSlot *slot; + Oid tablespace; + UndoLogCategory category; + UndoRecPtr recent_discard; + char status; /* used by simplehash */ +} UndoLogTableEntry; + +/* + * Instantiate fast inline hash table access functions. We use an identity + * hash function for speed, since we already have integers and don't expect + * many collisions. + */ +#define SH_PREFIX undologtable +#define SH_ELEMENT_TYPE UndoLogTableEntry +#define SH_KEY_TYPE UndoLogNumber +#define SH_KEY number +#define SH_HASH_KEY(tb, key) (key) +#define SH_EQUAL(tb, a, b) ((a) == (b)) +#define SH_SCOPE static inline +#define SH_DECLARE +#define SH_DEFINE +#include "lib/simplehash.h" + +extern PGDLLIMPORT undologtable_hash *undologtable_cache; + +/* + * Find or create an UndoLogTableGetEntry for this log number. This is used + * only for fast look-ups of tablespace and persistence. + */ +static pg_attribute_always_inline UndoLogTableEntry * +UndoLogGetTableEntry(UndoLogNumber logno) +{ + UndoLogTableEntry *entry; + + /* Fast path. */ + entry = undologtable_lookup(undologtable_cache, logno); + if (likely(entry)) + return entry; + + /* Slow path: force cache entry to be created. */ + UndoLogGetSlot(logno, false); + entry = undologtable_lookup(undologtable_cache, logno); + + return entry; +} + +/* + * Look up the tablespace for an undo log in our cache. + */ +static inline Oid +UndoLogNumberGetTablespace(UndoLogNumber logno) +{ + return UndoLogGetTableEntry(logno)->tablespace; +} + +static inline Oid +UndoRecPtrGetTablespace(UndoRecPtr urp) +{ + return UndoLogNumberGetTablespace(UndoRecPtrGetLogNo(urp)); +} + +/* + * Look up the category for an undo log in our cache. + */ +static inline UndoLogCategory +UndoLogNumberGetCategory(UndoLogNumber logno) +{ + return UndoLogGetTableEntry(logno)->category; +} + +static inline UndoLogCategory +UndoRecPtrGetCategory(UndoRecPtr urp) +{ + return UndoLogNumberGetCategory(UndoRecPtrGetLogNo(urp)); +} + +#endif + +/* Space management. 
*/ +extern void UndoLogBeginInsert(UndoLogAllocContext *context, + UndoLogCategory category, + XLogReaderState *xlog_record); +extern void UndoLogRegister(UndoLogAllocContext *context, + uint8 block_id, + UndoLogNumber logno); +extern UndoRecPtr UndoLogAllocate(UndoLogAllocContext *context, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_insert_urp); +extern UndoRecPtr UndoLogAllocateInRecovery(UndoLogAllocContext *context, + TransactionId xid, + uint16 size, + bool *need_xact_header, + UndoRecPtr *last_xact_start, + UndoRecPtr *prevlog_xact_start, + UndoRecPtr *prevlog_last_urp); +extern void UndoLogAdvance(UndoLogAllocContext *context, size_t size); +extern void UndoLogAdvanceFinal(UndoRecPtr insertion_point, size_t size); +extern bool UndoLogDiscard(UndoRecPtr discard_point, TransactionId xid); +extern bool UndoLogRecPtrIsDiscardedSlowPath(UndoRecPtr pointer); + +#ifndef FRONTEND + +/* + * Check if an undo log pointer is discarded. + */ +static inline bool +UndoRecPtrIsDiscarded(UndoRecPtr pointer) +{ + UndoLogNumber logno = UndoRecPtrGetLogNo(pointer); + UndoRecPtr recent_discard; + + /* See if we can answer the question without acquiring any locks. */ + recent_discard = UndoLogGetTableEntry(logno)->recent_discard; + if (likely(recent_discard > pointer)) + return true; + + /* + * It might be discarded or not, but we'll need to do a bit more work to + * find out. + */ + return UndoLogRecPtrIsDiscardedSlowPath(pointer); +} + +#endif + +/* Initialization interfaces. */ +extern void StartupUndoLogs(XLogRecPtr checkPointRedo); +extern void UndoLogShmemInit(void); +extern Size UndoLogShmemSize(void); +extern void UndoLogInit(void); +extern void UndoLogDirectory(Oid tablespace, char *path); +extern void UndoLogSegmentPath(UndoLogNumber logno, int segno, Oid tablespace, + char *path); +extern void ResetUndoLogs(UndoLogCategory category); + +/* Interface use by tablespace.c. */ +extern bool DropUndoLogsInTablespace(Oid tablespace); + +/* GUC interfaces. */ +extern void assign_undo_tablespaces(const char *newval, void *extra); + +/* Checkpointing interfaces. */ +extern void CheckPointUndoLogs(XLogRecPtr checkPointRedo, + XLogRecPtr priorCheckPointRedo); + +/* File sync request management. */ + + +extern UndoRecPtr UndoLogGetLastXactStartPoint(UndoLogNumber logno); +extern UndoRecPtr UndoLogGetNextInsertPtr(UndoLogNumber logno); +extern void UndoLogSwitchSetPrevLogInfo(UndoLogNumber logno, + UndoRecPtr prevlog_last_urp, + UndoRecPtr prevlog_xact_start); +extern void UndoLogSetLSN(XLogRecPtr lsn); +void UndoLogNewSegment(UndoLogNumber logno, Oid tablespace, int segno); +/* Redo interface. */ +extern void undolog_redo(XLogReaderState *record); +/* Discard the undo logs for temp tables */ +extern void TempUndoDiscard(UndoLogNumber); + +/* Test-only interfacing. */ +extern void UndoLogDetachFull(void); + +#endif diff --git a/src/include/access/undolog_xlog.h b/src/include/access/undolog_xlog.h new file mode 100644 index 0000000000..aa9003ba4b --- /dev/null +++ b/src/include/access/undolog_xlog.h @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * undolog_xlog.h + * undo log access XLOG definitions. 
+ * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/undolog_xlog.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOLOG_XLOG_H +#define UNDOLOG_XLOG_H + +#include "access/undolog.h" +#include "access/xlogreader.h" +#include "lib/stringinfo.h" + +/* XLOG records */ +#define XLOG_UNDOLOG_CREATE 0x00 +#define XLOG_UNDOLOG_EXTEND 0x10 +#define XLOG_UNDOLOG_DISCARD 0x20 +#define XLOG_UNDOLOG_SWITCH 0x30 + +/* Create a new undo log. */ +typedef struct xl_undolog_create +{ + UndoLogNumber logno; + Oid tablespace; + UndoLogCategory category; +} xl_undolog_create; + +/* Extend an undo log by adding a new segment. */ +typedef struct xl_undolog_extend +{ + UndoLogNumber logno; + UndoLogOffset end; +} xl_undolog_extend; + +/* Discard space, and possibly destroy or recycle undo log segments. */ +typedef struct xl_undolog_discard +{ + UndoLogNumber logno; + UndoLogOffset discard; + UndoLogOffset end; + TransactionId latestxid; /* latest xid whose undolog are discarded. */ + bool entirely_discarded; +} xl_undolog_discard; + +/* Switch undo log. */ +typedef struct xl_undolog_switch +{ + UndoLogNumber logno; + UndoRecPtr prevlog_xact_start; + UndoRecPtr prevlog_last_urp; +} xl_undolog_switch; + +extern void undolog_desc(StringInfo buf,XLogReaderState *record); +extern const char *undolog_identify(uint8 info); + +#endif diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 4105b59904..4db68f1082 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -44,6 +44,20 @@ extern XLogRedoAction XLogReadBufferForRedoExtended(XLogReaderState *record, extern Buffer XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno, ReadBufferMode mode); +extern bool XLogFindBlockId(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + uint8 *block_id); + +extern XLogRedoAction XLogReadBufferForRedoBlock(XLogReaderState *record, + RelFileNode rnode, + ForkNumber forknum, + BlockNumber blockno, + ReadBufferMode mode, + bool get_cleanup_lock, + Buffer *buf); + extern Relation CreateFakeRelcacheEntry(RelFileNode rnode); extern void FreeFakeRelcacheEntry(Relation fakerel); diff --git a/src/include/catalog/database_internal.h b/src/include/catalog/database_internal.h new file mode 100644 index 0000000000..a2b3a122ef --- /dev/null +++ b/src/include/catalog/database_internal.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * database_internal.h + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/database_internal.h + * + *------------------------------------------------------------------------- + */ +#ifndef DATABASE_INTERNAL_H +#define DATABASE_INTERNAL_H + +/* + * We use this header to define the OIDs of pseudo-database OIDs used in + * buffer tags to hold system data. 
+ */ +#define UndoDbOid 9 + +#endif /* DATABASE_INTERNAL_H */ diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 0902dce5f1..de475cb2d4 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10701,4 +10701,11 @@ proname => 'pg_partition_root', prorettype => 'regclass', proargtypes => 'regclass', prosrc => 'pg_partition_root' }, +# undo logs +{ oid => '5032', descr => 'list undo logs', + proname => 'pg_stat_get_undo_logs', procost => '1', prorows => '10', proretset => 't', + prorettype => 'record', proargtypes => '', + proallargtypes => '{oid,text,text,text,text,text,xid,int4,text}', proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{logno,category,tablespace,discard,insert,end,xid,pid,status}', prosrc => 'pg_stat_get_undo_logs' }, + ] diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0a3ad3a188..1936c5db6f 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -934,6 +934,13 @@ typedef enum WAIT_EVENT_TWOPHASE_FILE_READ, WAIT_EVENT_TWOPHASE_FILE_SYNC, WAIT_EVENT_TWOPHASE_FILE_WRITE, + WAIT_EVENT_UNDO_CHECKPOINT_READ, + WAIT_EVENT_UNDO_CHECKPOINT_SYNC, + WAIT_EVENT_UNDO_CHECKPOINT_WRITE, + WAIT_EVENT_UNDO_FILE_READ, + WAIT_EVENT_UNDO_FILE_WRITE, + WAIT_EVENT_UNDO_FILE_FLUSH, + WAIT_EVENT_UNDO_FILE_SYNC, WAIT_EVENT_WALSENDER_TIMELINE_HISTORY_READ, WAIT_EVENT_WAL_BOOTSTRAP_SYNC, WAIT_EVENT_WAL_BOOTSTRAP_WRITE, diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 509f4b7ef1..a04190aa92 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -37,8 +37,9 @@ typedef enum BufferAccessStrategyType typedef enum { RBM_NORMAL, /* Normal read */ - RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will - * initialize. Also locks the page. */ + RBM_ZERO, /* Don't read from disk, caller will + * initialize. */ + RBM_ZERO_AND_LOCK, /* Like RBM_ZERO, but also locks the page. 
*/ RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page * in "cleanup" mode */ RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ @@ -170,7 +171,10 @@ extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BufferAccessStrategy strategy); extern Buffer ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, - ReadBufferMode mode, BufferAccessStrategy strategy); + ReadBufferMode mode, BufferAccessStrategy strategy, + char relpersistence); +extern void ForgetBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); extern void ReleaseBuffer(Buffer buffer); extern void UnlockReleaseBuffer(Buffer buffer); extern void MarkBufferDirty(Buffer buffer); @@ -227,6 +231,10 @@ extern void AtProcExit_LocalBuffers(void); extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation); +/* in localbuf.c */ +extern void ForgetLocalBuffer(RelFileNode rnode, ForkNumber forkNum, + BlockNumber blockNum); + /* in freelist.c */ extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype); extern void FreeAccessStrategy(BufferAccessStrategy strategy); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index d2a8c52044..b215201f30 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -143,6 +143,7 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern void pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(const char *fname, bool isdir); +extern int fsync_fname_ext(const char *fname, bool isdir, bool perm, int elevel); extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); extern int durable_unlink(const char *fname, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 08e0dc8144..4abb344aec 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -220,7 +220,10 @@ typedef enum BuiltinTrancheIds LWTRANCHE_TBM, LWTRANCHE_PARALLEL_APPEND, LWTRANCHE_SXACT, - LWTRANCHE_FIRST_USER_DEFINED + LWTRANCHE_UNDOLOG, + LWTRANCHE_UNDODISCARD, + LWTRANCHE_REWIND, + LWTRANCHE_FIRST_USER_DEFINED, } BuiltinTrancheIds; /* diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index d286c8c7b1..0fd3b7505d 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -70,6 +70,9 @@ typedef struct SMgrRelationData int md_num_open_segs[MAX_FORKNUM + 1]; struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; + /* For use by implementations. */ + void *private_data; + /* if unowned, list link in list of all unowned SMgrRelations */ dlist_node node; } SMgrRelationData; diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h index 16428c5f5f..59f1da9178 100644 --- a/src/include/storage/sync.h +++ b/src/include/storage/sync.h @@ -34,7 +34,8 @@ typedef enum SyncRequestType */ typedef enum SyncRequestHandler { - SYNC_HANDLER_MD = 0 /* md smgr */ + SYNC_HANDLER_MD = 0, /* md smgr */ + SYNC_HANDLER_UNDO = 1 /* undo smgr */ } SyncRequestHandler; /* diff --git a/src/include/storage/undofile.h b/src/include/storage/undofile.h new file mode 100644 index 0000000000..a5ec30fa7a --- /dev/null +++ b/src/include/storage/undofile.h @@ -0,0 +1,59 @@ +/*------------------------------------------------------------------------- + * + * undofile.h + * undo storage manager public interface declarations. 
+ * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/undofile.h + * + *------------------------------------------------------------------------- + */ +#ifndef UNDOFILE_H +#define UNDOFILE_H + +#include "access/undolog.h" +#include "storage/block.h" +#include "storage/relfilenode.h" +#include "storage/smgr.h" +#include "storage/sync.h" + +extern void undofile_init(void); +extern void undofile_shutdown(void); +extern void undofile_open(SMgrRelation reln); +extern void undofile_close(SMgrRelation reln, ForkNumber forknum); +extern void undofile_create(SMgrRelation reln, ForkNumber forknum, + bool isRedo); +extern bool undofile_exists(SMgrRelation reln, ForkNumber forknum); +extern void undofile_unlink(RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); +extern void undofile_extend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_prefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); +extern void undofile_read(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); +extern void undofile_write(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); +extern void undofile_writeback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); +extern BlockNumber undofile_nblocks(SMgrRelation reln, ForkNumber forknum); +extern void undofile_truncate(SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); +extern void undofile_immedsync(SMgrRelation reln, ForkNumber forknum); + +/* Callbacks used by sync.c. */ +extern int undofile_syncfiletag(const FileTag *tag, char *path); +extern bool undofile_filetagmatches(const FileTag *tag, const FileTag *candidate); + +/* Management of checkpointer requests. 
*/ +extern void undofile_request_sync(UndoLogNumber logno, BlockNumber segno, + Oid tablespace); +extern void undofile_forget_sync(UndoLogNumber logno, BlockNumber segno, + Oid tablespace); +extern void undofile_forget_sync_tablespace(Oid tablespace); +extern void undofile_request_sync_dir(Oid tablespace); + +#endif /* UNDOFILE_H */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index e709177c37..84639eed22 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -431,6 +431,8 @@ extern void GUC_check_errcode(int sqlerrcode); extern bool check_default_tablespace(char **newval, void **extra, GucSource source); extern bool check_temp_tablespaces(char **newval, void **extra, GucSource source); extern void assign_temp_tablespaces(const char *newval, void *extra); +extern bool check_undo_tablespaces(char **newval, void **extra, GucSource source); +extern void assign_undo_tablespaces(const char *newval, void *extra); /* in catalog/namespace.c */ extern bool check_search_path(char **newval, void **extra, GucSource source); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 210e9cd146..7098461dde 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2010,6 +2010,16 @@ pg_stat_sys_tables| SELECT pg_stat_all_tables.relid, pg_stat_all_tables.autoanalyze_count FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text)); +pg_stat_undo_logs| SELECT pg_stat_get_undo_logs.logno, + pg_stat_get_undo_logs.category, + pg_stat_get_undo_logs.tablespace, + pg_stat_get_undo_logs.discard, + pg_stat_get_undo_logs.insert, + pg_stat_get_undo_logs."end", + pg_stat_get_undo_logs.xid, + pg_stat_get_undo_logs.pid, + pg_stat_get_undo_logs.status + FROM pg_stat_get_undo_logs() pg_stat_get_undo_logs(logno, category, tablespace, discard, insert, "end", xid, pid, status); pg_stat_user_functions| SELECT p.oid AS funcid, n.nspname AS schemaname, p.proname AS funcname, -- 2.39.5