summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
Diffstat (limited to 'src/include')
-rw-r--r--src/include/access/nbtree.h123
-rw-r--r--src/include/access/nbtxlog.h28
-rw-r--r--src/include/access/xlog_internal.h2
-rw-r--r--src/include/storage/standby.h2
4 files changed, 127 insertions, 28 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index cad4f2bdeb9..9ac90d74398 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -37,8 +37,9 @@ typedef uint16 BTCycleId;
*
* In addition, we store the page's btree level (counting upwards from
* zero at a leaf page) as well as some flag bits indicating the page type
- * and status. If the page is deleted, we replace the level with the
- * next-transaction-ID value indicating when it is safe to reclaim the page.
+ * and status. If the page is deleted, a BTDeletedPageData struct is stored
+ * in the page's tuple area, while a standard BTPageOpaqueData struct is
+ * stored in the page special area.
*
* We also store a "vacuum cycle ID". When a page is split while VACUUM is
* processing the index, a nonzero value associated with the VACUUM run is
@@ -52,17 +53,17 @@ typedef uint16 BTCycleId;
*
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
* instead.
+ *
+ * NOTE: the btpo_level field used to be a union type in order to allow
+ * deleted pages to store a 32-bit safexid in the same field. We now store
+ * 64-bit/full safexid values using BTDeletedPageData instead.
*/
typedef struct BTPageOpaqueData
{
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
- union
- {
- uint32 level; /* tree level --- zero for leaf pages */
- TransactionId xact; /* next transaction ID, if deleted */
- } btpo;
+ uint32 btpo_level; /* tree level --- zero for leaf pages */
uint16 btpo_flags; /* flag bits, see below */
BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
} BTPageOpaqueData;
@@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */
#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
+#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */
/*
* The max allowed value of a cycle ID is a bit less than 64K. This is
@@ -105,10 +107,12 @@ typedef struct BTMetaPageData
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
- TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
- * pages */
- float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
- * during last cleanup */
+
+ /* number of deleted, non-recyclable pages during last cleanup */
+ uint32 btm_last_cleanup_num_delpages;
+ /* number of heap tuples during last cleanup */
+ float8 btm_last_cleanup_num_heap_tuples;
+
bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
@@ -220,6 +224,93 @@ typedef struct BTMetaPageData
#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageData is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageData
+{
+ FullTransactionId safexid; /* See BTPageIsRecyclable() */
+} BTDeletedPageData;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+ BTPageOpaque opaque;
+ PageHeader header;
+ BTDeletedPageData *contents;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ header = ((PageHeader) page);
+
+ opaque->btpo_flags &= ~BTP_HALF_DEAD;
+ opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+ header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
+ sizeof(BTDeletedPageData);
+ header->pd_upper = header->pd_special;
+
+ /* Set safexid in deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+ BTPageOpaque opaque;
+ BTDeletedPageData *contents;
+
+ /* We only expect to be called with a deleted page */
+ Assert(!PageIsNew(page));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISDELETED(opaque));
+
+ /* pg_upgrade'd deleted page -- must be safe to delete now */
+ if (!P_HAS_FULLXID(opaque))
+ return FirstNormalFullTransactionId;
+
+ /* Get safexid from deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ return contents->safexid;
+}
+
+/*
+ * Is an existing page recyclable?
+ *
+ * This exists to centralize the policy on which deleted pages are now safe to
+ * re-use.
+ *
+ * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
+ * them here (caller is responsible for that case themselves). Caller might
+ * well need special handling for new pages anyway.
+ */
+static inline bool
+BTPageIsRecyclable(Page page)
+{
+ BTPageOpaque opaque;
+
+ Assert(!PageIsNew(page));
+
+ /* Recycling okay iff page is deleted and safexid is old enough */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+ * For that check if the deletion XID could still be visible to
+ * anyone. If not, then no scan that's still in progress could have
+ * seen its downlink, and we can recycle it.
+ */
+ return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+ }
+
+ return false;
+}
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -962,7 +1053,7 @@ typedef struct BTOptions
{
int32 varlena_header_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
- /* fraction of newly inserted tuples prior to trigger index cleanup */
+ /* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
@@ -1066,8 +1157,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage,
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage);
-extern void _bt_update_meta_cleanup_info(Relation rel,
- TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+ float8 num_heap_tuples);
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
@@ -1084,15 +1175,13 @@ extern void _bt_unlockbuf(Relation rel, Buffer buf);
extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
-extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
Relation heapRel,
TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
- TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
/*
* prototypes for functions in nbtsearch.c
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 7ae5c98c2b8..3df34fcda2d 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -13,6 +13,7 @@
#ifndef NBTXLOG_H
#define NBTXLOG_H
+#include "access/transam.h"
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
@@ -52,7 +53,7 @@ typedef struct xl_btree_metadata
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
- TransactionId oldest_btpo_xact;
+ uint32 last_cleanup_num_delpages;
float8 last_cleanup_num_heap_tuples;
bool allequalimage;
} xl_btree_metadata;
@@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page
{
RelFileNode node;
BlockNumber block;
- TransactionId latestRemovedXid;
+ FullTransactionId latestRemovedFullXid;
} xl_btree_reuse_page;
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
@@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/*
- * This is what we need to know about deletion of a btree page. Note we do
- * not store any content for the deleted page --- it is just rewritten as empty
- * during recovery, apart from resetting the btpo.xact.
+ * This is what we need to know about deletion of a btree page. Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while). It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why WAL record describes certain details that are actually directly
+ * available from the target page.
*
* Backup Blk 0: target block being deleted
* Backup Blk 1: target block's left sibling, if any
@@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page
{
BlockNumber leftsib; /* target block's left sibling, if any */
BlockNumber rightsib; /* target block's right sibling */
+ uint32 level; /* target block's level */
+ FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
/*
- * Information needed to recreate the leaf page, when target is an
- * internal page.
+ * Information needed to recreate a half-dead leaf page with correct
+ * topparent link. The fields are only used when deletion operation's
+ * target page is an internal page. REDO routine creates half-dead page
+ * from scratch to keep things simple (this is the same convenient
+ * approach used for the target page itself).
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
- BlockNumber topparent; /* next child down in the subtree */
+ BlockNumber leaftopparent; /* next child down in the subtree */
- TransactionId btpo_xact; /* value of btpo.xact for use in recovery */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;
-#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
+#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
/*
* New root log record. There are zero tuples if this is to establish an
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 224cae0246f..8d09eaec93d 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -31,7 +31,7 @@
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 94d33851d09..38fd85a4316 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void);
extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node);
extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
extern void ResolveRecoveryConflictWithDatabase(Oid dbid);