diff options
Diffstat (limited to 'src/include')
| -rw-r--r-- | src/include/access/nbtree.h | 123 | ||||
| -rw-r--r-- | src/include/access/nbtxlog.h | 28 | ||||
| -rw-r--r-- | src/include/access/xlog_internal.h | 2 | ||||
| -rw-r--r-- | src/include/storage/standby.h | 2 |
4 files changed, 127 insertions, 28 deletions
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index cad4f2bdeb9..9ac90d74398 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -37,8 +37,9 @@ typedef uint16 BTCycleId; * * In addition, we store the page's btree level (counting upwards from * zero at a leaf page) as well as some flag bits indicating the page type - * and status. If the page is deleted, we replace the level with the - * next-transaction-ID value indicating when it is safe to reclaim the page. + * and status. If the page is deleted, a BTDeletedPageData struct is stored + * in the page's tuple area, while a standard BTPageOpaqueData struct is + * stored in the page special area. * * We also store a "vacuum cycle ID". When a page is split while VACUUM is * processing the index, a nonzero value associated with the VACUUM run is @@ -52,17 +53,17 @@ typedef uint16 BTCycleId; * * NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested * instead. + * + * NOTE: the btpo_level field used to be a union type in order to allow + * deleted pages to store a 32-bit safexid in the same field. We now store + * 64-bit/full safexid values using BTDeletedPageData instead. */ typedef struct BTPageOpaqueData { BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */ BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */ - union - { - uint32 level; /* tree level --- zero for leaf pages */ - TransactionId xact; /* next transaction ID, if deleted */ - } btpo; + uint32 btpo_level; /* tree level --- zero for leaf pages */ uint16 btpo_flags; /* flag bits, see below */ BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */ } BTPageOpaqueData; @@ -78,6 +79,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */ #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ +#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ /* * The max allowed value of a cycle ID is a bit less than 64K. This is @@ -105,10 +107,12 @@ typedef struct BTMetaPageData BlockNumber btm_fastroot; /* current "fast" root location */ uint32 btm_fastlevel; /* tree level of the "fast" root page */ /* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */ - TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted - * pages */ - float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples - * during last cleanup */ + + /* number of deleted, non-recyclable pages during last cleanup */ + uint32 btm_last_cleanup_num_delpages; + /* number of heap tuples during last cleanup */ + float8 btm_last_cleanup_num_heap_tuples; + bool btm_allequalimage; /* are all columns "equalimage"? */ } BTMetaPageData; @@ -220,6 +224,93 @@ typedef struct BTMetaPageData #define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0) #define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0) #define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0) +#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0) + +/* + * BTDeletedPageData is the page contents of a deleted page + */ +typedef struct BTDeletedPageData +{ + FullTransactionId safexid; /* See BTPageIsRecyclable() */ +} BTDeletedPageData; + +static inline void +BTPageSetDeleted(Page page, FullTransactionId safexid) +{ + BTPageOpaque opaque; + PageHeader header; + BTDeletedPageData *contents; + + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + header = ((PageHeader) page); + + opaque->btpo_flags &= ~BTP_HALF_DEAD; + opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID; + header->pd_lower = MAXALIGN(SizeOfPageHeaderData) + + sizeof(BTDeletedPageData); + header->pd_upper = header->pd_special; + + /* Set safexid in deleted page */ + contents = ((BTDeletedPageData *) PageGetContents(page)); + contents->safexid = safexid; +} + +static inline FullTransactionId +BTPageGetDeleteXid(Page page) +{ + BTPageOpaque opaque; + BTDeletedPageData *contents; + + /* We only expect to be called with a deleted page */ + Assert(!PageIsNew(page)); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISDELETED(opaque)); + + /* pg_upgrade'd deleted page -- must be safe to delete now */ + if (!P_HAS_FULLXID(opaque)) + return FirstNormalFullTransactionId; + + /* Get safexid from deleted page */ + contents = ((BTDeletedPageData *) PageGetContents(page)); + return contents->safexid; +} + +/* + * Is an existing page recyclable? + * + * This exists to centralize the policy on which deleted pages are now safe to + * re-use. + * + * Note: PageIsNew() pages are always safe to recycle, but we can't deal with + * them here (caller is responsible for that case themselves). Caller might + * well need special handling for new pages anyway. + */ +static inline bool +BTPageIsRecyclable(Page page) +{ + BTPageOpaque opaque; + + Assert(!PageIsNew(page)); + + /* Recycling okay iff page is deleted and safexid is old enough */ + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * The page was deleted, but when? If it was just deleted, a scan + * might have seen the downlink to it, and will read the page later. + * As long as that can happen, we must keep the deleted page around as + * a tombstone. + * + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have + * seen its downlink, and we can recycle it. + */ + return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page)); + } + + return false; +} /* * Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost @@ -962,7 +1053,7 @@ typedef struct BTOptions { int32 varlena_header_; /* varlena header (do not touch directly!) */ int fillfactor; /* page fill factor in percent (0..100) */ - /* fraction of newly inserted tuples prior to trigger index cleanup */ + /* fraction of newly inserted tuples needed to trigger index cleanup */ float8 vacuum_cleanup_index_scale_factor; bool deduplicate_items; /* Try to deduplicate items? */ } BTOptions; @@ -1066,8 +1157,8 @@ extern OffsetNumber _bt_findsplitloc(Relation rel, Page origpage, */ extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage); -extern void _bt_update_meta_cleanup_info(Relation rel, - TransactionId oldestBtpoXact, float8 numHeapTuples); +extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages, + float8 num_heap_tuples); extern void _bt_upgrademetapage(Page page); extern Buffer _bt_getroot(Relation rel, int access); extern Buffer _bt_gettrueroot(Relation rel); @@ -1084,15 +1175,13 @@ extern void _bt_unlockbuf(Relation rel, Buffer buf); extern bool _bt_conditionallockbuf(Relation rel, Buffer buf); extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); -extern bool _bt_page_recyclable(Page page); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable); extern void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate); -extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf, - TransactionId *oldestBtpoXact); +extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf); /* * prototypes for functions in nbtsearch.c diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 7ae5c98c2b8..3df34fcda2d 100644 --- a/src/include/access/nbtxlog.h +++ b/src/include/access/nbtxlog.h @@ -13,6 +13,7 @@ #ifndef NBTXLOG_H #define NBTXLOG_H +#include "access/transam.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/off.h" @@ -52,7 +53,7 @@ typedef struct xl_btree_metadata uint32 level; BlockNumber fastroot; uint32 fastlevel; - TransactionId oldest_btpo_xact; + uint32 last_cleanup_num_delpages; float8 last_cleanup_num_heap_tuples; bool allequalimage; } xl_btree_metadata; @@ -187,7 +188,7 @@ typedef struct xl_btree_reuse_page { RelFileNode node; BlockNumber block; - TransactionId latestRemovedXid; + FullTransactionId latestRemovedFullXid; } xl_btree_reuse_page; #define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page)) @@ -282,9 +283,12 @@ typedef struct xl_btree_mark_page_halfdead #define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber)) /* - * This is what we need to know about deletion of a btree page. Note we do - * not store any content for the deleted page --- it is just rewritten as empty - * during recovery, apart from resetting the btpo.xact. + * This is what we need to know about deletion of a btree page. Note that we + * only leave behind a small amount of bookkeeping information in deleted + * pages (deleted pages must be kept around as tombstones for a while). It is + * convenient for the REDO routine to regenerate its target page from scratch. + * This is why WAL record describes certain details that are actually directly + * available from the target page. * * Backup Blk 0: target block being deleted * Backup Blk 1: target block's left sibling, if any @@ -296,20 +300,24 @@ typedef struct xl_btree_unlink_page { BlockNumber leftsib; /* target block's left sibling, if any */ BlockNumber rightsib; /* target block's right sibling */ + uint32 level; /* target block's level */ + FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */ /* - * Information needed to recreate the leaf page, when target is an - * internal page. + * Information needed to recreate a half-dead leaf page with correct + * topparent link. The fields are only used when deletion operation's + * target page is an internal page. REDO routine creates half-dead page + * from scratch to keep things simple (this is the same convenient + * approach used for the target page itself). */ BlockNumber leafleftsib; BlockNumber leafrightsib; - BlockNumber topparent; /* next child down in the subtree */ + BlockNumber leaftopparent; /* next child down in the subtree */ - TransactionId btpo_xact; /* value of btpo.xact for use in recovery */ /* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */ } xl_btree_unlink_page; -#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId)) +#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber)) /* * New root log record. There are zero tuples if this is to establish an diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 224cae0246f..8d09eaec93d 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 94d33851d09..38fd85a4316 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -31,6 +31,8 @@ extern void ShutdownRecoveryTransactionEnvironment(void); extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node); +extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node); extern void ResolveRecoveryConflictWithTablespace(Oid tsid); extern void ResolveRecoveryConflictWithDatabase(Oid dbid); |
