P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(state->target, itemid);
nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
- nextleveldown.level = opaque->btpo.level - 1;
+ nextleveldown.level = opaque->btpo_level - 1;
}
else
{
if (opaque->btpo_prev != leftcurrent)
bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
- /* Check level, which must be valid for non-ignorable page */
- if (level.level != opaque->btpo.level)
+ /* Check level */
+ if (level.level != opaque->btpo_level)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
- current, level.level, opaque->btpo.level)));
+ current, level.level, opaque->btpo_level)));
/* Verify invariants for page */
bt_target_page_check(state);
bt_child_highkey_check(state,
offset,
NULL,
- topaque->btpo.level);
+ topaque->btpo_level);
}
continue;
}
if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
{
bt_child_highkey_check(state, InvalidOffsetNumber,
- NULL, topaque->btpo.level);
+ NULL, topaque->btpo_level);
}
}
ereport(DEBUG1,
(errcode(ERRCODE_NO_DATA),
errmsg_internal("level %u leftmost page of index \"%s\" was found deleted or half dead",
- opaque->btpo.level, RelationGetRelationName(state->rel)),
+ opaque->btpo_level, RelationGetRelationName(state->rel)),
errdetail_internal("Deleted page found when building scankey from right sibling.")));
/* Be slightly more pro-active in freeing this memory, just in case */
state->targetblock, blkno,
LSN_FORMAT_ARGS(state->targetlsn))));
- /* Check level for non-ignorable page */
- if (!P_IGNORE(opaque) && opaque->btpo.level != target_level - 1)
+ /* Do level sanity check */
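+ /* (skip pages deleted before Postgres 14, whose btpo_level is type-punned) */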
+ if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
+ opaque->btpo_level != target_level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
RelationGetRelationName(state->rel)),
errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
- blkno, target_level - 1, opaque->btpo.level)));
+ blkno, target_level - 1, opaque->btpo_level)));
/* Try to detect circular links */
if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
* check for downlink connectivity.
*/
bt_child_highkey_check(state, downlinkoffnum,
- child, topaque->btpo.level);
+ child, topaque->btpo_level);
/*
* Since there cannot be a concurrent VACUUM operation in readonly mode,
errmsg_internal("harmless interrupted page split detected in index %s",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%X.",
- blkno, opaque->btpo.level,
+ blkno, opaque->btpo_level,
opaque->btpo_prev,
LSN_FORMAT_ARGS(pagelsn))));
return;
elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
RelationGetRelationName(state->rel));
- level = opaque->btpo.level;
+ level = opaque->btpo_level;
itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
itup = (IndexTuple) PageGetItem(page, itemid);
childblk = BTreeTupleGetDownLink(itup);
break;
/* Do an extra sanity check in passing on internal pages */
- if (copaque->btpo.level != level - 1)
+ if (copaque->btpo_level != level - 1)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
RelationGetRelationName(state->rel)),
errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
blkno, childblk,
- level - 1, copaque->btpo.level)));
+ level - 1, copaque->btpo_level)));
- level = copaque->btpo.level;
+ level = copaque->btpo_level;
itemid = PageGetItemIdCareful(state, childblk, child,
P_FIRSTDATAKEY(copaque));
itup = (IndexTuple) PageGetItem(child, itemid);
errmsg("internal index block lacks downlink in index \"%s\"",
RelationGetRelationName(state->rel)),
errdetail_internal("Block=%u level=%u page lsn=%X/%X.",
- blkno, opaque->btpo.level,
+ blkno, opaque->btpo_level,
LSN_FORMAT_ARGS(pagelsn))));
}
}
/*
- * Deleted pages have no sane "level" field, so can only check non-deleted
- * page level
+ * Deleted pages that still use the old 32-bit XID representation have no
+ * sane "level" field because the level was type-punned with the XID, but
+ * all other pages (including pages deleted on Postgres 14+) have a valid
+ * value.
*/
- if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && opaque->btpo.level != 0)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("invalid leaf page level %u for block %u in index \"%s\"",
- opaque->btpo.level, blocknum, RelationGetRelationName(state->rel))));
+ if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
+ {
+ /* Okay, no reason not to trust btpo_level field from page */
- if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) &&
- opaque->btpo.level == 0)
- ereport(ERROR,
- (errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("invalid internal page level 0 for block %u in index \"%s\"",
- blocknum, RelationGetRelationName(state->rel))));
+ if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
+ opaque->btpo_level, blocknum,
+ RelationGetRelationName(state->rel))));
+
+ if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
+ blocknum,
+ RelationGetRelationName(state->rel))));
+ }
/*
* Sanity checks for number of items on page.
* state. This state is nonetheless treated as corruption by VACUUM on
* from version 9.4 on, so do the same here. See _bt_pagedel() for full
* details.
- *
- * Internal pages should never have garbage items, either.
*/
if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
ereport(ERROR,
blocknum, RelationGetRelationName(state->rel)),
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
+ /*
+ * Check that internal pages have no garbage items, and that no page has
+ * an invalid combination of deletion-related page-level flags
+ */
if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
- errmsg("internal page block %u in index \"%s\" has garbage items",
- blocknum, RelationGetRelationName(state->rel))));
+ errmsg_internal("internal page block %u in index \"%s\" has garbage items",
+ blocknum, RelationGetRelationName(state->rel))));
+
+ if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
+ blocknum, RelationGetRelationName(state->rel))));
+
+ if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
+ blocknum, RelationGetRelationName(state->rel))));
return page;
}
/* opaque data */
BlockNumber btpo_prev;
BlockNumber btpo_next;
- union
- {
- uint32 level;
- TransactionId xact;
- } btpo;
+ uint32 btpo_level;
uint16 btpo_flags;
BTCycleId btpo_cycleid;
} BTPageStat;
/* page type (flags) */
if (P_ISDELETED(opaque))
{
- stat->type = 'd';
- stat->btpo.xact = opaque->btpo.xact;
- return;
+ /* We divide deleted pages into leaf ('d') or internal ('D') */
+ if (P_ISLEAF(opaque) || !P_HAS_FULLXID(opaque))
+ stat->type = 'd';
+ else
+ stat->type = 'D';
+
+ /*
+ * Report safexid in a deleted page.
+ *
+ * Handle pg_upgrade'd deleted pages that used the previous safexid
+ * representation in the btpo_level field (this used to be a union type
+ * called "btpo").
+ */
+ if (P_HAS_FULLXID(opaque))
+ {
+ FullTransactionId safexid = BTPageGetDeleteXid(page);
+
+ elog(NOTICE, "deleted page from block %u has safexid %u:%u",
+ blkno, EpochFromFullTransactionId(safexid),
+ XidFromFullTransactionId(safexid));
+ }
+ else
+ elog(NOTICE, "deleted page from block %u has safexid %u",
+ blkno, opaque->btpo_level);
+
+ /* Don't interpret BTDeletedPageData as index tuples */
+ maxoff = InvalidOffsetNumber;
}
else if (P_IGNORE(opaque))
stat->type = 'e';
/* btpage opaque data */
stat->btpo_prev = opaque->btpo_prev;
stat->btpo_next = opaque->btpo_next;
- stat->btpo.level = opaque->btpo.level;
+ stat->btpo_level = opaque->btpo_level;
stat->btpo_flags = opaque->btpo_flags;
stat->btpo_cycleid = opaque->btpo_cycleid;
values[j++] = psprintf("%u", stat.free_size);
values[j++] = psprintf("%u", stat.btpo_prev);
values[j++] = psprintf("%u", stat.btpo_next);
- values[j++] = psprintf("%u", (stat.type == 'd') ? stat.btpo.xact : stat.btpo.level);
+ values[j++] = psprintf("%u", stat.btpo_level);
values[j++] = psprintf("%d", stat.btpo_flags);
tuple = BuildTupleFromCStrings(TupleDescGetAttInMetadata(tupleDesc),
opaque = (BTPageOpaque) PageGetSpecialPointer(uargs->page);
- if (P_ISDELETED(opaque))
- elog(NOTICE, "page is deleted");
-
- fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ if (!P_ISDELETED(opaque))
+ fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ else
+ {
+ /* Don't interpret BTDeletedPageData as index tuples */
+ elog(NOTICE, "page from block " INT64_FORMAT " is deleted", blkno);
+ fctx->max_calls = 0;
+ }
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
- if (P_ISDELETED(opaque))
- elog(NOTICE, "page is deleted");
- fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ if (!P_ISDELETED(opaque))
+ fctx->max_calls = PageGetMaxOffsetNumber(uargs->page);
+ else
+ {
+ /* Don't interpret BTDeletedPageData as index tuples */
+ elog(NOTICE, "page from block is deleted");
+ fctx->max_calls = 0;
+ }
uargs->leafpage = P_ISLEAF(opaque);
uargs->rightmost = P_RIGHTMOST(opaque);
/*
* We need a kluge here to detect API versions prior to 1.8. Earlier
- * versions incorrectly used int4 for certain columns. This caused
- * various problems. For example, an int4 version of the "oldest_xact"
- * column would not work with TransactionId values that happened to exceed
- * PG_INT32_MAX.
+ * versions incorrectly used int4 for certain columns.
*
* There is no way to reliably avoid the problems created by the old
* function definition at this point, so insist that the user update the
*/
if (metad->btm_version >= BTREE_NOVAC_VERSION)
{
- values[j++] = psprintf("%u", metad->btm_oldest_btpo_xact);
+ values[j++] = psprintf(INT64_FORMAT,
+ (int64) metad->btm_last_cleanup_num_delpages);
values[j++] = psprintf("%f", metad->btm_last_cleanup_num_heap_tuples);
values[j++] = metad->btm_allequalimage ? "t" : "f";
}
CREATE INDEX test1_a_idx ON test1 USING btree (a);
\x
SELECT * FROM bt_metap('test1_a_idx');
--[ RECORD 1 ]-----------+-------
-magic | 340322
-version | 4
-root | 1
-level | 0
-fastroot | 1
-fastlevel | 0
-oldest_xact | 0
-last_cleanup_num_tuples | -1
-allequalimage | t
+-[ RECORD 1 ]-------------+-------
+magic | 340322
+version | 4
+root | 1
+level | 0
+fastroot | 1
+fastlevel | 0
+last_cleanup_num_delpages | 0
+last_cleanup_num_tuples | -1
+allequalimage | t
SELECT * FROM bt_page_stats('test1_a_idx', -1);
ERROR: invalid block number
free_size | 8128
btpo_prev | 0
btpo_next | 0
-btpo | 0
+btpo_level | 0
btpo_flags | 3
SELECT * FROM bt_page_stats('test1_a_idx', 2);
AS 'MODULE_PATHNAME', 'page_checksum_1_9'
LANGUAGE C STRICT PARALLEL SAFE;
+--
+-- bt_metap()
+--
+DROP FUNCTION bt_metap(text);
+CREATE FUNCTION bt_metap(IN relname text,
+ OUT magic int4,
+ OUT version int4,
+ OUT root int8,
+ OUT level int8,
+ OUT fastroot int8,
+ OUT fastlevel int8,
+ OUT last_cleanup_num_delpages int8,
+ OUT last_cleanup_num_tuples float8,
+ OUT allequalimage boolean)
+AS 'MODULE_PATHNAME', 'bt_metap'
+LANGUAGE C STRICT PARALLEL SAFE;
+
--
-- bt_page_stats()
--
OUT free_size int4,
OUT btpo_prev int8,
OUT btpo_next int8,
- OUT btpo int4,
+ OUT btpo_level int8,
OUT btpo_flags int4)
AS 'MODULE_PATHNAME', 'bt_page_stats_1_9'
LANGUAGE C STRICT PARALLEL SAFE;
page = BufferGetPage(buffer);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- /* Determine page type, and update totals */
-
+ /*
+ * Determine page type, and update totals.
+ *
+ * Note that we arbitrarily bucket deleted pages together without
+ * considering if they're leaf pages or internal pages.
+ */
if (P_ISDELETED(opaque))
indexStat.deleted_pages++;
else if (P_IGNORE(opaque))
<para>
If no tuples were deleted from the heap, B-tree indexes are still
- scanned at the <command>VACUUM</command> cleanup stage when at least one
- of the following conditions is met: the index statistics are stale, or
- the index contains deleted pages that can be recycled during cleanup.
- Index statistics are considered to be stale if the number of newly
- inserted tuples exceeds the <varname>vacuum_cleanup_index_scale_factor</varname>
+ scanned at the <command>VACUUM</command> cleanup stage when the
+ index's statistics are stale. Index statistics are considered
+ stale if the number of newly inserted tuples exceeds the
+ <varname>vacuum_cleanup_index_scale_factor</varname>
fraction of the total number of heap tuples detected by the previous
statistics collection. The total number of heap tuples is stored in
the index meta-page. Note that the meta-page does not include this data
index's metapage. For example:
<screen>
test=# SELECT * FROM bt_metap('pg_cast_oid_index');
--[ RECORD 1 ]-----------+-------
-magic | 340322
-version | 4
-root | 1
-level | 0
-fastroot | 1
-fastlevel | 0
-oldest_xact | 582
-last_cleanup_num_tuples | 1000
-allequalimage | f
+-[ RECORD 1 ]-------------+-------
+magic | 340322
+version | 4
+root | 1
+level | 0
+fastroot | 1
+fastlevel | 0
+last_cleanup_num_delpages | 0
+last_cleanup_num_tuples | 230
+allequalimage | f
</screen>
</para>
</listitem>
free_size | 3668
btpo_prev | 0
btpo_next | 0
-btpo | 0
+btpo_level | 0
btpo_flags | 3
</screen>
</para>
* same exclusion effect on primary and standby.
*/
if (InHotStandby)
- {
- FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid;
- FullTransactionId nextXid = ReadNextFullTransactionId();
- uint64 diff;
-
- /*
- * ResolveRecoveryConflictWithSnapshot operates on 32-bit
- * TransactionIds, so truncate the logged FullTransactionId. If the
- * logged value is very old, so that XID wrap-around already happened
- * on it, there can't be any snapshots that still see it.
- */
- diff = U64FromFullTransactionId(nextXid) -
- U64FromFullTransactionId(latestRemovedFullXid);
- if (diff < MaxTransactionId / 2)
- {
- TransactionId latestRemovedXid;
-
- latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
- ResolveRecoveryConflictWithSnapshot(latestRemovedXid,
- xlrec->node);
- }
- }
+ ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+ xlrec->node);
}
void
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
- if (metad->btm_fastlevel >= opaque->btpo.level)
+ if (metad->btm_fastlevel >= opaque->btpo_level)
{
/* no update wanted */
_bt_relbuf(rel, metabuf);
if (metad->btm_version < BTREE_NOVAC_VERSION)
_bt_upgrademetapage(metapg);
metad->btm_fastroot = BufferGetBlockNumber(buf);
- metad->btm_fastlevel = opaque->btpo.level;
+ metad->btm_fastlevel = opaque->btpo_level;
MarkBufferDirty(metabuf);
}
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
- xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
xlmeta.last_cleanup_num_heap_tuples =
metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
lopaque->btpo_prev = oopaque->btpo_prev;
/* handle btpo_next after rightpage buffer acquired */
- lopaque->btpo.level = oopaque->btpo.level;
+ lopaque->btpo_level = oopaque->btpo_level;
/* handle btpo_cycleid after rightpage buffer acquired */
/*
ropaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = oopaque->btpo_next;
- ropaque->btpo.level = oopaque->btpo.level;
+ ropaque->btpo_level = oopaque->btpo_level;
ropaque->btpo_cycleid = lopaque->btpo_cycleid;
/*
uint8 xlinfo;
XLogRecPtr recptr;
- xlrec.level = ropaque->btpo.level;
+ xlrec.level = ropaque->btpo_level;
/* See comments below on newitem, orignewitem, and posting lists */
xlrec.firstrightoff = firstrightoff;
xlrec.newitemoff = newitemoff;
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* Find the leftmost page at the next level up */
- pbuf = _bt_get_endpoint(rel, opaque->btpo.level + 1, false, NULL);
+ pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL);
/* Set up a phony stack entry pointing there */
stack = &fakestack;
stack->bts_blkno = BufferGetBlockNumber(pbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags = BTP_ROOT;
- rootopaque->btpo.level =
- ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
+ rootopaque->btpo_level =
+ ((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo_level + 1;
rootopaque->btpo_cycleid = 0;
/* update metapage data */
metad->btm_root = rootblknum;
- metad->btm_level = rootopaque->btpo.level;
+ metad->btm_level = rootopaque->btpo_level;
metad->btm_fastroot = rootblknum;
- metad->btm_fastlevel = rootopaque->btpo.level;
+ metad->btm_fastlevel = rootopaque->btpo_level;
/*
* Insert the left page pointer into the new root page. The root page is
md.level = metad->btm_level;
md.fastroot = rootblknum;
md.fastlevel = metad->btm_level;
- md.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ md.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
md.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
md.allequalimage = metad->btm_allequalimage;
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
- TransactionId latestRemovedXid);
+ FullTransactionId safexid);
static void _bt_delitems_delete(Relation rel, Buffer buf,
TransactionId latestRemovedXid,
OffsetNumber *deletable, int ndeletable,
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
BlockNumber scanblkno,
bool *rightsib_empty,
- TransactionId *oldestBtpoXact,
uint32 *ndeleted);
static bool _bt_lock_subtree_parent(Relation rel, BlockNumber child,
BTStack stack,
metad->btm_level = level;
metad->btm_fastroot = rootbknum;
metad->btm_fastlevel = level;
- metad->btm_oldest_btpo_xact = InvalidTransactionId;
+ metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
metad->btm_allequalimage = allequalimage;
/* Set version number and fill extra fields added into version 3 */
metad->btm_version = BTREE_NOVAC_VERSION;
- metad->btm_oldest_btpo_xact = InvalidTransactionId;
+ metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
/* Only a REINDEX can set this field */
Assert(!metad->btm_allequalimage);
}
/*
- * _bt_update_meta_cleanup_info() -- Update cleanup-related information in
- * the metapage.
- *
- * This routine checks if provided cleanup-related information is matching
- * to those written in the metapage. On mismatch, metapage is overwritten.
+ * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup().
+ *
+ * This routine is called at the end of each VACUUM's btvacuumcleanup()
+ * call. Its purpose is to maintain the metapage fields that are used by
+ * _bt_vacuum_needs_cleanup() to decide whether or not a btvacuumscan()
+ * call should go ahead for an entire VACUUM operation.
+ *
+ * See btvacuumcleanup() and _bt_vacuum_needs_cleanup() for details of
+ * the two fields that we maintain here.
+ *
+ * The information that we maintain for btvacuumcleanup() describes the
+ * state of the index (as well as the table it indexes) just _after_ the
+ * ongoing VACUUM operation. The next _bt_vacuum_needs_cleanup() call
+ * will consider the information we saved for it during the next VACUUM
+ * operation (assuming that there will be no btbulkdelete() call during
+ * the next VACUUM operation -- if there is then the question of skipping
+ * btvacuumscan() doesn't even arise).
*/
void
-_bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
- float8 numHeapTuples)
+_bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+ float8 num_heap_tuples)
{
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
- bool needsRewrite = false;
+ bool rewrite = false;
XLogRecPtr recptr;
- /* read the metapage and check if it needs rewrite */
+ /*
+ * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage
+ * field started out as a TransactionId field called btm_oldest_btpo_xact.
+ * Both "versions" are just uint32 fields. It was convenient to repurpose
+ * the field when we began to use 64-bit XIDs in deleted pages.
+ *
+ * It's possible that a pg_upgrade'd database will contain an XID value in
+ * what is now recognized as the metapage's btm_last_cleanup_num_delpages
+ * field. _bt_vacuum_needs_cleanup() may even believe that this value
+ * indicates that there are lots of pages that it needs to recycle, when
+ * in reality there are only one or two. The worst that can happen is
+ * that there will be a call to btvacuumscan a little earlier, which will
+ * set btm_last_cleanup_num_delpages to a sane value when we're called.
+ */
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
- /* outdated version of metapage always needs rewrite */
+ /* A BTREE_MIN_VERSION metapage always needs rewrite (dynamic upgrade) */
if (metad->btm_version < BTREE_NOVAC_VERSION)
- needsRewrite = true;
- else if (metad->btm_oldest_btpo_xact != oldestBtpoXact ||
- metad->btm_last_cleanup_num_heap_tuples != numHeapTuples)
- needsRewrite = true;
+ rewrite = true;
+ else if (metad->btm_last_cleanup_num_delpages != num_delpages)
+ rewrite = true;
+ else if (metad->btm_last_cleanup_num_heap_tuples != num_heap_tuples)
+ rewrite = true;
- if (!needsRewrite)
+ if (!rewrite)
{
_bt_relbuf(rel, metabuf);
return;
_bt_upgrademetapage(metapg);
/* update cleanup-related information */
- metad->btm_oldest_btpo_xact = oldestBtpoXact;
- metad->btm_last_cleanup_num_heap_tuples = numHeapTuples;
+ metad->btm_last_cleanup_num_delpages = num_delpages;
+ metad->btm_last_cleanup_num_heap_tuples = num_heap_tuples;
MarkBufferDirty(metabuf);
/* write wal record if needed */
md.level = metad->btm_level;
md.fastroot = metad->btm_fastroot;
md.fastlevel = metad->btm_fastlevel;
- md.oldest_btpo_xact = oldestBtpoXact;
- md.last_cleanup_num_heap_tuples = numHeapTuples;
+ md.last_cleanup_num_delpages = num_delpages;
+ md.last_cleanup_num_heap_tuples = num_heap_tuples;
md.allequalimage = metad->btm_allequalimage;
XLogRegisterBufData(0, (char *) &md, sizeof(xl_btree_metadata));
}
END_CRIT_SECTION();
+
_bt_relbuf(rel, metabuf);
}
* because that's not set in a "fast root".
*/
if (!P_IGNORE(rootopaque) &&
- rootopaque->btpo.level == rootlevel &&
+ rootopaque->btpo_level == rootlevel &&
P_LEFTMOST(rootopaque) &&
P_RIGHTMOST(rootopaque))
{
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
- rootopaque->btpo.level = 0;
+ rootopaque->btpo_level = 0;
rootopaque->btpo_cycleid = 0;
/* Get raw page pointer for metapage */
metapg = BufferGetPage(metabuf);
metad->btm_level = 0;
metad->btm_fastroot = rootblkno;
metad->btm_fastlevel = 0;
- metad->btm_oldest_btpo_xact = InvalidTransactionId;
+ metad->btm_last_cleanup_num_delpages = 0;
metad->btm_last_cleanup_num_heap_tuples = -1.0;
MarkBufferDirty(rootbuf);
md.level = 0;
md.fastroot = rootblkno;
md.fastlevel = 0;
- md.oldest_btpo_xact = InvalidTransactionId;
+ md.last_cleanup_num_delpages = 0;
md.last_cleanup_num_heap_tuples = -1.0;
md.allequalimage = metad->btm_allequalimage;
rootblkno = rootopaque->btpo_next;
}
- /* Note: can't check btpo.level on deleted pages */
- if (rootopaque->btpo.level != rootlevel)
+ if (rootopaque->btpo_level != rootlevel)
elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
rootblkno, RelationGetRelationName(rel),
- rootopaque->btpo.level, rootlevel);
+ rootopaque->btpo_level, rootlevel);
}
/*
rootblkno = rootopaque->btpo_next;
}
- /* Note: can't check btpo.level on deleted pages */
- if (rootopaque->btpo.level != rootlevel)
+ if (rootopaque->btpo_level != rootlevel)
elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",
rootblkno, RelationGetRelationName(rel),
- rootopaque->btpo.level, rootlevel);
+ rootopaque->btpo_level, rootlevel);
return rootbuf;
}
* Log the reuse of a page from the FSM.
*/
static void
-_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
{
xl_btree_reuse_page xlrec_reuse;
/* XLOG stuff */
xlrec_reuse.node = rel->rd_node;
xlrec_reuse.block = blkno;
- xlrec_reuse.latestRemovedXid = latestRemovedXid;
+ xlrec_reuse.latestRemovedFullXid = safexid;
XLogBeginInsert();
XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
if (_bt_conditionallockbuf(rel, buf))
{
page = BufferGetPage(buf);
- if (_bt_page_recyclable(page))
+
+ /*
+ * It's possible to find an all-zeroes page in an index. For
+ * example, a backend might successfully extend the relation
+ * one page and then crash before it is able to make a WAL
+ * entry for adding the page. If we find a zeroed page then
+ * reclaim it immediately.
+ */
+ if (PageIsNew(page))
+ {
+ /* Okay to use page. Initialize and return it. */
+ _bt_pageinit(page, BufferGetPageSize(buf));
+ return buf;
+ }
+
+ if (BTPageIsRecyclable(page))
{
/*
* If we are generating WAL for Hot Standby then create a
* WAL record that will allow us to conflict with queries
* running on standby, in case they have snapshots older
- * than btpo.xact. This can only apply if the page does
- * have a valid btpo.xact value, ie not if it's new. (We
- * must check that because an all-zero page has no special
- * space.)
+ * than the page's safexid value.
*/
- if (XLogStandbyInfoActive() && RelationNeedsWAL(rel) &&
- !PageIsNew(page))
- {
- BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
- _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
- }
+ if (XLogStandbyInfoActive() && RelationNeedsWAL(rel))
+ _bt_log_reuse_page(rel, blkno,
+ BTPageGetDeleteXid(page));
- /* Okay to use page. Re-initialize and return it */
+ /* Okay to use page. Re-initialize and return it. */
_bt_pageinit(page, BufferGetPageSize(buf));
return buf;
}
PageInit(page, size, sizeof(BTPageOpaqueData));
}
-/*
- * _bt_page_recyclable() -- Is an existing page recyclable?
- *
- * This exists to make sure _bt_getbuf and btvacuumscan have the same
- * policy about whether a page is safe to re-use. But note that _bt_getbuf
- * knows enough to distinguish the PageIsNew condition from the other one.
- * At some point it might be appropriate to redesign this to have a three-way
- * result value.
- */
-bool
-_bt_page_recyclable(Page page)
-{
- BTPageOpaque opaque;
-
- /*
- * It's possible to find an all-zeroes page in an index --- for example, a
- * backend might successfully extend the relation one page and then crash
- * before it is able to make a WAL entry for adding the page. If we find a
- * zeroed page then reclaim it.
- */
- if (PageIsNew(page))
- return true;
-
- /*
- * Otherwise, recycle if deleted and too old to have any processes
- * interested in it.
- */
- opaque = (BTPageOpaque) PageGetSpecialPointer(page);
- if (P_ISDELETED(opaque) &&
- GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact))
- return true;
- return false;
-}
-
/*
* Delete item(s) from a btree leaf page during VACUUM.
*
* that the btvacuumscan scan has yet to reach; they'll get counted later
* instead.
*
- * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
- * responsible for maintaining *oldestBtpoXact in the case of pages that were
- * deleted by a previous VACUUM.
- *
* NOTE: this leaks memory. Rather than trying to clean up everything
* carefully, it's better to run it in a temp context that can be reset
* frequently.
*/
uint32
-_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
+_bt_pagedel(Relation rel, Buffer leafbuf)
{
uint32 ndeleted = 0;
BlockNumber rightsib;
{
/* Check for interrupts in _bt_unlink_halfdead_page */
if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,
- &rightsib_empty, oldestBtpoXact,
- &ndeleted))
+ &rightsib_empty, &ndeleted))
{
/*
* _bt_unlink_halfdead_page should never fail, since we
}
Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
- Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
- *oldestBtpoXact));
rightsib = opaque->btpo_next;
* containing leafbuf. (We always set *rightsib_empty for caller, just to be
* consistent.)
*
- * We maintain *oldestBtpoXact for pages that are deleted by the current
- * VACUUM operation here. This must be handled here because we conservatively
- * assume that there needs to be a new call to ReadNextTransactionId() each
- * time a page gets deleted. See comments about the underlying assumption
- * below.
- *
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
* On success exit, we'll be holding pin and write lock. On failure exit,
* we'll release both pin and lock before returning (we define it that way
*/
static bool
_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
- bool *rightsib_empty, TransactionId *oldestBtpoXact,
- uint32 *ndeleted)
+ bool *rightsib_empty, uint32 *ndeleted)
{
BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
BlockNumber leafleftsib;
BTMetaPageData *metad = NULL;
ItemId itemid;
Page page;
- PageHeader header;
BTPageOpaque opaque;
+ FullTransactionId safexid;
bool rightsib_is_rightmost;
- int targetlevel;
+ uint32 targetlevel;
IndexTuple leafhikey;
- BlockNumber nextchild;
+ BlockNumber leaftopparent;
page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
leftsib = opaque->btpo_prev;
- targetlevel = opaque->btpo.level;
+ targetlevel = opaque->btpo_level;
Assert(targetlevel > 0);
/*
!P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
- nextchild = InvalidBlockNumber;
+
+ /* Leaf page is also target page: don't set leaftopparent */
+ leaftopparent = InvalidBlockNumber;
}
else
{
+ IndexTuple finaldataitem;
+
if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||
P_ISLEAF(opaque))
elog(ERROR, "half-dead page changed status unexpectedly in block %u of index \"%s\"",
target, RelationGetRelationName(rel));
- /* Remember the next non-leaf child down in the subtree */
+ /* Target is internal: set leaftopparent for next call here... */
itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));
- nextchild = BTreeTupleGetDownLink((IndexTuple) PageGetItem(page, itemid));
- if (nextchild == leafblkno)
- nextchild = InvalidBlockNumber;
+ finaldataitem = (IndexTuple) PageGetItem(page, itemid);
+ leaftopparent = BTreeTupleGetDownLink(finaldataitem);
+ /* ...except when it would be a redundant pointer-to-self */
+ if (leaftopparent == leafblkno)
+ leaftopparent = InvalidBlockNumber;
}
/*
* no lock was held.
*/
if (target != leafblkno)
- BTreeTupleSetTopParent(leafhikey, nextchild);
+ BTreeTupleSetTopParent(leafhikey, leaftopparent);
/*
* Mark the page itself deleted. It can be recycled when all current
* transactions are gone. Storing GetTopTransactionId() would work, but
* we're in VACUUM and would not otherwise have an XID. Having already
- * updated links to the target, ReadNextTransactionId() suffices as an
+ * updated links to the target, ReadNextFullTransactionId() suffices as an
* upper bound. Any scan having retained a now-stale link is advertising
* in its PGPROC an xmin less than or equal to the value we read here. It
* will continue to do so, holding back the xmin horizon, for the duration
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
- opaque->btpo_flags &= ~BTP_HALF_DEAD;
- opaque->btpo_flags |= BTP_DELETED;
- opaque->btpo.xact = ReadNextTransactionId();
/*
- * Remove the remaining tuples on the page. This keeps things simple for
- * WAL consistency checking.
+ * Store an upper bound XID that's used to determine when the deleted page
+ * is no longer needed as a tombstone
*/
- header = (PageHeader) page;
- header->pd_lower = SizeOfPageHeaderData;
- header->pd_upper = header->pd_special;
+ safexid = ReadNextFullTransactionId();
+ BTPageSetDeleted(page, safexid);
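+ /* (BTPageSetDeleted() also sets the deleted page flags and empties the tuple area) */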
+ opaque->btpo_cycleid = 0;
/* And update the metapage, if needed */
if (BufferIsValid(metabuf))
if (target != leafblkno)
XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);
- /* information on the unlinked block */
+ /* information stored on the target/to-be-unlinked block */
xlrec.leftsib = leftsib;
xlrec.rightsib = rightsib;
- xlrec.btpo_xact = opaque->btpo.xact;
+ xlrec.level = targetlevel;
+ xlrec.safexid = safexid;
/* information needed to recreate the leaf block (if not the target) */
xlrec.leafleftsib = leafleftsib;
xlrec.leafrightsib = leafrightsib;
- xlrec.topparent = nextchild;
+ xlrec.leaftopparent = leaftopparent;
XLogRegisterData((char *) &xlrec, SizeOfBtreeUnlinkPage);
xlmeta.level = metad->btm_level;
xlmeta.fastroot = metad->btm_fastroot;
xlmeta.fastlevel = metad->btm_fastlevel;
- xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;
xlmeta.last_cleanup_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
xlmeta.allequalimage = metad->btm_allequalimage;
_bt_relbuf(rel, lbuf);
_bt_relbuf(rel, rbuf);
- if (!TransactionIdIsValid(*oldestBtpoXact) ||
- TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
- *oldestBtpoXact = opaque->btpo.xact;
+ /* If the target is not leafbuf, we're done with it now -- release it */
+ if (target != leafblkno)
+ _bt_relbuf(rel, buf);
/*
* If btvacuumscan won't revisit this page in a future btvacuumpage call
if (target <= scanblkno)
(*ndeleted)++;
- /* If the target is not leafbuf, we're done with it now -- release it */
- if (target != leafblkno)
- _bt_relbuf(rel, buf);
-
return true;
}
IndexBulkDeleteCallback callback;
void *callback_state;
BTCycleId cycleid;
- BlockNumber totFreePages; /* true total # of free pages */
- TransactionId oldestBtpoXact;
MemoryContext pagedelcontext;
} BTVacState;
* _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
*
* Called by btvacuumcleanup when btbulkdelete was never called because no
- * tuples need to be deleted.
+ * tuples needed to be deleted by VACUUM.
*
* When we return false, VACUUM can even skip the cleanup-only call to
* btvacuumscan (i.e. there will be no btvacuumscan call for this index at
Buffer metabuf;
Page metapg;
BTMetaPageData *metad;
- bool result = false;
+ BTOptions *relopts;
+ float8 cleanup_scale_factor;
+ uint32 btm_version;
+ BlockNumber prev_num_delpages;
+ float8 prev_num_heap_tuples;
+ /*
+ * Copy details from metapage to local variables quickly.
+ *
+ * Note that we deliberately avoid using cached version of metapage here.
+ */
metabuf = _bt_getbuf(info->index, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metad = BTPageGetMeta(metapg);
+ btm_version = metad->btm_version;
- /*
- * XXX: If IndexVacuumInfo contained the heap relation, we could be more
- * aggressive about vacuuming non catalog relations by passing the table
- * to GlobalVisCheckRemovableXid().
- */
-
- if (metad->btm_version < BTREE_NOVAC_VERSION)
+ if (btm_version < BTREE_NOVAC_VERSION)
{
/*
- * Do cleanup if metapage needs upgrade, because we don't have
- * cleanup-related meta-information yet.
+ * Metapage needs to be dynamically upgraded to store fields that are
+ * only present when btm_version >= BTREE_NOVAC_VERSION
*/
- result = true;
- }
- else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
- GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact))
- {
- /*
- * If any oldest btpo.xact from a previously deleted page in the index
- * is visible to everyone, then at least one deleted page can be
- * recycled -- don't skip cleanup.
- */
- result = true;
- }
- else
- {
- BTOptions *relopts;
- float8 cleanup_scale_factor;
- float8 prev_num_heap_tuples;
-
- /*
- * If table receives enough insertions and no cleanup was performed,
- * then index would appear have stale statistics. If scale factor is
- * set, we avoid that by performing cleanup if the number of inserted
- * tuples exceeds vacuum_cleanup_index_scale_factor fraction of
- * original tuples count.
- */
- relopts = (BTOptions *) info->index->rd_options;
- cleanup_scale_factor = (relopts &&
- relopts->vacuum_cleanup_index_scale_factor >= 0)
- ? relopts->vacuum_cleanup_index_scale_factor
- : vacuum_cleanup_index_scale_factor;
- prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
-
- if (cleanup_scale_factor <= 0 ||
- info->num_heap_tuples < 0 ||
- prev_num_heap_tuples <= 0 ||
- (info->num_heap_tuples - prev_num_heap_tuples) /
- prev_num_heap_tuples >= cleanup_scale_factor)
- result = true;
+ _bt_relbuf(info->index, metabuf);
+ return true;
}
+ prev_num_delpages = metad->btm_last_cleanup_num_delpages;
+ prev_num_heap_tuples = metad->btm_last_cleanup_num_heap_tuples;
_bt_relbuf(info->index, metabuf);
- return result;
+
+ /*
+ * If the underlying table has received a sufficiently high number of
+ * insertions since the last VACUUM operation that called btvacuumscan(),
+ * then have the current VACUUM operation call btvacuumscan() now. This
+ * happens when the statistics are deemed stale.
+ *
+ * XXX: We should have a more principled way of determining what
+ * "staleness" means. The vacuum_cleanup_index_scale_factor GUC (and the
+ * index-level storage param) seem hard to tune in a principled way.
+ */
+ relopts = (BTOptions *) info->index->rd_options;
+ cleanup_scale_factor = (relopts &&
+ relopts->vacuum_cleanup_index_scale_factor >= 0)
+ ? relopts->vacuum_cleanup_index_scale_factor
+ : vacuum_cleanup_index_scale_factor;
+
+ if (cleanup_scale_factor <= 0 ||
+ info->num_heap_tuples < 0 ||
+ prev_num_heap_tuples <= 0 ||
+ (info->num_heap_tuples - prev_num_heap_tuples) /
+ prev_num_heap_tuples >= cleanup_scale_factor)
+ return true;
+
+ /*
+ * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the
+ * total size of the index. We can reasonably expect (though are not
+ * guaranteed) to be able to recycle this many pages if we decide to do a
+ * btvacuumscan call during the ongoing btvacuumcleanup.
+ *
+ * Our approach won't reliably avoid "wasted" cleanup-only btvacuumscan
+ * calls. That is, we can end up scanning the entire index without ever
+ * placing even 1 of the prev_num_delpages pages in the free space map, at
+ * least in certain narrow cases (see nbtree/README section on recycling
+ * deleted pages for details). This rarely matters in practice.
+ */
+ if (prev_num_delpages > RelationGetNumberOfBlocks(info->index) / 20)
+ return true;
+
+ return false;
}
/*
IndexBulkDeleteResult *
btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
+ BlockNumber num_delpages;
+
/* No-op in ANALYZE ONLY mode */
if (info->analyze_only)
return stats;
/*
- * If btbulkdelete was called, we need not do anything, just return the
- * stats from the latest btbulkdelete call. If it wasn't called, we might
- * still need to do a pass over the index, to recycle any newly-recyclable
- * pages or to obtain index statistics. _bt_vacuum_needs_cleanup
- * determines if either are needed.
+ * If btbulkdelete was called, we need not do anything (we just maintain
+ * the information used within _bt_vacuum_needs_cleanup() by calling
+ * _bt_set_cleanup_info() below).
*
- * Since we aren't going to actually delete any leaf items, there's no
- * need to go through all the vacuum-cycle-ID pushups.
+ * If btbulkdelete was _not_ called, then we have a choice to make: we
+ * must decide whether or not a btvacuumscan() call is needed now (i.e.
+ * whether the ongoing VACUUM operation can entirely avoid a physical scan
+ * of the index). A call to _bt_vacuum_needs_cleanup() decides it for us
+ * now.
*/
if (stats == NULL)
{
- /* Check if we need a cleanup */
+ /* Check if VACUUM operation can entirely avoid btvacuumscan() call */
if (!_bt_vacuum_needs_cleanup(info))
return NULL;
+ /*
+ * Since we aren't going to actually delete any leaf items, there's no
+ * need to go through all the vacuum-cycle-ID pushups here
+ */
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
btvacuumscan(info, stats, NULL, NULL, 0);
}
+ /*
+ * By here, we know for sure that this VACUUM operation won't be skipping
+ * its btvacuumscan() call. Maintain the count of the current number of
+ * heap tuples in the metapage. Also maintain the num_delpages value.
+ * This information will be used by _bt_vacuum_needs_cleanup() during
+ * future VACUUM operations that don't need to call btbulkdelete().
+ *
+ * num_delpages is the number of deleted pages now in the index that were
+ * not safe to place in the FSM to be recycled just yet. We expect that
+ * it will almost certainly be possible to place all of these pages in the
+ * FSM during the next VACUUM operation. That factor alone might cause
+ * _bt_vacuum_needs_cleanup() to force the next VACUUM to proceed with a
+ * btvacuumscan() call.
+ *
+ * Note: We must delay the _bt_set_cleanup_info() call until this late
+ * stage of VACUUM (the btvacuumcleanup() phase), to keep num_heap_tuples
+ * accurate. The btbulkdelete()-time num_heap_tuples value is generally
+ * just pg_class.reltuples for the heap relation _before_ VACUUM began.
+ * In general cleanup info should describe the state of the index/table
+ * _after_ VACUUM finishes.
+ */
+ Assert(stats->pages_deleted >= stats->pages_free);
+ num_delpages = stats->pages_deleted - stats->pages_free;
+ _bt_set_cleanup_info(info->index, num_delpages, info->num_heap_tuples);
+
/*
* It's quite possible for us to be fooled by concurrent page splits into
* double-counting some index tuples, so disbelieve any total that exceeds
* deleted, and looking for old deleted pages that can be recycled. Both
* btbulkdelete and btvacuumcleanup invoke this (the latter only if no
* btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
- * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
- * is maintained.
*
* The caller is responsible for initially allocating/zeroing a stats struct
* and for obtaining a vacuum cycle ID if necessary.
bool needLock;
/*
- * Reset counts that will be incremented during the scan; needed in case
- * of multiple scans during a single VACUUM command
+ * Reset fields that track information about the entire index now. This
+ * avoids double-counting in the case where a single VACUUM command
+ * requires multiple scans of the index.
+ *
+ * Avoid resetting the tuples_removed field here, since it tracks
+ * information about the VACUUM command, and so must last across each call
+ * to btvacuumscan().
+ *
+ * (Note that pages_free is treated as state about the whole index, not
+ * the current VACUUM. This is appropriate because RecordFreeIndexPage()
+ * calls are idempotent, and get repeated for the same deleted pages in
+ * some scenarios. The point for us is to track the number of recyclable
+ * pages in the index at the end of the VACUUM command.)
*/
+ stats->num_pages = 0;
stats->estimated_count = false;
stats->num_index_tuples = 0;
stats->pages_deleted = 0;
+ stats->pages_free = 0;
/* Set up info to pass down to btvacuumpage */
vstate.info = info;
vstate.callback = callback;
vstate.callback_state = callback_state;
vstate.cycleid = cycleid;
- vstate.totFreePages = 0;
- vstate.oldestBtpoXact = InvalidTransactionId;
/* Create a temporary memory context to run _bt_pagedel in */
vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext,
}
}
+ /* Set statistics num_pages field to final size of index */
+ stats->num_pages = num_pages;
+
MemoryContextDelete(vstate.pagedelcontext);
/*
* Note that if no recyclable pages exist, we don't bother vacuuming the
* FSM at all.
*/
- if (vstate.totFreePages > 0)
+ if (stats->pages_free > 0)
IndexFreeSpaceMapVacuum(rel);
-
- /*
- * Maintain the oldest btpo.xact and a count of the current number of heap
- * tuples in the metapage (for the benefit of _bt_vacuum_needs_cleanup).
- *
- * The page with the oldest btpo.xact is typically a page deleted by this
- * VACUUM operation, since pages deleted by a previous VACUUM operation
- * tend to be placed in the FSM (by the current VACUUM operation) -- such
- * pages are not candidates to be the oldest btpo.xact. (Note that pages
- * placed in the FSM are reported as deleted pages in the bulk delete
- * statistics, despite not counting as deleted pages for the purposes of
- * determining the oldest btpo.xact.)
- */
- _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
- info->num_heap_tuples);
-
- /* update statistics */
- stats->num_pages = num_pages;
- stats->pages_free = vstate.totFreePages;
}
/*
}
}
- /* Page is valid, see what to do with it */
- if (_bt_page_recyclable(page))
+ if (!opaque || BTPageIsRecyclable(page))
{
/* Okay to recycle this page (which could be leaf or internal) */
RecordFreeIndexPage(rel, blkno);
- vstate->totFreePages++;
stats->pages_deleted++;
+ stats->pages_free++;
}
else if (P_ISDELETED(opaque))
{
* recycle yet.
*/
stats->pages_deleted++;
-
- /* Maintain the oldest btpo.xact */
- if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
- TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
- vstate->oldestBtpoXact = opaque->btpo.xact;
}
else if (P_ISHALFDEAD(opaque))
{
/*
* Half-dead leaf page. Try to delete now. Might update
- * oldestBtpoXact and pages_deleted below.
+ * pages_deleted below.
*/
attempt_pagedel = true;
}
* count. There will be no double-counting.
*/
Assert(blkno == scanblkno);
- stats->pages_deleted += _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
+ stats->pages_deleted += _bt_pagedel(rel, buf);
MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */
* we're on the level 1 and asked to lock leaf page in write mode,
* then lock next page in write mode, because it must be a leaf.
*/
- if (opaque->btpo.level == 1 && access == BT_WRITE)
+ if (opaque->btpo_level == 1 && access == BT_WRITE)
page_access = BT_WRITE;
/* drop the read lock on the page, then acquire one on its child */
}
/* Done? */
- if (opaque->btpo.level == level)
+ if (opaque->btpo_level == level)
break;
- if (opaque->btpo.level < level)
+ if (opaque->btpo_level < level)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg_internal("btree level %u not found in index \"%s\"",
/* Initialize BT opaque state */
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
opaque->btpo_prev = opaque->btpo_next = P_NONE;
- opaque->btpo.level = level;
+ opaque->btpo_level = level;
opaque->btpo_flags = (level > 0) ? 0 : BTP_LEAF;
opaque->btpo_cycleid = 0;
md->btm_fastlevel = xlrec->fastlevel;
/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
Assert(md->btm_version >= BTREE_NOVAC_VERSION);
- md->btm_oldest_btpo_xact = xlrec->oldest_btpo_xact;
+ md->btm_last_cleanup_num_delpages = xlrec->last_cleanup_num_delpages;
md->btm_last_cleanup_num_heap_tuples = xlrec->last_cleanup_num_heap_tuples;
md->btm_allequalimage = xlrec->allequalimage;
ropaque->btpo_prev = origpagenumber;
ropaque->btpo_next = spagenumber;
- ropaque->btpo.level = xlrec->level;
+ ropaque->btpo_level = xlrec->level;
ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
ropaque->btpo_cycleid = 0;
pageop->btpo_prev = xlrec->leftblk;
pageop->btpo_next = xlrec->rightblk;
- pageop->btpo.level = 0;
+ pageop->btpo_level = 0;
pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
pageop->btpo_cycleid = 0;
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record);
BlockNumber leftsib;
BlockNumber rightsib;
+ uint32 level;
+ bool isleaf;
+ FullTransactionId safexid;
Buffer leftbuf;
Buffer target;
Buffer rightbuf;
leftsib = xlrec->leftsib;
rightsib = xlrec->rightsib;
+ level = xlrec->level;
+ isleaf = (level == 0);
+ safexid = xlrec->safexid;
+
+ /* No leaftopparent for level 0 (leaf page) or level 1 target */
+ Assert(xlrec->leaftopparent == InvalidBlockNumber || level > 1);
/*
* In normal operation, we would lock all the pages this WAL record
pageop->btpo_prev = leftsib;
pageop->btpo_next = rightsib;
- pageop->btpo.xact = xlrec->btpo_xact;
- pageop->btpo_flags = BTP_DELETED;
- if (!BlockNumberIsValid(xlrec->topparent))
+ pageop->btpo_level = level;
+ BTPageSetDeleted(page, safexid);
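+ /* (BTPageSetDeleted() set BTP_DELETED and BTP_HAS_FULLXID in btpo_flags) */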
+ if (isleaf)
pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;
Buffer leafbuf;
IndexTupleData trunctuple;
+ Assert(!isleaf);
+
leafbuf = XLogInitBufferForRedo(record, 3);
page = (Page) BufferGetPage(leafbuf);
pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF;
pageop->btpo_prev = xlrec->leafleftsib;
pageop->btpo_next = xlrec->leafrightsib;
- pageop->btpo.level = 0;
+ pageop->btpo_level = 0;
pageop->btpo_cycleid = 0;
/* Add a dummy hikey item */
MemSet(&trunctuple, 0, sizeof(IndexTupleData));
trunctuple.t_info = sizeof(IndexTupleData);
- BTreeTupleSetTopParent(&trunctuple, xlrec->topparent);
+ BTreeTupleSetTopParent(&trunctuple, xlrec->leaftopparent);
if (PageAddItem(page, (Item) &trunctuple, sizeof(IndexTupleData), P_HIKEY,
false, false) == InvalidOffsetNumber)
pageop->btpo_flags = BTP_ROOT;
pageop->btpo_prev = pageop->btpo_next = P_NONE;
- pageop->btpo.level = xlrec->level;
+ pageop->btpo_level = xlrec->level;
if (xlrec->level == 0)
pageop->btpo_flags |= BTP_LEAF;
pageop->btpo_cycleid = 0;
_bt_restore_meta(record, 2);
}
+/*
+ * In general VACUUM must defer recycling as a way of avoiding certain race
+ * conditions. Deleted pages contain a safexid value that is used by VACUUM
+ * to determine whether or not it's safe to place a page that was deleted by
+ * VACUUM earlier into the FSM now. See nbtree/README.
+ *
+ * As far as any backend operating during original execution is concerned, the
+ * FSM is a cache of recycle-safe pages; the mere presence of the page in the
+ * FSM indicates that the page must already be safe to recycle (actually,
+ * _bt_getbuf() verifies it's safe using BTPageIsRecyclable(), but that's just
+ * because it would be unwise to completely trust the FSM, given its current
+ * limitations).
+ *
+ * This isn't sufficient to prevent similar concurrent recycling race
+ * conditions during Hot Standby, though. For that we need to log a
+ * xl_btree_reuse_page record at the point that a page is actually recycled
+ * and reused for an entirely unrelated page inside _bt_split(). These
+ * records include the same safexid value from the original deleted page,
+ * stored in the record's latestRemovedFullXid field.
+ *
+ * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
+ * to determine if it's safe to recycle a page. This mirrors our own test:
+ * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
+ * Consequently, one XID value achieves the same exclusion effect on primary
+ * and standby.
+ */
static void
btree_xlog_reuse_page(XLogReaderState *record)
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
- /*
- * Btree reuse_page records exist to provide a conflict point when we
- * reuse pages in the index via the FSM. That's all they do though.
- *
- * latestRemovedXid was the page's btpo.xact. The
- * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually
- * mirrors the pgxact->xmin > limitXmin test in
- * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the
- * same exclusion effect on primary and standby.
- */
if (InHotStandby)
- {
- ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid,
- xlrec->node);
- }
+ ResolveRecoveryConflictWithSnapshotFullXid(xlrec->latestRemovedFullXid,
+ xlrec->node);
}
void
{
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
- appendStringInfo(buf, "left %u; right %u; btpo_xact %u; ",
- xlrec->leftsib, xlrec->rightsib,
- xlrec->btpo_xact);
- appendStringInfo(buf, "leafleft %u; leafright %u; topparent %u",
+ appendStringInfo(buf, "left %u; right %u; level %u; safexid %u:%u; ",
+ xlrec->leftsib, xlrec->rightsib, xlrec->level,
+ EpochFromFullTransactionId(xlrec->safexid),
+ XidFromFullTransactionId(xlrec->safexid));
+ appendStringInfo(buf, "leafleft %u; leafright %u; leaftopparent %u",
xlrec->leafleftsib, xlrec->leafrightsib,
- xlrec->topparent);
+ xlrec->leaftopparent);
break;
}
case XLOG_BTREE_NEWROOT:
{
xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) rec;
- appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u",
+ appendStringInfo(buf, "rel %u/%u/%u; latestRemovedXid %u:%u",
xlrec->node.spcNode, xlrec->node.dbNode,
- xlrec->node.relNode, xlrec->latestRemovedXid);
+ xlrec->node.relNode,
+ EpochFromFullTransactionId(xlrec->latestRemovedFullXid),
+ XidFromFullTransactionId(xlrec->latestRemovedFullXid));
break;
}
case XLOG_BTREE_META_CLEANUP:
xlrec = (xl_btree_metadata *) XLogRecGetBlockData(record, 0,
NULL);
- appendStringInfo(buf, "oldest_btpo_xact %u; last_cleanup_num_heap_tuples: %f",
- xlrec->oldest_btpo_xact,
+ appendStringInfo(buf, "last_cleanup_num_delpages %u; last_cleanup_num_heap_tuples: %f",
+ xlrec->last_cleanup_num_delpages,
xlrec->last_cleanup_num_heap_tuples);
break;
}
true);
}
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node)
+{
+ /*
+ * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+ * so truncate the logged FullTransactionId. If the logged value is very
+ * old, so that XID wrap-around already happened on it, there can't be any
+ * snapshots that still see it.
+ */
+ FullTransactionId nextXid = ReadNextFullTransactionId();
+ uint64 diff;
+
+ diff = U64FromFullTransactionId(nextXid) -
+ U64FromFullTransactionId(latestRemovedFullXid);
+ if (diff < MaxTransactionId / 2)
+ {
+ TransactionId latestRemovedXid;
+
+ latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid);
+ ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node);
+ }
+}
+
void
ResolveRecoveryConflictWithTablespace(Oid tsid)
{
*
* In addition, we store the page's btree level (counting upwards from
* zero at a leaf page) as well as some flag bits indicating the page type
- * and status. If the page is deleted, we replace the level with the
- * next-transaction-ID value indicating when it is safe to reclaim the page.
+ * and status. If the page is deleted, a BTDeletedPageData struct is stored
+ * in the page's tuple area, while a standard BTPageOpaqueData struct is
+ * stored in the page special area.
*
* We also store a "vacuum cycle ID". When a page is split while VACUUM is
* processing the index, a nonzero value associated with the VACUUM run is
*
* NOTE: the BTP_LEAF flag bit is redundant since level==0 could be tested
* instead.
+ *
+ * NOTE: the btpo_level field used to be a union type in order to allow
+ * deleted pages to store a 32-bit safexid in the same field. We now store
+ * 64-bit/full safexid values using BTDeletedPageData instead.
*/
typedef struct BTPageOpaqueData
{
BlockNumber btpo_prev; /* left sibling, or P_NONE if leftmost */
BlockNumber btpo_next; /* right sibling, or P_NONE if rightmost */
- union
- {
- uint32 level; /* tree level --- zero for leaf pages */
- TransactionId xact; /* next transaction ID, if deleted */
- } btpo;
+ uint32 btpo_level; /* tree level --- zero for leaf pages */
uint16 btpo_flags; /* flag bits, see below */
BTCycleId btpo_cycleid; /* vacuum cycle ID of latest split */
} BTPageOpaqueData;
#define BTP_SPLIT_END (1 << 5) /* rightmost page of split group */
#define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */
#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
+#define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */
/*
* The max allowed value of a cycle ID is a bit less than 64K. This is
BlockNumber btm_fastroot; /* current "fast" root location */
uint32 btm_fastlevel; /* tree level of the "fast" root page */
/* remaining fields only valid when btm_version >= BTREE_NOVAC_VERSION */
- TransactionId btm_oldest_btpo_xact; /* oldest btpo_xact among all deleted
- * pages */
- float8 btm_last_cleanup_num_heap_tuples; /* number of heap tuples
- * during last cleanup */
+
+ /* number of deleted, non-recyclable pages during last cleanup */
+ uint32 btm_last_cleanup_num_delpages;
+ /* number of heap tuples during last cleanup */
+ float8 btm_last_cleanup_num_heap_tuples;
+
bool btm_allequalimage; /* are all columns "equalimage"? */
} BTMetaPageData;
#define P_IGNORE(opaque) (((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD)) != 0)
#define P_HAS_GARBAGE(opaque) (((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
#define P_INCOMPLETE_SPLIT(opaque) (((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
+#define P_HAS_FULLXID(opaque) (((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+
+/*
+ * BTDeletedPageData is the page contents of a deleted page
+ */
+typedef struct BTDeletedPageData
+{
+ FullTransactionId safexid; /* See BTPageIsRecyclable() */
+} BTDeletedPageData;
+
+static inline void
+BTPageSetDeleted(Page page, FullTransactionId safexid)
+{
+ BTPageOpaque opaque;
+ PageHeader header;
+ BTDeletedPageData *contents;
+
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ header = ((PageHeader) page);
+
+ opaque->btpo_flags &= ~BTP_HALF_DEAD;
+ opaque->btpo_flags |= BTP_DELETED | BTP_HAS_FULLXID;
+ header->pd_lower = MAXALIGN(SizeOfPageHeaderData) +
+ sizeof(BTDeletedPageData);
+ header->pd_upper = header->pd_special;
+
+ /* Set safexid in deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ contents->safexid = safexid;
+}
+
+static inline FullTransactionId
+BTPageGetDeleteXid(Page page)
+{
+ BTPageOpaque opaque;
+ BTDeletedPageData *contents;
+
+ /* We only expect to be called with a deleted page */
+ Assert(!PageIsNew(page));
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISDELETED(opaque));
+
+	/* pg_upgrade'd deleted page -- must be safe to recycle now */
+ if (!P_HAS_FULLXID(opaque))
+ return FirstNormalFullTransactionId;
+
+ /* Get safexid from deleted page */
+ contents = ((BTDeletedPageData *) PageGetContents(page));
+ return contents->safexid;
+}
+
+/*
+ * Is an existing page recyclable?
+ *
+ * This exists to centralize the policy on which deleted pages are now safe to
+ * re-use.
+ *
+ * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
+ * them here (the caller is responsible for that case).  Callers may well
+ * need special handling for new pages anyway.
+ */
+static inline bool
+BTPageIsRecyclable(Page page)
+{
+ BTPageOpaque opaque;
+
+ Assert(!PageIsNew(page));
+
+ /* Recycling okay iff page is deleted and safexid is old enough */
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ if (P_ISDELETED(opaque))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+		 * To check for that, see whether the deletion XID could still be
+		 * visible to anyone.  If not, then no scan that's still in progress
+		 * could have seen its downlink, and we can recycle the page.
+ */
+ return GlobalVisCheckRemovableFullXid(NULL, BTPageGetDeleteXid(page));
+ }
+
+ return false;
+}
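As a hedged sketch of the expected VACUUM-side caller (something along the
lines of btvacuumpage() in nbtree.c; rel, buf, blkno, and stats are
assumptions from that context):

	Page		page = BufferGetPage(buf);

	if (!PageIsNew(page) && BTPageIsRecyclable(page))
	{
		/* Deleted page whose safexid is old enough: hand it to the FSM */
		RecordFreeIndexPage(rel, blkno);
		stats->pages_free++;
	}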
/*
* Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
{
int32 varlena_header_; /* varlena header (do not touch directly!) */
int fillfactor; /* page fill factor in percent (0..100) */
- /* fraction of newly inserted tuples prior to trigger index cleanup */
+ /* fraction of newly inserted tuples needed to trigger index cleanup */
float8 vacuum_cleanup_index_scale_factor;
bool deduplicate_items; /* Try to deduplicate items? */
} BTOptions;
*/
extern void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,
bool allequalimage);
-extern void _bt_update_meta_cleanup_info(Relation rel,
- TransactionId oldestBtpoXact, float8 numHeapTuples);
+extern void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages,
+ float8 num_heap_tuples);
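A sketch of the corresponding call site (assumed to live in btvacuumcleanup();
the num_delpages arithmetic is an assumption about that context):

	/* Deleted pages that could not also be recycled by this VACUUM */
	num_delpages = stats->pages_deleted - stats->pages_free;
	_bt_set_cleanup_info(rel, num_delpages, num_heap_tuples);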
extern void _bt_upgrademetapage(Page page);
extern Buffer _bt_getroot(Relation rel, int access);
extern Buffer _bt_gettrueroot(Relation rel);
extern bool _bt_conditionallockbuf(Relation rel, Buffer buf);
extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf);
extern void _bt_pageinit(Page page, Size size);
-extern bool _bt_page_recyclable(Page page);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *deletable, int ndeletable,
BTVacuumPosting *updatable, int nupdatable);
extern void _bt_delitems_delete_check(Relation rel, Buffer buf,
Relation heapRel,
TM_IndexDeleteOp *delstate);
-extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf,
- TransactionId *oldestBtpoXact);
+extern uint32 _bt_pagedel(Relation rel, Buffer leafbuf);
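With the oldestBtpoXact output parameter gone, the caller sketch reduces to
this (assuming the btvacuumpage() bookkeeping):

	/* _bt_pagedel() now simply returns the number of pages it deleted */
	stats->pages_deleted += _bt_pagedel(rel, buf);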
/*
* prototypes for functions in nbtsearch.c
#ifndef NBTXLOG_H
#define NBTXLOG_H
+#include "access/transam.h"
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
#include "storage/off.h"
uint32 level;
BlockNumber fastroot;
uint32 fastlevel;
- TransactionId oldest_btpo_xact;
+ uint32 last_cleanup_num_delpages;
float8 last_cleanup_num_heap_tuples;
bool allequalimage;
} xl_btree_metadata;
{
RelFileNode node;
BlockNumber block;
- TransactionId latestRemovedXid;
+ FullTransactionId latestRemovedFullXid;
} xl_btree_reuse_page;
#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
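On the primary, the record might be assembled roughly as follows (a sketch
assuming a helper like _bt_log_reuse_page() in nbtpage.c):

	static void
	_bt_log_reuse_page(Relation rel, BlockNumber blkno, FullTransactionId safexid)
	{
		xl_btree_reuse_page xlrec_reuse;

		/*
		 * The buffer is not registered with the record, since nothing on the
		 * page changes; the record exists only as a hot-standby conflict
		 * point.
		 */
		xlrec_reuse.node = rel->rd_node;
		xlrec_reuse.block = blkno;
		xlrec_reuse.latestRemovedFullXid = safexid;

		XLogBeginInsert();
		XLogRegisterData((char *) &xlrec_reuse, SizeOfBtreeReusePage);
		XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);
	}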
#define SizeOfBtreeMarkPageHalfDead (offsetof(xl_btree_mark_page_halfdead, topparent) + sizeof(BlockNumber))
/*
- * This is what we need to know about deletion of a btree page. Note we do
- * not store any content for the deleted page --- it is just rewritten as empty
- * during recovery, apart from resetting the btpo.xact.
+ * This is what we need to know about deletion of a btree page. Note that we
+ * only leave behind a small amount of bookkeeping information in deleted
+ * pages (deleted pages must be kept around as tombstones for a while). It is
+ * convenient for the REDO routine to regenerate its target page from scratch.
+ * This is why the WAL record describes certain details that are actually
+ * directly available from the target page.
*
* Backup Blk 0: target block being deleted
* Backup Blk 1: target block's left sibling, if any
{
BlockNumber leftsib; /* target block's left sibling, if any */
BlockNumber rightsib; /* target block's right sibling */
+ uint32 level; /* target block's level */
+ FullTransactionId safexid; /* target block's BTPageSetDeleted() XID */
/*
- * Information needed to recreate the leaf page, when target is an
- * internal page.
+ * Information needed to recreate a half-dead leaf page with correct
+ * topparent link. The fields are only used when the deletion operation's
+ * target page is an internal page. REDO routine creates half-dead page
+ * from scratch to keep things simple (this is the same convenient
+ * approach used for the target page itself).
*/
BlockNumber leafleftsib;
BlockNumber leafrightsib;
-	BlockNumber topparent;		/* next child down in the subtree */
-	TransactionId btpo_xact;	/* value of btpo.xact for use in recovery */
+	BlockNumber leaftopparent;	/* next child down in the subtree */
/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_UNLINK_PAGE_META */
} xl_btree_unlink_page;
-#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, btpo_xact) + sizeof(TransactionId))
+#define SizeOfBtreeUnlinkPage (offsetof(xl_btree_unlink_page, leaftopparent) + sizeof(BlockNumber))
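To make the regenerate-from-scratch point concrete, a sketch of the REDO-side
reconstruction (assuming the shape of btree_xlog_unlink_page(); buffer holds
the target block and xlrec is the record body):

	Page		page = (Page) BufferGetPage(buffer);
	BTPageOpaque pageop;

	/* Rebuild the deleted target page entirely from the WAL record */
	_bt_pageinit(page, BufferGetPageSize(buffer));
	pageop = (BTPageOpaque) PageGetSpecialPointer(page);

	pageop->btpo_prev = xlrec->leftsib;
	pageop->btpo_next = xlrec->rightsib;
	pageop->btpo_level = xlrec->level;
	pageop->btpo_cycleid = 0;
	BTPageSetDeleted(page, xlrec->safexid); /* sets BTP_DELETED + safexid */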
/*
* New root log record. There are zero tuples if this is to establish an
/*
* Each page of XLOG file has a header like this:
*/
-#define XLOG_PAGE_MAGIC 0xD109 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD10A /* can be used as WAL version indicator */
typedef struct XLogPageHeaderData
{
extern void ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid,
RelFileNode node);
+extern void ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+ RelFileNode node);
extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
extern void ResolveRecoveryConflictWithDatabase(Oid dbid);