#include "utils/snapmgr.h"
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
-static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack);
+static bool _bt_mark_page_halfdead(Relation rel, Buffer leafbuf,
+ BTStack stack);
static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
- bool *rightsib_empty);
+ bool *rightsib_empty,
+ TransactionId *oldestBtpoXact);
static TransactionId _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
OffsetNumber *deletable, int ndeletable);
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
}
/*
- * _bt_pagedel() -- Delete a page from the b-tree, if legal to do so.
+ * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.
*
- * This action unlinks the page from the b-tree structure, removing all
+ * This action unlinks the leaf page from the b-tree structure, removing all
* pointers leading to it --- but not touching its own left and right links.
* The page cannot be physically reclaimed right away, since other processes
* may currently be trying to follow links leading to the page; they have to
* be allowed to use its right-link to recover. See nbtree/README.
*
* On entry, the target buffer must be pinned and locked (either read or write
- * lock is OK). This lock and pin will be dropped before exiting.
+ * lock is OK). The page must be an empty leaf page, which may be half-dead
+ * already (a half-dead page should only be passed to us when an earlier
+ * VACUUM operation was interrupted, though). Note in particular that caller
+ * should never pass a buffer containing an existing deleted page here. The
+ * lock and pin on caller's buffer will be dropped before we return.
*
* Returns the number of pages successfully deleted (zero if page cannot
- * be deleted now; could be more than one if parent or sibling pages were
- * deleted too).
+ * be deleted now; could be more than one if parent or right sibling pages
+ * were deleted too).
+ *
+ * Maintains *oldestBtpoXact for any pages that get deleted. Caller is
+ * responsible for maintaining *oldestBtpoXact in the case of pages that were
+ * deleted by a previous VACUUM.
*
* NOTE: this leaks memory. Rather than trying to clean up everything
* carefully, it's better to run it in a temp context that can be reset
* frequently.
*/
int
-_bt_pagedel(Relation rel, Buffer buf)
+_bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
{
int ndeleted = 0;
BlockNumber rightsib;
for (;;)
{
- page = BufferGetPage(buf);
+ page = BufferGetPage(leafbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/*
* Internal pages are never deleted directly, only as part of deleting
* the whole branch all the way down to leaf level.
+ *
+ * Also check for deleted pages here. Caller never passes us a fully
+ * deleted page. Only VACUUM can delete pages, so there can't have
+ * been a concurrent deletion. Assume that we reached any deleted
+ * page encountered here by following a sibling link, and that the
+ * index is corrupt.
*/
- if (!P_ISLEAF(opaque))
+ Assert(!P_ISDELETED(opaque));
+ if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))
{
/*
* Pre-9.4 page deletion only marked internal pages as half-dead,
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
- _bt_relbuf(rel, buf);
+
+ if (P_ISDELETED(opaque))
+ ereport(LOG,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg_internal("found deleted block %u while following right link in index \"%s\"",
+ BufferGetBlockNumber(leafbuf),
+ RelationGetRelationName(rel))));
+
+ _bt_relbuf(rel, leafbuf);
return ndeleted;
}
/*
* We can never delete rightmost pages nor root pages. While at it,
- * check that page is not already deleted and is empty.
+ * check that page is empty, since it's possible that the leafbuf page
+ * was empty a moment ago, but has since had some inserts.
*
* To keep the algorithm simple, we also never delete an incompletely
* split page (they should be rare enough that this doesn't make any
* to. On subsequent iterations, we know we stepped right from a page
* that passed these tests, so it's OK.
*/
- if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
+ if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
P_INCOMPLETE_SPLIT(opaque))
{
/* Should never fail to delete a half-dead page */
Assert(!P_ISHALFDEAD(opaque));
- _bt_relbuf(rel, buf);
+ _bt_relbuf(rel, leafbuf);
return ndeleted;
}
* To avoid deadlocks, we'd better drop the leaf page lock
* before going further.
*/
- LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
/*
* Fetch the left sibling, to check that it's not marked with
* incompletely-split page to be split again. So we don't
* need to walk right here.
*/
- if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+ if (lopaque->btpo_next == BufferGetBlockNumber(leafbuf) &&
P_INCOMPLETE_SPLIT(lopaque))
{
- ReleaseBuffer(buf);
+ ReleaseBuffer(leafbuf);
_bt_relbuf(rel, lbuf);
return ndeleted;
}
_bt_relbuf(rel, lbuf);
/*
- * Re-lock the leaf page, and start over, to re-check that the
- * page can still be deleted.
+ * Re-lock the leaf page, and start over to use our stack
+ * within _bt_mark_page_halfdead. We must do it that way
+ * because it's possible that leafbuf can no longer be
+ * deleted. We need to recheck.
*/
- LockBuffer(buf, BT_WRITE);
+ LockBuffer(leafbuf, BT_WRITE);
continue;
}
- if (!_bt_mark_page_halfdead(rel, buf, stack))
+ /*
+ * See if it's safe to delete the leaf page, and determine how
+ * many parent/internal pages above the leaf level will be
+ * deleted. If it's safe then _bt_mark_page_halfdead will also
+ * perform the first phase of deletion, which includes marking the
+ * leafbuf page half-dead.
+ */
+ Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));
+ if (!_bt_mark_page_halfdead(rel, leafbuf, stack))
{
- _bt_relbuf(rel, buf);
+ _bt_relbuf(rel, leafbuf);
return ndeleted;
}
}
/*
* Then unlink it from its siblings. Each call to
* _bt_unlink_halfdead_page unlinks the topmost page from the branch,
- * making it shallower. Iterate until the leaf page is gone.
+ * making it shallower. Iterate until the leafbuf page is deleted.
+ *
+ * _bt_unlink_halfdead_page should never fail, since we established
+ * that deletion is generally safe in _bt_mark_page_halfdead.
*/
rightsib_empty = false;
+ Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));
while (P_ISHALFDEAD(opaque))
{
- /* will check for interrupts, once lock is released */
- if (!_bt_unlink_halfdead_page(rel, buf, &rightsib_empty))
+ /* Check for interrupts in _bt_unlink_halfdead_page */
+ if (!_bt_unlink_halfdead_page(rel, leafbuf, &rightsib_empty,
+ oldestBtpoXact))
{
- /* _bt_unlink_halfdead_page already released buffer */
+ /* _bt_unlink_halfdead_page failed; it already released buffer */
return ndeleted;
}
ndeleted++;
}
+ Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));
+ Assert(TransactionIdFollowsOrEquals(opaque->btpo.xact,
+ *oldestBtpoXact));
+
rightsib = opaque->btpo_next;
- _bt_relbuf(rel, buf);
+ _bt_relbuf(rel, leafbuf);
/*
* Check here, as calling loops will have locks held, preventing
if (!rightsib_empty)
break;
- buf = _bt_getbuf(rel, rightsib, BT_WRITE);
+ leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);
}
return ndeleted;
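/*
 * Editor's sketch, not part of the patch: the *oldestBtpoXact contract from
 * the caller's side.  The function name is hypothetical; _bt_pagedel and the
 * TransactionId macros are the real ones.  Caller initializes the XID to
 * InvalidTransactionId once, and every _bt_pagedel call folds the btpo.xact
 * of each page it deletes into the running minimum, which is later written
 * to the metapage.
 */
static int
example_caller_pattern(Relation rel, Buffer buf)
{
	TransactionId oldestBtpoXact = InvalidTransactionId;
	int			ndeleted;

	/* _bt_pagedel maintains oldestBtpoXact for every page it deletes */
	ndeleted = _bt_pagedel(rel, buf, &oldestBtpoXact);

	/* oldestBtpoXact still invalid here means no page was deleted */
	return ndeleted;
}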
* of the whole branch, including the leaf page itself, iterate until the
* leaf page is deleted.
*
- * Returns 'false' if the page could not be unlinked (shouldn't happen).
- * If the (current) right sibling of the page is empty, *rightsib_empty is
- * set to true.
+ * Returns 'false' if the page could not be unlinked (shouldn't happen). If
+ * the right sibling of the current target page is empty, *rightsib_empty is
+ * set to true, allowing caller to delete the target's right sibling page in
+ * passing. Note that *rightsib_empty is only actually used by caller when
+ * target page is leafbuf, following the last call here for leafbuf (i.e.
+ * the final call for the subtree containing leafbuf). (We always set
+ * *rightsib_empty for caller, just to be consistent.)
+ *
+ * We maintain *oldestBtpoXact for pages that are deleted by the current
+ * VACUUM operation here. This must be handled here because we conservatively
+ * assume that there needs to be a new call to ReadNewTransactionId() each
+ * time a page gets deleted. See comments about the underlying assumption
+ * below.
*
* Must hold pin and lock on leafbuf at entry (read or write doesn't matter).
* On success exit, we'll be holding pin and write lock. On failure exit,
* to avoid having to reacquire a lock we already released).
*/
static bool
-_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
+_bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty,
+ TransactionId *oldestBtpoXact)
{
BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
BlockNumber leafleftsib;
lbuf = InvalidBuffer;
/*
- * Next write-lock the target page itself. It should be okay to take just
- * a write lock not a superexclusive lock, since no scans would stop on an
- * empty page.
+ * Next write-lock the target page itself. It's okay to take a write lock
+ * rather than a superexclusive lock, since no scan will stop on an empty
+ * page.
*/
LockBuffer(buf, BT_WRITE);
page = BufferGetPage(buf);
*/
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));
opaque->btpo_flags &= ~BTP_HALF_DEAD;
opaque->btpo_flags |= BTP_DELETED;
opaque->btpo.xact = ReadNewTransactionId();
_bt_relbuf(rel, lbuf);
_bt_relbuf(rel, rbuf);
+ if (!TransactionIdIsValid(*oldestBtpoXact) ||
+ TransactionIdPrecedes(opaque->btpo.xact, *oldestBtpoXact))
+ *oldestBtpoXact = opaque->btpo.xact;
+
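/*
 * Editor's sketch, not part of the patch: why the target's btpo.xact is
 * stamped with ReadNewTransactionId() above.  A deleted page may only be
 * recycled once that XID falls behind RecentGlobalXmin, i.e. once no scan
 * that could still follow a stale link to the page can be running.  This
 * mirrors the test in _bt_page_recyclable(); the helper name here is
 * hypothetical.
 */
static bool
example_recyclable(BTPageOpaque opaque)
{
	/* deleted, and deleted long enough ago that no old snapshot remains */
	return P_ISDELETED(opaque) &&
		TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin);
}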
/*
* Release the target, if it was not the leaf block. The leaf is always
* kept locked.
static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
- BTCycleId cycleid, TransactionId *oldestBtpoXact);
+ BTCycleId cycleid);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
}
/*
- * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup assuming that
- * btbulkdelete() wasn't called.
+ * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup
+ *
+ * Called by btvacuumcleanup when btbulkdelete was never called because no
+ * tuples need to be deleted.
+ *
+ * When we return false, VACUUM can even skip the cleanup-only call to
+ * btvacuumscan (i.e. there will be no btvacuumscan call for this index at
+ * all). Otherwise, a cleanup-only btvacuumscan call is required.
*/
static bool
_bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
RecentGlobalXmin))
{
/*
- * If oldest btpo.xact in the deleted pages is older than
- * RecentGlobalXmin, then at least one deleted page can be recycled.
+ * If the oldest btpo.xact among previously deleted pages in the index
+ * is older than RecentGlobalXmin, then at least one deleted page can
+ * be recycled -- don't skip cleanup.
+ *
+ * Note that btvacuumpage currently doesn't make any effort to
+ * recognize when a recycled page is already in the FSM (i.e. put
+ * there by a previous VACUUM operation). We have to be conservative
+ * because the FSM isn't crash safe. Hopefully recycled pages get
+ * reused before too long.
*/
result = true;
}
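/*
 * Editor's note, not part of the patch: the condition truncated above is
 * assumed (based on the surrounding code) to read roughly:
 *
 *     else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) &&
 *              TransactionIdPrecedes(metad->btm_oldest_btpo_xact,
 *                                    RecentGlobalXmin))
 *
 * i.e. the metapage's remembered oldest btpo.xact is compared against
 * RecentGlobalXmin to decide whether any deleted page could be recycled.
 */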
/* The ENSURE stuff ensures we clean up shared memory on failure */
PG_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
{
- TransactionId oldestBtpoXact;
-
cycleid = _bt_start_vacuum(rel);
- btvacuumscan(info, stats, callback, callback_state, cycleid,
- &oldestBtpoXact);
-
- /*
- * Update cleanup-related information in metapage. This information is
- * used only for cleanup but keeping them up to date can avoid
- * unnecessary cleanup even after bulkdelete.
- */
- _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
- info->num_heap_tuples);
+ btvacuumscan(info, stats, callback, callback_state, cycleid);
}
PG_END_ENSURE_ERROR_CLEANUP(_bt_end_vacuum_callback, PointerGetDatum(rel));
_bt_end_vacuum(rel);
*/
if (stats == NULL)
{
- TransactionId oldestBtpoXact;
-
/* Check if we need a cleanup */
if (!_bt_vacuum_needs_cleanup(info))
return NULL;
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
- btvacuumscan(info, stats, NULL, NULL, 0, &oldestBtpoXact);
-
- /* Update cleanup-related information in the metapage */
- _bt_update_meta_cleanup_info(info->index, oldestBtpoXact,
- info->num_heap_tuples);
+ btvacuumscan(info, stats, NULL, NULL, 0);
}
/*
* according to the vacuum callback, looking for empty pages that can be
* deleted, and looking for old deleted pages that can be recycled. Both
* btbulkdelete and btvacuumcleanup invoke this (the latter only if no
- * btbulkdelete call occurred).
+ * btbulkdelete call occurred and _bt_vacuum_needs_cleanup returned true).
+ * Note that this is also where the metadata used by _bt_vacuum_needs_cleanup
+ * is maintained.
*
* The caller is responsible for initially allocating/zeroing a stats struct
* and for obtaining a vacuum cycle ID if necessary.
static void
btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state,
- BTCycleId cycleid, TransactionId *oldestBtpoXact)
+ BTCycleId cycleid)
{
Relation rel = info->index;
BTVacState vstate;
MemoryContextDelete(vstate.pagedelcontext);
+ /*
+ * Maintain the oldest btpo.xact and a count of the current number of
+ * heap tuples in the metapage (for the benefit of
+ * _bt_vacuum_needs_cleanup). The page with the oldest btpo.xact is
+ * typically a page deleted by a previous VACUUM operation.
+ */
+ _bt_update_meta_cleanup_info(rel, vstate.oldestBtpoXact,
+ info->num_heap_tuples);
+
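/*
 * Editor's sketch, not part of the patch: what _bt_update_meta_cleanup_info
 * persists.  Both values land in the metapage, where a later VACUUM's
 * _bt_vacuum_needs_cleanup call reads them back.  The field names are the
 * real BTMetaPageData fields; the helper itself is hypothetical.
 */
static void
example_log_cleanup_info(BTMetaPageData *metad)
{
	elog(DEBUG2, "oldest btpo.xact: %u, heap tuples at last cleanup: %f",
		 metad->btm_oldest_btpo_xact,
		 metad->btm_last_cleanup_num_heap_tuples);
}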
/*
* If we found any recyclable pages (and recorded them in the FSM), then
* forcibly update the upper-level FSM pages to ensure that searchers can
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
-
- if (oldestBtpoXact)
- *oldestBtpoXact = vstate.oldestBtpoXact;
}
/*
/* Page is valid, see what to do with it */
if (_bt_page_recyclable(page))
{
- /* Okay to recycle this page */
+ /* Okay to recycle this page (which could be leaf or internal) */
RecordFreeIndexPage(rel, blkno);
vstate->totFreePages++;
stats->pages_deleted++;
}
else if (P_ISDELETED(opaque))
{
- /* Already deleted, but can't recycle yet */
+ /*
+ * Already deleted page (which could be leaf or internal). Can't
+ * recycle yet.
+ */
stats->pages_deleted++;
- /* Update the oldest btpo.xact */
+ /* Maintain the oldest btpo.xact */
if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
vstate->oldestBtpoXact = opaque->btpo.xact;
}
else if (P_ISHALFDEAD(opaque))
{
- /* Half-dead, try to delete */
+ /*
+ * Half-dead leaf page. Try to delete now. Might update
+ * oldestBtpoXact and pages_deleted below.
+ */
delete_now = true;
}
else if (P_ISLEAF(opaque))
else
{
/*
- * If the page has been split during this vacuum cycle, it seems
- * worth expending a write to clear btpo_cycleid even if we don't
- * have any deletions to do. (If we do, _bt_delitems_vacuum takes
- * care of this.) This ensures we won't process the page again.
+ * If the leaf page has been split during this vacuum cycle, it
+ * seems worth expending a write to clear btpo_cycleid even if we
+ * don't have any deletions to do. (If we do, _bt_delitems_vacuum
+ * takes care of this.) This ensures we won't process the page
+ * again.
*
* We treat this like a hint-bit update because there's no need to
* WAL-log it.
}
/*
- * If it's now empty, try to delete; else count the live tuples (live
- * table TIDs in posting lists are counted as separate live tuples).
- * We don't delete when recursing, though, to avoid putting entries
- * into freePages out-of-order (doesn't seem worth any extra code to
- * handle the case).
+ * If the leaf page is now empty, try to delete it; else count the
+ * live tuples (live table TIDs in posting lists are counted as
+ * separate live tuples). We don't delete when recursing, though, to
+ * avoid putting entries into freePages out-of-order (doesn't seem
+ * worth any extra code to handle the case).
*/
if (minoff > maxoff)
delete_now = (blkno == orig_blkno);
MemoryContextReset(vstate->pagedelcontext);
oldcontext = MemoryContextSwitchTo(vstate->pagedelcontext);
- ndel = _bt_pagedel(rel, buf);
+ ndel = _bt_pagedel(rel, buf, &vstate->oldestBtpoXact);
/* count only this page, else may double-count parent */
if (ndel)
- {
stats->pages_deleted++;
- if (!TransactionIdIsValid(vstate->oldestBtpoXact) ||
- TransactionIdPrecedes(opaque->btpo.xact, vstate->oldestBtpoXact))
- vstate->oldestBtpoXact = opaque->btpo.xact;
- }
MemoryContextSwitchTo(oldcontext);
/* pagedel released buffer, so we shouldn't */