Fix SPGiST vacuum algorithm to handle concurrent tuple motion properly.
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 12 Mar 2012 20:10:05 +0000 (16:10 -0400)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 12 Mar 2012 20:10:28 +0000 (16:10 -0400)
A leaf tuple that we need to delete could get moved as a consequence of an
insertion happening concurrently with the VACUUM scan.  If it moves from a
page past the current scan point to a page before, we'll miss it, which is
not acceptable.  Hence, when we see a leaf-page REDIRECT that could have
been made since our scan started, chase down the redirection pointer much
as if we were doing a normal index search, and be sure to vacuum every page
it leads to.  This fixes the issue because, if the tuple was on page N at
the instant we start our scan, we will surely find it as a consequence of
chasing the redirect from page N, no matter how much it moves around in
between.  Problem noted by Takashi Yamamoto.

src/backend/access/spgist/README
src/backend/access/spgist/spgvacuum.c

index d20ad17a4b66980d0cd59afff8c1523e6ddb1366..1b86e275914d4216262b099b06dfdf1bfc21ab92 100644 (file)
@@ -314,6 +314,27 @@ the reverse map of the nextOffset links (ie, when we see tuple x links to
 tuple y, we set predecessor[y] = x).  Then head tuples are the ones with
 no predecessor.
 
+Because insertions can occur while VACUUM runs, a pure sequential scan
+could miss deleting some target leaf tuples, because they could get moved
+from a not-yet-visited leaf page to an already-visited leaf page as a
+consequence of a PickSplit or MoveLeafs operation.  Failing to delete any
+target TID is not acceptable, so we have to extend the algorithm to cope
+with such cases.  We recognize that such a move might have occurred when
+we see a leaf-page REDIRECT tuple whose XID indicates it might have been
+created after the VACUUM scan started.  We add the redirection target TID
+to a "pending list" of places we need to recheck.  Between pages of the
+main sequential scan, we empty the pending list by visiting each listed
+TID.  If it points to an inner tuple (from a PickSplit), add each downlink
+TID to the pending list.  If it points to a leaf page, vacuum that page.
+(We could just vacuum the single pointed-to chain, but vacuuming the
+whole page simplifies the code and reduces the odds of VACUUM having to
+modify the same page multiple times.)  To ensure that pending-list
+processing can never get into an endless loop, even in the face of
+concurrent index changes, we don't remove list entries immediately but
+only after we've completed all pending-list processing; instead we just
+mark items as done after processing them.  Adding a TID that's already in
+the list is a no-op, whether or not that item is marked done yet.
+
 spgbulkdelete also updates the index's free space map.
 
 Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was
index a09da84a2aac18b9846919f6c81c6ba0d47f3dd2..856790ee2aa41cac1728021a079338a10b21e429 100644 (file)
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "storage/procarray.h"
+#include "utils/snapmgr.h"
 
 
-/* local state for vacuum operations */
+/* Entry in pending-list of TIDs we need to revisit */
+typedef struct spgVacPendingItem
+{
+       ItemPointerData tid;                            /* redirection target to visit */
+       bool            done;                                   /* have we dealt with this? */
+       struct spgVacPendingItem *next;         /* list link */
+} spgVacPendingItem;
+
+/* Local state for vacuum operations */
 typedef struct spgBulkDeleteState
 {
        /* Parameters passed in to spgvacuumscan */
@@ -35,22 +44,87 @@ typedef struct spgBulkDeleteState
        IndexBulkDeleteResult *stats;
        IndexBulkDeleteCallback callback;
        void       *callback_state;
+
        /* Additional working state */
-       SpGistState spgstate;
-       TransactionId OldestXmin;
-       BlockNumber lastFilledBlock;
+       SpGistState spgstate;                   /* for SPGiST operations that need one */
+       spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */
+       TransactionId myXmin;                   /* for detecting newly-added redirects */
+       TransactionId OldestXmin;               /* for deciding a redirect is obsolete */
+       BlockNumber lastFilledBlock;    /* last non-deletable block */
 } spgBulkDeleteState;
 
 
+/*
+ * Add TID to pendingList, but only if not already present.
+ *
+ * Note that new items are always appended at the end of the list; this
+ * ensures that scans of the list don't miss items added during the scan.
+ */
+static void
+spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
+{
+       spgVacPendingItem *pitem;
+       spgVacPendingItem **listLink;
+
+       /* search the list for pre-existing entry */
+       listLink = &bds->pendingList;
+       while (*listLink != NULL)
+       {
+               pitem = *listLink;
+               if (ItemPointerEquals(tid, &pitem->tid))
+                       return;                         /* already in list, do nothing */
+               listLink = &pitem->next;
+       }
+       /* not there, so append new entry */
+       pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem));
+       pitem->tid = *tid;
+       pitem->done = false;
+       pitem->next = NULL;
+       *listLink = pitem;
+}
+
+/*
+ * Clear pendingList
+ */
+static void
+spgClearPendingList(spgBulkDeleteState *bds)
+{
+       spgVacPendingItem *pitem;
+       spgVacPendingItem *nitem;
+
+       for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
+       {
+               nitem = pitem->next;
+               /* All items in list should have been dealt with */
+               Assert(pitem->done);
+               pfree(pitem);
+       }
+       bds->pendingList = NULL;
+}
+
 /*
  * Vacuum a regular (non-root) leaf page
  *
  * We must delete tuples that are targeted for deletion by the VACUUM,
  * but not move any tuples that are referenced by outside links; we assume
  * those are the ones that are heads of chains.
+ *
+ * If we find a REDIRECT that was made by a concurrently-running transaction,
+ * we must add its target TID to pendingList.  (We don't try to visit the
+ * target immediately, first because we don't want VACUUM locking more than
+ * one buffer at a time, and second because the duplicate-filtering logic
+ * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
+ * loop in the face of continuous concurrent insertions.)
+ *
+ * If forPending is true, we are examining the page as a consequence of
+ * chasing a redirect link, not as part of the normal sequential scan.
+ * We still vacuum the page normally, but we don't increment the stats
+ * about live tuples; else we'd double-count those tuples, since the page
+ * has been or will be visited in the sequential scan as well.
  */
 static void
-vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
+vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
+                          bool forPending)
 {
        Page            page = BufferGetPage(buffer);
        spgxlogVacuumLeaf xlrec;
@@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
                        }
                        else
                        {
-                               bds->stats->num_index_tuples += 1;
+                               if (!forPending)
+                                       bds->stats->num_index_tuples += 1;
                        }
 
                        /* Form predecessor map, too */
@@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer)
                                predecessor[lt->nextOffset] = i;
                        }
                }
+               else if (lt->tupstate == SPGIST_REDIRECT)
+               {
+                       SpGistDeadTuple dt = (SpGistDeadTuple) lt;
+
+                       Assert(dt->nextOffset == InvalidOffsetNumber);
+                       Assert(ItemPointerIsValid(&dt->pointer));
+
+                       /*
+                        * Add target TID to pending list if the redirection could have
+                        * happened since VACUUM started.
+                        *
+                        * Note: we could make a tighter test by seeing if the xid is
+                        * "running" according to the active snapshot; but tqual.c doesn't
+                        * currently export a suitable API, and it's not entirely clear
+                        * that a tighter test is worth the cycles anyway.
+                        */
+                       if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
+                               spgAddPendingTID(bds, &dt->pointer);
+               }
                else
                {
                        Assert(lt->nextOffset == InvalidOffsetNumber);
@@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
                }
                else
                {
-                       vacuumLeafPage(bds, index, buffer);
+                       vacuumLeafPage(bds, index, buffer, false);
                        vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
                }
        }
@@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
        }
 
        /*
-        * The root page must never be deleted, nor marked as available in FSM,
-        * because we don't want it ever returned by a search for a place to
+        * The root pages must never be deleted, nor marked as available in FSM,
+        * because we don't want them ever returned by a search for a place to
         * put a new tuple.  Otherwise, check for empty/deletable page, and
         * make sure FSM knows about it.
         */
@@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno)
        UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Process the pending-TID list between pages of the main scan
+ */
+static void
+spgprocesspending(spgBulkDeleteState *bds)
+{
+       Relation        index = bds->info->index;
+       spgVacPendingItem *pitem;
+       spgVacPendingItem *nitem;
+       BlockNumber     blkno;
+       Buffer          buffer;
+       Page            page;
+
+       for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next)
+       {
+               if (pitem->done)
+                       continue;                       /* ignore already-done items */
+
+               /* call vacuum_delay_point while not holding any buffer lock */
+               vacuum_delay_point();
+
+               /* examine the referenced page */
+               blkno = ItemPointerGetBlockNumber(&pitem->tid);
+               buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+                                                                       RBM_NORMAL, bds->info->strategy);
+               LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+               page = (Page) BufferGetPage(buffer);
+
+               if (PageIsNew(page) || SpGistPageIsDeleted(page))
+               {
+                       /* Probably shouldn't happen, but ignore it */
+               }
+               else if (SpGistPageIsLeaf(page))
+               {
+                       if (SpGistBlockIsRoot(blkno))
+                       {
+                               /* this should definitely not happen */
+                               elog(ERROR, "redirection leads to root page of index \"%s\"",
+                                        RelationGetRelationName(index));
+                       }
+
+                       /* deal with any deletable tuples */
+                       vacuumLeafPage(bds, index, buffer, true);
+                       /* might as well do this while we are here */
+                       vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin);
+
+                       SpGistSetLastUsedPage(index, buffer);
+
+                       /*
+                        * We can mark as done not only this item, but any later ones
+                        * pointing at the same page, since we vacuumed the whole page.
+                        */
+                       pitem->done = true;
+                       for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
+                       {
+                               if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
+                                       nitem->done = true;
+                       }
+               }
+               else
+               {
+                       /*
+                        * On an inner page, visit the referenced inner tuple and add
+                        * all its downlinks to the pending list.  We might have pending
+                        * items for more than one inner tuple on the same page (in fact
+                        * this is pretty likely given the way space allocation works),
+                        * so get them all while we are here.
+                        */
+                       for (nitem = pitem; nitem != NULL; nitem = nitem->next)
+                       {
+                               if (nitem->done)
+                                       continue;
+                               if (ItemPointerGetBlockNumber(&nitem->tid) == blkno)
+                               {
+                                       OffsetNumber offset;
+                                       SpGistInnerTuple innerTuple;
+
+                                       offset = ItemPointerGetOffsetNumber(&nitem->tid);
+                                       innerTuple = (SpGistInnerTuple) PageGetItem(page,
+                                                                                               PageGetItemId(page, offset));
+                                       if (innerTuple->tupstate == SPGIST_LIVE)
+                                       {
+                                               SpGistNodeTuple node;
+                                               int                     i;
+
+                                               SGITITERATE(innerTuple, i, node)
+                                               {
+                                                       if (ItemPointerIsValid(&node->t_tid))
+                                                               spgAddPendingTID(bds, &node->t_tid);
+                                               }
+                                       }
+                                       else if (innerTuple->tupstate == SPGIST_REDIRECT)
+                                       {
+                                               /* transfer attention to redirect point */
+                                               spgAddPendingTID(bds,
+                                                                                &((SpGistDeadTuple) innerTuple)->pointer);
+                                       }
+                                       else
+                                               elog(ERROR, "unexpected SPGiST tuple state: %d",
+                                                        innerTuple->tupstate);
+
+                                       nitem->done = true;
+                               }
+                       }
+               }
+
+               UnlockReleaseBuffer(buffer);
+       }
+
+       spgClearPendingList(bds);
+}
+
 /*
  * Perform a bulkdelete scan
  */
@@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds)
 
        /* Finish setting up spgBulkDeleteState */
        initSpGistState(&bds->spgstate, index);
+       bds->pendingList = NULL;
+       bds->myXmin = GetActiveSnapshot()->xmin;
        bds->OldestXmin = GetOldestXmin(true, false);
        bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
 
@@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds)
                for (; blkno < num_pages; blkno++)
                {
                        spgvacuumpage(bds, blkno);
+                       /* empty the pending-list after each page */
+                       if (bds->pendingList != NULL)
+                               spgprocesspending(bds);
                }
        }
 
@@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS)
        IndexFreeSpaceMapVacuum(index);
 
        /*
-        * It's quite possible for us to be fooled by concurrent page splits into
+        * It's quite possible for us to be fooled by concurrent tuple moves into
         * double-counting some index tuples, so disbelieve any total that exceeds
         * the underlying heap's count ... if we know that accurately.  Otherwise
         * this might just make matters worse.