Further optimize GIN multi-key searches.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 29 Jan 2014 19:22:08 +0000 (21:22 +0200)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Wed, 29 Jan 2014 19:24:38 +0000 (21:24 +0200)
When skipping over some items in a posting tree, re-find the new location
by descending the tree from root, rather than walking the right links.
This can save a lot of I/O.

Heavily modified from Alexander Korotkov's fast scan patch.

src/backend/access/gin/gindatapage.c
src/backend/access/gin/ginget.c
src/include/access/gin_private.h

index 9a0b8ab1f214258ece93695f96ec04a441cc3018..c6230f3bc5a433a408a7290ba72900a9f5aa354b 100644 (file)
@@ -1639,16 +1639,15 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
  * Starts a new scan on a posting tree.
  */
 GinBtreeStack *
-ginScanBeginPostingTree(Relation index, BlockNumber rootBlkno)
+ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
 {
-   GinBtreeData btree;
    GinBtreeStack *stack;
 
-   ginPrepareDataScan(&btree, index, rootBlkno);
+   ginPrepareDataScan(btree, index, rootBlkno);
 
-   btree.fullScan = TRUE;
+   btree->fullScan = TRUE;
 
-   stack = ginFindLeafPage(&btree, TRUE);
+   stack = ginFindLeafPage(btree, TRUE);
 
    return stack;
 }
index 49e47c6859c57e44932aa87e1489f0e8e715f9cf..a45d72212e9436b27e418dec81a9a0380d1bf637 100644 (file)
@@ -99,12 +99,13 @@ static void
 scanPostingTree(Relation index, GinScanEntry scanEntry,
                BlockNumber rootPostingTree)
 {
+   GinBtreeData btree;
    GinBtreeStack *stack;
    Buffer      buffer;
    Page        page;
 
    /* Descend to the leftmost leaf page */
-   stack = ginScanBeginPostingTree(index, rootPostingTree);
+   stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
    buffer = stack->buffer;
    IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
 
@@ -412,7 +413,8 @@ restartScanEntry:
            LockBuffer(stackEntry->buffer, GIN_UNLOCK);
            needUnlock = FALSE;
 
-           stack = ginScanBeginPostingTree(ginstate->index, rootPostingTree);
+           stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
+                                           rootPostingTree);
            entry->buffer = stack->buffer;
 
            /*
@@ -506,8 +508,60 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
 {
    Page        page;
    int         i;
+   bool        stepright;
+
+   if (!BufferIsValid(entry->buffer))
+   {
+       entry->isFinished = true;
+       return;
+   }
+
+   /*
+    * We have two strategies for finding the correct page: step right from
+    * the current page, or descend the tree again from the root. If
+    * advancePast equals the current item, the next matching item should be
+    * on the next page, so we step right. Otherwise, descend from root.
+    */
+   if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
+   {
+       stepright = true;
+       LockBuffer(entry->buffer, GIN_SHARE);
+   }
+   else
+   {
+       GinBtreeStack *stack;
+
+       ReleaseBuffer(entry->buffer);
+
+       /*
+        * Set the search key, and find the correct leaf page.
+        */
+       if (ItemPointerIsLossyPage(&advancePast))
+       {
+           ItemPointerSet(&entry->btree.itemptr,
+                          GinItemPointerGetBlockNumber(&advancePast) + 1,
+                          FirstOffsetNumber);
+       }
+       else
+       {
+           entry->btree.itemptr = advancePast;
+           entry->btree.itemptr.ip_posid++;
+       }
+       entry->btree.fullScan = false;
+       stack = ginFindLeafPage(&entry->btree, true);
+
+       /* we don't need the stack, just the buffer. */
+       entry->buffer = stack->buffer;
+       IncrBufferRefCount(entry->buffer);
+       freeGinBtreeStack(stack);
+       stepright = false;
+   }
+
+   elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
+        GinItemPointerGetBlockNumber(&advancePast),
+        GinItemPointerGetOffsetNumber(&advancePast),
+        !stepright);
 
-   LockBuffer(entry->buffer, GIN_SHARE);
    page = BufferGetPage(entry->buffer);
    for (;;)
    {
@@ -519,30 +573,34 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
            entry->nlist = 0;
        }
 
-       /*
-        * We've processed all the entries on this page. If it was the last
-        * page in the tree, we're done.
-        */
-       if (GinPageRightMost(page))
+       if (stepright)
        {
-           UnlockReleaseBuffer(entry->buffer);
-           entry->buffer = InvalidBuffer;
-           entry->isFinished = TRUE;
-           return;
+           /*
+            * We've processed all the entries on this page. If it was the last
+            * page in the tree, we're done.
+            */
+           if (GinPageRightMost(page))
+           {
+               UnlockReleaseBuffer(entry->buffer);
+               entry->buffer = InvalidBuffer;
+               entry->isFinished = TRUE;
+               return;
+           }
+
+           /*
+            * Step to next page, following the right link. then find the first
+            * ItemPointer greater than advancePast.
+            */
+           entry->buffer = ginStepRight(entry->buffer,
+                                        ginstate->index,
+                                        GIN_SHARE);
+           page = BufferGetPage(entry->buffer);
        }
+       stepright = true;
 
        if (GinPageGetOpaque(page)->flags & GIN_DELETED)
            continue;       /* page was deleted by concurrent vacuum */
 
-       /*
-        * Step to next page, following the right link. then find the first
-        * ItemPointer greater than advancePast.
-        */
-       entry->buffer = ginStepRight(entry->buffer,
-                                    ginstate->index,
-                                    GIN_SHARE);
-       page = BufferGetPage(entry->buffer);
-
        /*
         * The first item > advancePast might not be on this page, but
         * somewhere to the right, if the page was split, or a non-match from
@@ -566,8 +624,16 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advan
        {
            if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
            {
-               LockBuffer(entry->buffer, GIN_UNLOCK);
                entry->offset = i;
+
+               if (GinPageRightMost(page))
+               {
+                   /* after processing the copied items, we're done. */
+                   UnlockReleaseBuffer(entry->buffer);
+                   entry->buffer = InvalidBuffer;
+               }
+               else
+                   LockBuffer(entry->buffer, GIN_UNLOCK);
                return;
            }
        }
@@ -677,7 +743,10 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
    }
    else if (!BufferIsValid(entry->buffer))
    {
-       /* A posting list from an entry tuple  */
+       /*
+        * A posting list from an entry tuple, or the last page of a posting
+        * tree.
+        */
        do
        {
            if (entry->offset >= entry->nlist)
index ea9ae31acc0ea57b42740585f78de2fd893eda42..bb0ab317cbcf6859eccdb36807f4a55a818b5d64 100644 (file)
@@ -702,7 +702,7 @@ extern void GinPageDeletePostingItem(Page page, OffsetNumber offset);
 extern void ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
                      ItemPointerData *items, uint32 nitem,
                      GinStatsData *buildStats);
-extern GinBtreeStack *ginScanBeginPostingTree(Relation index, BlockNumber rootBlkno);
+extern GinBtreeStack *ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno);
 extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
 extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
 
@@ -802,6 +802,7 @@ typedef struct GinScanEntryData
    bool        isFinished;
    bool        reduceResult;
    uint32      predictNumberResult;
+   GinBtreeData btree;
 }  GinScanEntryData;
 
 typedef struct GinScanOpaqueData