Reduce non-leaf keys overlap in GiST indexes produced by a sorted build
authorAlexander Korotkov <akorotkov@postgresql.org>
Mon, 7 Feb 2022 20:20:42 +0000 (23:20 +0300)
committerAlexander Korotkov <akorotkov@postgresql.org>
Mon, 7 Feb 2022 20:20:42 +0000 (23:20 +0300)
The GiST sorted build currently chooses split points according to the only page
space utilization.  That may lead to higher non-leaf keys overlap and, in turn,
slower search query answers.

This commit makes the sorted build use the opclass's picksplit method.  Once
four pages at the level are accumulated, the picksplit method is applied until
each split partition fits the page.  Some of our split algorithms could show
significant performance degradation while processing 4-times more data at once.
But those opclasses haven't received the sorted build support and shouldn't
receive it before their split algorithms are improved.

Discussion: https://postgr.es/m/CAHqSB9jqtS94e9%3D0vxqQX5dxQA89N95UKyz-%3DA7Y%2B_YJt%2BVW5A%40mail.gmail.com
Author: Aliaksandr Kalenik, Sergei Shoulbakov, Andrey Borodin
Reviewed-by: Björn Harrtell, Darafei Praliaskouski, Andres Freund
Reviewed-by: Alexander Korotkov
contrib/pageinspect/expected/gist.out
src/backend/access/gist/README
src/backend/access/gist/gistbuild.c
src/tools/pgindent/typedefs.list

index 86c9e9caa9dcecba9ce6328c63195b4aaa416d85..b26b56ba15df187257dbc5649c324351620d5067 100644 (file)
@@ -33,14 +33,13 @@ COMMIT;
 SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 0), 'test_gist_idx');
  itemoffset |   ctid    | itemlen | dead |       keys        
 ------------+-----------+---------+------+-------------------
-          1 | (1,65535) |      40 | f    | (p)=((166,166))
-          2 | (2,65535) |      40 | f    | (p)=((332,332))
-          3 | (3,65535) |      40 | f    | (p)=((498,498))
-          4 | (4,65535) |      40 | f    | (p)=((664,664))
-          5 | (5,65535) |      40 | f    | (p)=((830,830))
-          6 | (6,65535) |      40 | f    | (p)=((996,996))
-          7 | (7,65535) |      40 | f    | (p)=((1000,1000))
-(7 rows)
+          1 | (1,65535) |      40 | f    | (p)=((185,185))
+          2 | (2,65535) |      40 | f    | (p)=((370,370))
+          3 | (3,65535) |      40 | f    | (p)=((555,555))
+          4 | (4,65535) |      40 | f    | (p)=((740,740))
+          5 | (5,65535) |      40 | f    | (p)=((870,870))
+          6 | (6,65535) |      40 | f    | (p)=((1000,1000))
+(6 rows)
 
 SELECT * FROM gist_page_items(get_raw_page('test_gist_idx', 1), 'test_gist_idx') LIMIT 5;
  itemoffset | ctid  | itemlen | dead |    keys     
@@ -63,7 +62,6 @@ SELECT itemoffset, ctid, itemlen FROM gist_page_items_bytea(get_raw_page('test_g
           4 | (4,65535) |      40
           5 | (5,65535) |      40
           6 | (6,65535) |      40
-          7 | (7,65535) |      40
-(7 rows)
+(6 rows)
 
 DROP TABLE test_gist;
index 25cab0047b657630a9f7983b6e31e04100f98502..efb2730e181488b041cc786181a8e43bbe5f039f 100644 (file)
@@ -26,6 +26,7 @@ The current implementation of GiST supports:
   * Concurrency
   * Recovery support via WAL logging
   * Buffering build algorithm
+  * Sorted build method
 
 The support for concurrency implemented in PostgreSQL was developed based on
 the paper "Access Methods for Next-Generation Database Systems" by
@@ -414,6 +415,21 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Sorted build method
+-------------------
+
+Sort all input tuples, pack them into GiST leaf pages in the sorted order,
+and create downlinks and internal pages as we go. This method builds the index
+from the bottom up, similar to how the B-tree index is built.
+
+The sorted method is used if the operator classes for all columns have a
+"sortsupport" defined. Otherwise, we fall back on inserting tuples one by one
+with optional buffering.
+
+Sorting GiST build requires good linearization of the sort opclass. That is not
+always the case in multidimensional data. To tackle the anomalies, we buffer
+index tuples and apply a picksplit function that can be multidimensional-aware.
+
 Bulk delete algorithm (VACUUM)
 ------------------------------
 
index 9854116fca5c58786612bd4eceb0bd3d1865e38f..4db896a533daa8e0004d67a0b29be3ccf9da91cd 100644 (file)
@@ -113,16 +113,23 @@ typedef struct
        Page            ready_pages[XLR_MAX_BLOCK_ID];
 } GISTBuildState;
 
+#define GIST_SORTED_BUILD_PAGE_NUM 4
+
 /*
  * In sorted build, we use a stack of these structs, one for each level,
- * to hold an in-memory buffer of the rightmost page at the level. When the
- * page fills up, it is written out and a new page is allocated.
+ * to hold an in-memory buffer of last pages at the level.
+ *
+ * Sorting GiST build requires good linearization of the sort opclass. This is
+ * not always the case in multidimensional data. To tackle the anomalies, we
+ * buffer index tuples and apply picksplit that can be multidimension-aware.
  */
-typedef struct GistSortedBuildPageState
+typedef struct GistSortedBuildLevelState
 {
-       Page            page;
-       struct GistSortedBuildPageState *parent;        /* Upper level, if any */
-} GistSortedBuildPageState;
+       int                     current_page;
+       BlockNumber last_blkno;
+       struct GistSortedBuildLevelState *parent;       /* Upper level, if any */
+       Page            pages[GIST_SORTED_BUILD_PAGE_NUM];
+} GistSortedBuildLevelState;
 
 /* prototypes for private functions */
 
@@ -130,11 +137,11 @@ static void gistSortedBuildCallback(Relation index, ItemPointer tid,
                                                                        Datum *values, bool *isnull,
                                                                        bool tupleIsAlive, void *state);
 static void gist_indexsortbuild(GISTBuildState *state);
-static void gist_indexsortbuild_pagestate_add(GISTBuildState *state,
-                                                                                         GistSortedBuildPageState *pagestate,
-                                                                                         IndexTuple itup);
-static void gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
-                                                                                               GistSortedBuildPageState *pagestate);
+static void gist_indexsortbuild_levelstate_add(GISTBuildState *state,
+                                                                                          GistSortedBuildLevelState *levelstate,
+                                                                                          IndexTuple itup);
+static void gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
+                                                                                                GistSortedBuildLevelState *levelstate);
 static void gist_indexsortbuild_flush_ready_pages(GISTBuildState *state);
 
 static void gistInitBuffering(GISTBuildState *buildstate);
@@ -396,8 +403,7 @@ static void
 gist_indexsortbuild(GISTBuildState *state)
 {
        IndexTuple      itup;
-       GistSortedBuildPageState *leafstate;
-       GistSortedBuildPageState *pagestate;
+       GistSortedBuildLevelState *levelstate;
        Page            page;
 
        state->pages_allocated = 0;
@@ -414,10 +420,10 @@ gist_indexsortbuild(GISTBuildState *state)
        state->pages_allocated++;
        state->pages_written++;
 
-       /* Allocate a temporary buffer for the first leaf page. */
-       leafstate = palloc(sizeof(GistSortedBuildPageState));
-       leafstate->page = page;
-       leafstate->parent = NULL;
+       /* Allocate a temporary buffer for the first leaf page batch. */
+       levelstate = palloc0(sizeof(GistSortedBuildLevelState));
+       levelstate->pages[0] = page;
+       levelstate->parent = NULL;
        gistinitpage(page, F_LEAF);
 
        /*
@@ -425,133 +431,200 @@ gist_indexsortbuild(GISTBuildState *state)
         */
        while ((itup = tuplesort_getindextuple(state->sortstate, true)) != NULL)
        {
-               gist_indexsortbuild_pagestate_add(state, leafstate, itup);
+               gist_indexsortbuild_levelstate_add(state, levelstate, itup);
                MemoryContextReset(state->giststate->tempCxt);
        }
 
        /*
         * Write out the partially full non-root pages.
         *
-        * Keep in mind that flush can build a new root.
+        * Keep in mind that flush can build a new root. If number of pages is > 1
+        * then new root is required.
         */
-       pagestate = leafstate;
-       while (pagestate->parent != NULL)
+       while (levelstate->parent != NULL || levelstate->current_page != 0)
        {
-               GistSortedBuildPageState *parent;
-
-               gist_indexsortbuild_pagestate_flush(state, pagestate);
-               parent = pagestate->parent;
-               pfree(pagestate->page);
-               pfree(pagestate);
-               pagestate = parent;
+               GistSortedBuildLevelState *parent;
+
+               gist_indexsortbuild_levelstate_flush(state, levelstate);
+               parent = levelstate->parent;
+               for (int i = 0; i < GIST_SORTED_BUILD_PAGE_NUM; i++)
+                       if (levelstate->pages[i])
+                               pfree(levelstate->pages[i]);
+               pfree(levelstate);
+               levelstate = parent;
        }
 
        gist_indexsortbuild_flush_ready_pages(state);
 
        /* Write out the root */
-       PageSetLSN(pagestate->page, GistBuildLSN);
-       PageSetChecksumInplace(pagestate->page, GIST_ROOT_BLKNO);
+       PageSetLSN(levelstate->pages[0], GistBuildLSN);
+       PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
        smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
-                         pagestate->page, true);
+                         levelstate->pages[0], true);
        if (RelationNeedsWAL(state->indexrel))
                log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
-                                       pagestate->page, true);
+                                       levelstate->pages[0], true);
 
-       pfree(pagestate->page);
-       pfree(pagestate);
+       pfree(levelstate->pages[0]);
+       pfree(levelstate);
 }
 
 /*
- * Add tuple to a page. If the pages is full, write it out and re-initialize
+ * Add tuple to a page. If the pages are full, write them out and re-initialize
  * a new page first.
  */
 static void
-gist_indexsortbuild_pagestate_add(GISTBuildState *state,
-                                                                 GistSortedBuildPageState *pagestate,
-                                                                 IndexTuple itup)
+gist_indexsortbuild_levelstate_add(GISTBuildState *state,
+                                                                  GistSortedBuildLevelState *levelstate,
+                                                                  IndexTuple itup)
 {
        Size            sizeNeeded;
 
-       /* Does the tuple fit? If not, flush */
-       sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData) + state->freespace;
-       if (PageGetFreeSpace(pagestate->page) < sizeNeeded)
-               gist_indexsortbuild_pagestate_flush(state, pagestate);
+       /* Check if tuple can be added to the current page */
+       sizeNeeded = IndexTupleSize(itup) + sizeof(ItemIdData); /* fillfactor ignored */
+       if (PageGetFreeSpace(levelstate->pages[levelstate->current_page]) < sizeNeeded)
+       {
+               Page            newPage;
+               Page            old_page = levelstate->pages[levelstate->current_page];
+               uint16          old_page_flags = GistPageGetOpaque(old_page)->flags;
 
-       gistfillbuffer(pagestate->page, &itup, 1, InvalidOffsetNumber);
+               if (levelstate->current_page + 1 == GIST_SORTED_BUILD_PAGE_NUM)
+               {
+                       gist_indexsortbuild_levelstate_flush(state, levelstate);
+               }
+               else
+                       levelstate->current_page++;
+
+               if (levelstate->pages[levelstate->current_page] == NULL)
+                       levelstate->pages[levelstate->current_page] = palloc(BLCKSZ);
+
+               newPage = levelstate->pages[levelstate->current_page];
+               gistinitpage(newPage, old_page_flags);
+       }
+
+       gistfillbuffer(levelstate->pages[levelstate->current_page], &itup, 1, InvalidOffsetNumber);
 }
 
 static void
-gist_indexsortbuild_pagestate_flush(GISTBuildState *state,
-                                                                       GistSortedBuildPageState *pagestate)
+gist_indexsortbuild_levelstate_flush(GISTBuildState *state,
+                                                                        GistSortedBuildLevelState *levelstate)
 {
-       GistSortedBuildPageState *parent;
-       IndexTuple *itvec;
-       IndexTuple      union_tuple;
-       int                     vect_len;
-       bool            isleaf;
+       GistSortedBuildLevelState *parent;
        BlockNumber blkno;
        MemoryContext oldCtx;
+       IndexTuple      union_tuple;
+       SplitedPageLayout *dist;
+       IndexTuple *itvec;
+       int                     vect_len;
+       bool            isleaf = GistPageIsLeaf(levelstate->pages[0]);
 
-       /* check once per page */
        CHECK_FOR_INTERRUPTS();
 
-       if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
-               gist_indexsortbuild_flush_ready_pages(state);
+       oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
 
-       /*
-        * The page is now complete. Assign a block number to it, and add it to
-        * the list of finished pages. (We don't write it out immediately, because
-        * we want to WAL-log the pages in batches.)
-        */
-       blkno = state->pages_allocated++;
-       state->ready_blknos[state->ready_num_pages] = blkno;
-       state->ready_pages[state->ready_num_pages] = pagestate->page;
-       state->ready_num_pages++;
+       /* Get index tuples from first page */
+       itvec = gistextractpage(levelstate->pages[0], &vect_len);
+       if (levelstate->current_page > 0)
+       {
+               /* Append tuples from each page */
+               for (int i = 1; i < levelstate->current_page + 1; i++)
+               {
+                       int                     len_local;
+                       IndexTuple *itvec_local = gistextractpage(levelstate->pages[i], &len_local);
 
-       isleaf = GistPageIsLeaf(pagestate->page);
+                       itvec = gistjoinvector(itvec, &vect_len, itvec_local, len_local);
+                       pfree(itvec_local);
+               }
+
+               /* Apply picksplit to list of all collected tuples */
+               dist = gistSplit(state->indexrel, levelstate->pages[0], itvec, vect_len, state->giststate);
+       }
+       else
+       {
+               /* Create splitted layout from single page */
+               dist = (SplitedPageLayout *) palloc0(sizeof(SplitedPageLayout));
+               union_tuple = gistunion(state->indexrel, itvec, vect_len,
+                                                               state->giststate);
+               dist->itup = union_tuple;
+               dist->list = gistfillitupvec(itvec, vect_len, &(dist->lenlist));
+               dist->block.num = vect_len;
+       }
 
-       /*
-        * Form a downlink tuple to represent all the tuples on the page.
-        */
-       oldCtx = MemoryContextSwitchTo(state->giststate->tempCxt);
-       itvec = gistextractpage(pagestate->page, &vect_len);
-       union_tuple = gistunion(state->indexrel, itvec, vect_len,
-                                                       state->giststate);
-       ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
        MemoryContextSwitchTo(oldCtx);
 
-       /*
-        * Insert the downlink to the parent page. If this was the root, create a
-        * new page as the parent, which becomes the new root.
-        */
-       parent = pagestate->parent;
-       if (parent == NULL)
+       /* Reset page counter */
+       levelstate->current_page = 0;
+
+       /* Create pages for all partitions in split result */
+       for (; dist != NULL; dist = dist->next)
        {
-               parent = palloc(sizeof(GistSortedBuildPageState));
-               parent->page = (Page) palloc(BLCKSZ);
-               parent->parent = NULL;
-               gistinitpage(parent->page, 0);
+               char       *data;
+               Page            target;
 
-               pagestate->parent = parent;
-       }
-       gist_indexsortbuild_pagestate_add(state, parent, union_tuple);
+               /* check once per page */
+               CHECK_FOR_INTERRUPTS();
 
-       /* Re-initialize the page buffer for next page on this level. */
-       pagestate->page = palloc(BLCKSZ);
-       gistinitpage(pagestate->page, isleaf ? F_LEAF : 0);
+               /* Create page and copy data */
+               data = (char *) (dist->list);
+               target = palloc0(BLCKSZ);
+               gistinitpage(target, isleaf ? F_LEAF : 0);
+               for (int i = 0; i < dist->block.num; i++)
+               {
+                       IndexTuple      thistup = (IndexTuple) data;
 
-       /*
-        * Set the right link to point to the previous page. This is just for
-        * debugging purposes: GiST only follows the right link if a page is split
-        * concurrently to a scan, and that cannot happen during index build.
-        *
-        * It's a bit counterintuitive that we set the right link on the new page
-        * to point to the previous page, and not the other way round. But GiST
-        * pages are not ordered like B-tree pages are, so as long as the
-        * right-links form a chain through all the pages in the same level, the
-        * order doesn't matter.
-        */
-       GistPageGetOpaque(pagestate->page)->rightlink = blkno;
+                       if (PageAddItem(target, (Item) data, IndexTupleSize(thistup), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
+                               elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->indexrel));
+
+                       data += IndexTupleSize(thistup);
+               }
+               union_tuple = dist->itup;
+
+               if (state->ready_num_pages == XLR_MAX_BLOCK_ID)
+                       gist_indexsortbuild_flush_ready_pages(state);
+
+               /*
+                * The page is now complete. Assign a block number to it, and add it
+                * to the list of finished pages. (We don't write it out immediately,
+                * because we want to WAL-log the pages in batches.)
+                */
+               blkno = state->pages_allocated++;
+               state->ready_blknos[state->ready_num_pages] = blkno;
+               state->ready_pages[state->ready_num_pages] = target;
+               state->ready_num_pages++;
+               ItemPointerSetBlockNumber(&(union_tuple->t_tid), blkno);
+
+               /*
+                * Set the right link to point to the previous page. This is just for
+                * debugging purposes: GiST only follows the right link if a page is
+                * split concurrently to a scan, and that cannot happen during index
+                * build.
+                *
+                * It's a bit counterintuitive that we set the right link on the new
+                * page to point to the previous page, not the other way around. But
+                * GiST pages are not ordered like B-tree pages are, so as long as the
+                * right-links form a chain through all the pages at the same level,
+                * the order doesn't matter.
+                */
+               if (levelstate->last_blkno)
+                       GistPageGetOpaque(target)->rightlink = levelstate->last_blkno;
+               levelstate->last_blkno = blkno;
+
+               /*
+                * Insert the downlink to the parent page. If this was the root,
+                * create a new page as the parent, which becomes the new root.
+                */
+               parent = levelstate->parent;
+               if (parent == NULL)
+               {
+                       parent = palloc0(sizeof(GistSortedBuildLevelState));
+                       parent->pages[0] = (Page) palloc(BLCKSZ);
+                       parent->parent = NULL;
+                       gistinitpage(parent->pages[0], 0);
+
+                       levelstate->parent = parent;
+               }
+               gist_indexsortbuild_levelstate_add(state, parent, union_tuple);
+       }
 }
 
 static void
index 89249ecc97ce7f363a443c5774f637a23b0c3a63..bfb7802f2d3cac942d5e79375cf391a228cde49b 100644 (file)
@@ -969,7 +969,7 @@ GistHstoreOptions
 GistInetKey
 GistNSN
 GistOptBufferingMode
-GistSortedBuildPageState
+GistSortedBuildLevelState
 GistSplitUnion
 GistSplitVector
 GistTsVectorOptions