Buffering GiST index build algorithm.
authorHeikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 8 Sep 2011 14:51:23 +0000 (17:51 +0300)
committerHeikki Linnakangas <heikki.linnakangas@iki.fi>
Thu, 8 Sep 2011 14:51:23 +0000 (17:51 +0300)
When building a GiST index that doesn't fit in cache, buffers are attached
to some internal nodes in the index. This speeds up the build by avoiding
random I/O that would otherwise be needed to traverse all the way down the
tree to find the right leaf page for a tuple.

Alexander Korotkov

doc/src/sgml/gist.sgml
doc/src/sgml/ref/create_index.sgml
src/backend/access/common/reloptions.c
src/backend/access/gist/Makefile
src/backend/access/gist/README
src/backend/access/gist/gist.c
src/backend/access/gist/gistbuild.c [new file with mode: 0644]
src/backend/access/gist/gistbuildbuffers.c [new file with mode: 0644]
src/backend/access/gist/gistutil.c
src/backend/access/gist/gistxlog.c
src/include/access/gist_private.h

index 78171cfa3fd2140e860834d4aee95ec8c46b4ddc..1b6fa1a8817bb032c44e4bee524c40fc80254123 100644 (file)
@@ -642,6 +642,40 @@ my_distance(PG_FUNCTION_ARGS)
 
   </variablelist>
 
+ <sect2 id="gist-buffering-build">
+  <title>GiST buffering build</title>
+  <para>
+   Building large GiST indexes by simply inserting all the tuples tends to be
+   slow, because if the index tuples are scattered across the index and the
+   index is large enough to not fit in cache, the insertions need to perform
+   a lot of random I/O. PostgreSQL from version 9.2 supports a more efficient
+   method to build GiST indexes based on buffering, which can dramatically
+   reduce the number of random I/Os needed for non-ordered data sets. For
+   well-ordered datasets the benefit is smaller or non-existent, because
+   only a small number of pages receive new tuples at a time, and those pages
+   fit in cache even if the index as whole does not.
+  </para>
+
+  <para>
+   However, buffering index build needs to call the <function>penalty</>
+   function more often, which consumes some extra CPU resources. Also, the
+   buffers used in the buffering build need temporary disk space, up to
+   the size of the resulting index. Buffering can also influence the quality
+   of the produced index, in both positive and negative directions. That
+   influence depends on various factors, like the distribution of the input
+   data and operator class implementation.
+  </para>
+
+  <para>
+   By default, the index build switches to the buffering method when the
+   index size reaches <xref linkend="guc-effective-cache-size">. It can
+   be manually turned on or off by the <literal>BUFFERING</literal> parameter
+   to the CREATE INDEX clause. The default behavior is good for most cases,
+   but turning buffering off might speed up the build somewhat if the input
+   data is ordered.
+  </para>
+
+ </sect2>
 </sect1>
 
 <sect1 id="gist-examples">
index 1a1e8d60d75493f2eb9ed73fa3e8eec81efe2106..2cfc9f30f16639e6cc3d3c4fbe1afd73e9454eff 100644 (file)
@@ -340,6 +340,26 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ <replaceable class="parameter">name</
     </listitem>
    </varlistentry>
 
+   </variablelist>
+   <para>
+    GiST indexes additionally accept the following parameter:
+   </para>
+
+   <variablelist>
+
+   <varlistentry>
+    <term><literal>BUFFERING</></term>
+    <listitem>
+    <para>
+     Determines whether the buffering build technique described in
+     <xref linkend="gist-buffering-build"> is used to build the index. With
+     <literal>OFF</> it is disabled, with <literal>ON</> it is enabled, and
+     with <literal>AUTO</> it is initially disabled, but turned on
+     on-the-fly once the index size reaches <xref linkend="guc-effective-cache-size">. The default is <literal>AUTO</>.
+    </para>
+    </listitem>
+   </varlistentry>
+
    </variablelist>
   </refsect2>
 
index 900b222865e15ae482d0ac46b8ed8a159275de23..240e178b3b438cec5a9a10bc57a13d9b3e57ec90 100644 (file)
@@ -219,6 +219,17 @@ static relopt_real realRelOpts[] =
 
 static relopt_string stringRelOpts[] =
 {
+       {
+               {
+                       "buffering",
+                       "Enables buffering build for this GiST index",
+                       RELOPT_KIND_GIST
+               },
+               4,
+               false,
+               gistValidateBufferingOption,
+               "auto"
+       },
        /* list terminator */
        {{NULL}}
 };
index f8051a2b45c8c25d923ea9412ca449dedf345536..cc9468ffb196041c96976e1d55c186f00897af01 100644 (file)
@@ -13,6 +13,6 @@ top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = gist.o gistutil.o gistxlog.o gistvacuum.o gistget.o gistscan.o \
-       gistproc.o gistsplit.o
+       gistproc.o gistsplit.o gistbuild.o gistbuildbuffers.o
 
 include $(top_srcdir)/src/backend/common.mk
index 2d78dcb0dfaf21e348bcca3da8ba3829bcac5288..4bcac1f2c795d07874bb2f6da0413e4d684e5496 100644 (file)
@@ -24,6 +24,7 @@ The current implementation of GiST supports:
   * provides NULL-safe interface to GiST core
   * Concurrency
   * Recovery support via WAL logging
+  * Buffering build algorithm
 
 The support for concurrency implemented in PostgreSQL was developed based on
 the paper "Access Methods for Next-Generation Database Systems" by
@@ -31,6 +32,12 @@ Marcel Kornaker:
 
     http://www.sai.msu.su/~megera/postgres/gist/papers/concurrency/access-methods-for-next-generation.pdf.gz
 
+Buffering build algorithm for GiST was developed based on the paper "Efficient
+Bulk Operations on Dynamic R-trees" by Lars Arge, Klaus Hinrichs, Jan Vahrenhold
+and Jeffrey Scott Vitter.
+
+    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.135.9894&rep=rep1&type=pdf
+
 The original algorithms were modified in several ways:
 
 * They had to be adapted to PostgreSQL conventions. For example, the SEARCH
@@ -278,6 +285,134 @@ would complicate the insertion algorithm. So when an insertion sees a page
 with F_FOLLOW_RIGHT set, it immediately tries to bring the split that
 crashed in the middle to completion by adding the downlink in the parent.
 
+Buffering build algorithm
+-------------------------
+
+In the buffering index build algorithm, some or all internal nodes have a
+buffer attached to them. When a tuple is inserted at the top, the descend down
+the tree is stopped as soon as a buffer is reached, and the tuple is pushed to
+the buffer. When a buffer gets too full, all the tuples in it are flushed to
+the lower level, where they again hit lower level buffers or leaf pages. This
+makes the insertions happen in more of a breadth-first than depth-first order,
+which greatly reduces the amount of random I/O required.
+
+In the algorithm, levels are numbered so that leaf pages have level zero,
+and internal node levels count up from 1. This numbering ensures that a page's
+level number never changes, even when the root page is split.
+
+Level                    Tree
+
+3                         *
+                      /       \
+2                *                 *
+              /  |  \           /  |  \
+1          *     *     *     *     *     *
+          / \   / \   / \   / \   / \   / \
+0        o   o o   o o   o o   o o   o o   o
+
+* - internal page
+o - leaf page
+
+Internal pages that belong to certain levels have buffers associated with
+them. Leaf pages never have buffers. Which levels have buffers is controlled
+by "level step" parameter: level numbers that are multiples of level_step
+have buffers, while others do not. For example, if level_step = 2, then
+pages on levels 2, 4, 6, ... have buffers. If level_step = 1 then every
+internal page has a buffer.
+
+Level        Tree (level_step = 1)                Tree (level_step = 2)
+
+3                      *                                     *
+                   /       \                             /       \
+2             *(b)              *(b)                *(b)              *(b)
+           /  |  \           /  |  \             /  |  \           /  |  \
+1       *(b)  *(b)  *(b)  *(b)  *(b)  *(b)    *     *     *     *     *     *
+       / \   / \   / \   / \   / \   / \     / \   / \   / \   / \   / \   / \
+0     o   o o   o o   o o   o o   o o   o   o   o o   o o   o o   o o   o o   o
+
+(b) - buffer
+
+Logically, a buffer is just a bunch of tuples. Physically, it is divided into
+pages, backed by a temporary file. Each buffer can be in one of two states:
+a) Last page of the buffer is kept in main memory. A node buffer is
+automatically switched to this state when a new index tuple is added to it,
+or a tuple is removed from it.
+b) All pages of the buffer are swapped out to disk. When a buffer becomes too
+full, and we start to flush it, all other buffers are switched to this state.
+
+When an index tuple is inserted, its initial processing can end in one of the
+following points:
+1) Leaf page, if the depth of the index <= level_step, meaning that
+   none of the internal pages have buffers associated with them.
+2) Buffer of topmost level page that has buffers.
+
+New index tuples are processed until one of the buffers in the topmost
+buffered level becomes half-full. When a buffer becomes half-full, it's added
+to the emptying queue, and will be emptied before a new tuple is processed.
+
+Buffer emptying process means that index tuples from the buffer are moved
+into buffers at a lower level, or leaf pages. First, all the other buffers are
+swapped to disk to free up the memory. Then tuples are popped from the buffer
+one by one, and cascaded down the tree to the next buffer or leaf page below
+the buffered node.
+
+Emptying a buffer has the interesting dynamic property that any intermediate
+pages between the buffer being emptied, and the next buffered or leaf level
+below it, become cached. If there are no more buffers below the node, the leaf
+pages where the tuples finally land on get cached too. If there are, the last
+buffer page of each buffer below is kept in memory. This is illustrated in
+the figures below:
+
+   Buffer being emptied to
+     lower-level buffers               Buffer being emptied to leaf pages
+
+               +(fb)                                 +(fb)
+            /     \                                /     \
+        +             +                        +             +
+      /   \         /   \                    /   \         /   \
+    *(ab)   *(ab) *(ab)   *(ab)            x       x     x       x
+
++    - cached internal page
+x    - cached leaf page
+*    - non-cached internal page
+(fb) - buffer being emptied
+(ab) - buffers being appended to, with last page in memory
+
+In the beginning of the index build, the level-step is chosen so that all those
+pages involved in emptying one buffer fit in cache, so after each of those
+pages have been accessed once and cached, emptying a buffer doesn't involve
+any more I/O. This locality is where the speedup of the buffering algorithm
+comes from.
+
+Emptying one buffer can fill up one or more of the lower-level buffers,
+triggering emptying of them as well. Whenever a buffer becomes too full, it's
+added to the emptying queue, and will be emptied after the current buffer has
+been processed.
+
+To keep the size of each buffer limited even in the worst case, buffer emptying
+is scheduled as soon as a buffer becomes half-full, and emptying it continues
+until 1/2 of the nominal buffer size worth of tuples has been emptied. This
+guarantees that when buffer emptying begins, all the lower-level buffers
+are at most half-full. In the worst case that all the tuples are cascaded down
+to the same lower-level buffer, that buffer therefore has enough space to
+accommodate all the tuples emptied from the upper-level buffer. There is no
+hard size limit in any of the data structures used, though, so this only needs
+to be approximate; small overfilling of some buffers doesn't matter.
+
+If an internal page that has a buffer associated with it is split, the buffer
+needs to be split too. All tuples in the buffer are scanned through and
+relocated to the correct sibling buffers, using the penalty function to decide
+which buffer each tuple should go to.
+
+After all tuples from the heap have been processed, there are still some index
+tuples in the buffers. At this point, final buffer emptying starts. All buffers
+are emptied in top-down order. This is slightly complicated by the fact that
+new buffers can be allocated during the emptying, due to page splits. However,
+the new buffers will always be siblings of buffers that haven't been fully
+emptied yet; tuples never move upwards in the tree. The final emptying loops
+through buffers at a given level until all buffers at that level have been
+emptied, and then moves down to the next level.
+
 
 Authors:
        Teodor Sigaev   <teodor@sigaev.ru>
index 4fc7a213b6d1ae8ecc27e716bc1ff80702a7061c..24f30099a1cb291f9d15884eb0b2715574bd5bba 100644 (file)
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
-/* Working state for gistbuild and its callback */
-typedef struct
-{
-       GISTSTATE       giststate;
-       int                     numindexattrs;
-       double          indtuples;
-       MemoryContext tmpCtx;
-} GISTBuildState;
-
-/* A List of these is used represent a split-in-progress. */
-typedef struct
-{
-       Buffer          buf;                    /* the split page "half" */
-       IndexTuple      downlink;               /* downlink for this half. */
-} GISTPageSplitInfo;
-
 /* non-export function prototypes */
-static void gistbuildCallback(Relation index,
-                                 HeapTuple htup,
-                                 Datum *values,
-                                 bool *isnull,
-                                 bool tupleIsAlive,
-                                 void *state);
-static void gistdoinsert(Relation r,
-                        IndexTuple itup,
-                        Size freespace,
-                        GISTSTATE *GISTstate);
 static void gistfixsplit(GISTInsertState *state, GISTSTATE *giststate);
 static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
                                 GISTSTATE *giststate,
@@ -88,138 +62,6 @@ createTempGistContext(void)
                                                                 ALLOCSET_DEFAULT_MAXSIZE);
 }
 
-/*
- * Routine to build an index.  Basically calls insert over and over.
- *
- * XXX: it would be nice to implement some sort of bulk-loading
- * algorithm, but it is not clear how to do that.
- */
-Datum
-gistbuild(PG_FUNCTION_ARGS)
-{
-       Relation        heap = (Relation) PG_GETARG_POINTER(0);
-       Relation        index = (Relation) PG_GETARG_POINTER(1);
-       IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
-       IndexBuildResult *result;
-       double          reltuples;
-       GISTBuildState buildstate;
-       Buffer          buffer;
-       Page            page;
-
-       /*
-        * We expect to be called exactly once for any index relation. If that's
-        * not the case, big trouble's what we have.
-        */
-       if (RelationGetNumberOfBlocks(index) != 0)
-               elog(ERROR, "index \"%s\" already contains data",
-                        RelationGetRelationName(index));
-
-       /* no locking is needed */
-       initGISTstate(&buildstate.giststate, index);
-
-       /* initialize the root page */
-       buffer = gistNewBuffer(index);
-       Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
-       page = BufferGetPage(buffer);
-
-       START_CRIT_SECTION();
-
-       GISTInitBuffer(buffer, F_LEAF);
-
-       MarkBufferDirty(buffer);
-
-       if (RelationNeedsWAL(index))
-       {
-               XLogRecPtr      recptr;
-               XLogRecData rdata;
-
-               rdata.data = (char *) &(index->rd_node);
-               rdata.len = sizeof(RelFileNode);
-               rdata.buffer = InvalidBuffer;
-               rdata.next = NULL;
-
-               recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
-               PageSetLSN(page, recptr);
-               PageSetTLI(page, ThisTimeLineID);
-       }
-       else
-               PageSetLSN(page, GetXLogRecPtrForTemp());
-
-       UnlockReleaseBuffer(buffer);
-
-       END_CRIT_SECTION();
-
-       /* build the index */
-       buildstate.numindexattrs = indexInfo->ii_NumIndexAttrs;
-       buildstate.indtuples = 0;
-
-       /*
-        * create a temporary memory context that is reset once for each tuple
-        * inserted into the index
-        */
-       buildstate.tmpCtx = createTempGistContext();
-
-       /* do the heap scan */
-       reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
-                                                                  gistbuildCallback, (void *) &buildstate);
-
-       /* okay, all heap tuples are indexed */
-       MemoryContextDelete(buildstate.tmpCtx);
-
-       freeGISTstate(&buildstate.giststate);
-
-       /*
-        * Return statistics
-        */
-       result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
-
-       result->heap_tuples = reltuples;
-       result->index_tuples = buildstate.indtuples;
-
-       PG_RETURN_POINTER(result);
-}
-
-/*
- * Per-tuple callback from IndexBuildHeapScan
- */
-static void
-gistbuildCallback(Relation index,
-                                 HeapTuple htup,
-                                 Datum *values,
-                                 bool *isnull,
-                                 bool tupleIsAlive,
-                                 void *state)
-{
-       GISTBuildState *buildstate = (GISTBuildState *) state;
-       IndexTuple      itup;
-       MemoryContext oldCtx;
-
-       oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
-
-       /* form an index tuple and point it at the heap tuple */
-       itup = gistFormTuple(&buildstate->giststate, index,
-                                                values, isnull, true /* size is currently bogus */ );
-       itup->t_tid = htup->t_self;
-
-       /*
-        * Since we already have the index relation locked, we call gistdoinsert
-        * directly.  Normal access method calls dispatch through gistinsert,
-        * which locks the relation for write.  This is the right thing to do if
-        * you're inserting single tups, but not when you're initializing the
-        * whole index at once.
-        *
-        * In this path we respect the fillfactor setting, whereas insertions
-        * after initial build do not.
-        */
-       gistdoinsert(index, itup,
-                         RelationGetTargetPageFreeSpace(index, GIST_DEFAULT_FILLFACTOR),
-                                &buildstate->giststate);
-
-       buildstate->indtuples += 1;
-       MemoryContextSwitchTo(oldCtx);
-       MemoryContextReset(buildstate->tmpCtx);
-}
-
 /*
  *     gistbuildempty() -- build an empty gist index in the initialization fork
  */
@@ -285,6 +127,11 @@ gistinsert(PG_FUNCTION_ARGS)
  * to the right of 'leftchildbuf', or updating the downlink for 'leftchildbuf'.
  * F_FOLLOW_RIGHT flag on 'leftchildbuf' is cleared and NSN is set.
  *
+ * If 'markfollowright' is true and the page is split, the left child is
+ * marked with F_FOLLOW_RIGHT flag. That is the normal case. During buffered
+ * index build, however, there is no concurrent access and the page splitting
+ * is done in a slightly simpler fashion, and false is passed.
+ *
  * If there is not enough room on the page, it is split. All the split
  * pages are kept pinned and locked and returned in *splitinfo, the caller
  * is responsible for inserting the downlinks for them. However, if
@@ -293,13 +140,16 @@ gistinsert(PG_FUNCTION_ARGS)
  * In that case, we continue to hold the root page locked, and the child
  * pages are released; note that new tuple(s) are *not* on the root page
  * but in one of the new child pages.
+ *
+ * Returns 'true' if the page was split, 'false' otherwise.
  */
-static bool
-gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
+bool
+gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
                                Buffer buffer,
                                IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
                                Buffer leftchildbuf,
-                               List **splitinfo)
+                               List **splitinfo,
+                               bool markfollowright)
 {
        Page            page = BufferGetPage(buffer);
        bool            is_leaf = (GistPageIsLeaf(page)) ? true : false;
@@ -331,7 +181,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
         * one-element todelete array; in the split case, it's handled implicitly
         * because the tuple vector passed to gistSplit won't include this tuple.
         */
-       is_split = gistnospace(page, itup, ntup, oldoffnum, state->freespace);
+       is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
        if (is_split)
        {
                /* no space for insertion */
@@ -362,7 +212,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                                memmove(itvec + pos, itvec + pos + 1, sizeof(IndexTuple) * (tlen - pos));
                }
                itvec = gistjoinvector(itvec, &tlen, itup, ntup);
-               dist = gistSplit(state->r, page, itvec, tlen, giststate);
+               dist = gistSplit(rel, page, itvec, tlen, giststate);
 
                /*
                 * Set up pages to work with. Allocate new buffers for all but the
@@ -392,7 +242,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                for (; ptr; ptr = ptr->next)
                {
                        /* Allocate new page */
-                       ptr->buffer = gistNewBuffer(state->r);
+                       ptr->buffer = gistNewBuffer(rel);
                        GISTInitBuffer(ptr->buffer, (is_leaf) ? F_LEAF : 0);
                        ptr->page = BufferGetPage(ptr->buffer);
                        ptr->block.blkno = BufferGetBlockNumber(ptr->buffer);
@@ -463,7 +313,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                        for (i = 0; i < ptr->block.num; i++)
                        {
                                if (PageAddItem(ptr->page, (Item) data, IndexTupleSize((IndexTuple) data), i + FirstOffsetNumber, false, false) == InvalidOffsetNumber)
-                                       elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(state->r));
+                                       elog(ERROR, "failed to add item to index page in \"%s\"", RelationGetRelationName(rel));
                                data += IndexTupleSize((IndexTuple) data);
                        }
 
@@ -474,7 +324,15 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                        else
                                GistPageGetOpaque(ptr->page)->rightlink = oldrlink;
 
-                       if (ptr->next && !is_rootsplit)
+                       /*
+                        * Mark all but the rightmost page with the follow-right
+                        * flag. It will be cleared as soon as the downlink is inserted
+                        * into the parent, but this ensures that if we error out before
+                        * that, the index is still consistent. (in buffering build mode,
+                        * any error will abort the index build anyway, so this is not
+                        * needed.)
+                        */
+                       if (ptr->next && !is_rootsplit && markfollowright)
                                GistMarkFollowRight(ptr->page);
                        else
                                GistClearFollowRight(ptr->page);
@@ -506,9 +364,10 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                dist->page = BufferGetPage(dist->buffer);
 
                /* Write the WAL record */
-               if (RelationNeedsWAL(state->r))
-                       recptr = gistXLogSplit(state->r->rd_node, blkno, is_leaf,
-                                                                  dist, oldrlink, oldnsn, leftchildbuf);
+               if (RelationNeedsWAL(rel))
+                       recptr = gistXLogSplit(rel->rd_node, blkno, is_leaf,
+                                                                  dist, oldrlink, oldnsn, leftchildbuf,
+                                                                  markfollowright);
                else
                        recptr = GetXLogRecPtrForTemp();
 
@@ -547,7 +406,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                if (BufferIsValid(leftchildbuf))
                        MarkBufferDirty(leftchildbuf);
 
-               if (RelationNeedsWAL(state->r))
+               if (RelationNeedsWAL(rel))
                {
                        OffsetNumber ndeloffs = 0,
                                                deloffs[1];
@@ -558,7 +417,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                                ndeloffs = 1;
                        }
 
-                       recptr = gistXLogUpdate(state->r->rd_node, buffer,
+                       recptr = gistXLogUpdate(rel->rd_node, buffer,
                                                                        deloffs, ndeloffs, itup, ntup,
                                                                        leftchildbuf);
 
@@ -570,8 +429,6 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
                        recptr = GetXLogRecPtrForTemp();
                        PageSetLSN(page, recptr);
                }
-
-               *splitinfo = NIL;
        }
 
        /*
@@ -608,7 +465,7 @@ gistplacetopage(GISTInsertState *state, GISTSTATE *giststate,
  * this routine assumes it is invoked in a short-lived memory context,
  * so it does not bother releasing palloc'd allocations.
  */
-static void
+void
 gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 {
        ItemId          iid;
@@ -1192,10 +1049,12 @@ gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
        List       *splitinfo;
        bool            is_split;
 
-       is_split = gistplacetopage(state, giststate, stack->buffer,
+       is_split = gistplacetopage(state->r, state->freespace, giststate,
+                                                          stack->buffer,
                                                           tuples, ntup, oldoffnum,
                                                           leftchild,
-                                                          &splitinfo);
+                                                          &splitinfo,
+                                                          true);
        if (splitinfo)
                gistfinishsplit(state, stack, giststate, splitinfo);
 
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
new file mode 100644 (file)
index 0000000..8319238
--- /dev/null
@@ -0,0 +1,1068 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuild.c
+ *       implementation of the buffering build algorithm for GiST indexes.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/gist/gistbuild.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "optimizer/cost.h"
+#include "storage/bufmgr.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+/* Interval, in index tuples, between checks whether to switch to buffering build mode */
+#define BUFFERING_MODE_SWITCH_CHECK_STEP 256
+
+/*
+ * Number of tuples to process in the slow way before switching to buffering
+ * mode, when buffering is explicitly turned on. Also, the number of tuples
+ * to process between readjusting the buffer size parameter, while in
+ * buffering mode.
+ */
+#define BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET 4096
+
+typedef enum
+{
+       GIST_BUFFERING_DISABLED,        /* in regular build mode and aren't going to
+                                                                * switch */
+       GIST_BUFFERING_AUTO,            /* in regular build mode, but will switch to
+                                                                * buffering build mode if the index grows too
+                                                                * big */
+       GIST_BUFFERING_STATS,           /* gathering statistics of index tuple size
+                                                                * before switching to the buffering build
+                                                                * mode */
+       GIST_BUFFERING_ACTIVE           /* in buffering build mode */
+}      GistBufferingMode;
+
+/* Working state for gistbuild and its callback */
+typedef struct
+{
+       Relation        indexrel;
+       GISTSTATE       giststate;
+       GISTBuildBuffers *gfbb;
+
+       int64           indtuples;              /* number of tuples indexed */
+       int64           indtuplesSize;  /* total size of all indexed tuples */
+
+       Size            freespace;              /* amount of free space to leave on pages */
+
+       GistBufferingMode bufferingMode;
+       MemoryContext tmpCtx;
+} GISTBuildState;
+
+static void gistInitBuffering(GISTBuildState *buildstate);
+static int     calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep);
+static void gistBuildCallback(Relation index,
+                                 HeapTuple htup,
+                                 Datum *values,
+                                 bool *isnull,
+                                 bool tupleIsAlive,
+                                 void *state);
+static void gistBufferingBuildInsert(GISTBuildState *buildstate,
+                                                IndexTuple itup);
+static bool gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+                               GISTBufferingInsertStack *startparent);
+static void gistbufferinginserttuples(GISTBuildState *buildstate,
+                                                 Buffer buffer,
+                                                 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+                                                 GISTBufferingInsertStack *path);
+static void gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+                                                          GISTBufferingInsertStack *child);
+static void gistProcessEmptyingQueue(GISTBuildState *buildstate);
+static void gistEmptyAllBuffers(GISTBuildState *buildstate);
+static void gistFreeUnreferencedPath(GISTBufferingInsertStack *path);
+static int     gistGetMaxLevel(Relation index);
+
+
+/*
+ * Main entry point to GiST index build. Initially calls insert over and over,
+ * but switches to more efficient buffering build algorithm after a certain
+ * number of tuples (unless buffering mode is disabled).
+ */
+Datum
+gistbuild(PG_FUNCTION_ARGS)
+{
+       Relation        heap = (Relation) PG_GETARG_POINTER(0);
+       Relation        index = (Relation) PG_GETARG_POINTER(1);
+       IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
+       IndexBuildResult *result;
+       double          reltuples;
+       GISTBuildState buildstate;
+       Buffer          buffer;
+       Page            page;
+       MemoryContext oldcxt = CurrentMemoryContext;
+       int                     fillfactor;
+
+       buildstate.indexrel = index;
+       if (index->rd_options)
+       {
+               /* Get buffering mode from the options string */
+               GiSTOptions *options = (GiSTOptions *) index->rd_options;
+               char       *bufferingMode = (char *) options + options->bufferingModeOffset;
+
+               if (strcmp(bufferingMode, "on") == 0)
+                       buildstate.bufferingMode = GIST_BUFFERING_STATS;
+               else if (strcmp(bufferingMode, "off") == 0)
+                       buildstate.bufferingMode = GIST_BUFFERING_DISABLED;
+               else
+                       buildstate.bufferingMode = GIST_BUFFERING_AUTO;
+
+               fillfactor = options->fillfactor;
+       }
+       else
+       {
+               /*
+                * By default, switch to buffering mode when the index grows too large
+                * to fit in cache.
+                */
+               buildstate.bufferingMode = GIST_BUFFERING_AUTO;
+               fillfactor = GIST_DEFAULT_FILLFACTOR;
+       }
+       /* Calculate target amount of free space to leave on pages */
+       buildstate.freespace = BLCKSZ * (100 - fillfactor) / 100;
+
+       /*
+        * We expect to be called exactly once for any index relation. If that's
+        * not the case, big trouble's what we have.
+        */
+       if (RelationGetNumberOfBlocks(index) != 0)
+               elog(ERROR, "index \"%s\" already contains data",
+                        RelationGetRelationName(index));
+
+       /* no locking is needed */
+       initGISTstate(&buildstate.giststate, index);
+
+       /* initialize the root page */
+       buffer = gistNewBuffer(index);
+       Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO);
+       page = BufferGetPage(buffer);
+
+       START_CRIT_SECTION();
+
+       GISTInitBuffer(buffer, F_LEAF);
+
+       MarkBufferDirty(buffer);
+
+       if (RelationNeedsWAL(index))
+       {
+               XLogRecPtr      recptr;
+               XLogRecData rdata;
+
+               rdata.data = (char *) &(index->rd_node);
+               rdata.len = sizeof(RelFileNode);
+               rdata.buffer = InvalidBuffer;
+               rdata.next = NULL;
+
+               recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_CREATE_INDEX, &rdata);
+               PageSetLSN(page, recptr);
+               PageSetTLI(page, ThisTimeLineID);
+       }
+       else
+               PageSetLSN(page, GetXLogRecPtrForTemp());
+
+       UnlockReleaseBuffer(buffer);
+
+       END_CRIT_SECTION();
+
+       /* build the index */
+       buildstate.indtuples = 0;
+       buildstate.indtuplesSize = 0;
+
+       /*
+        * create a temporary memory context that is reset once for each tuple
+        * processed.
+        */
+       buildstate.tmpCtx = createTempGistContext();
+
+       /*
+        * Do the heap scan.
+        */
+       reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
+                                                                  gistBuildCallback, (void *) &buildstate);
+
+       /*
+        * If buffering was used, flush out all the tuples that are still in the
+        * buffers.
+        */
+       if (buildstate.bufferingMode == GIST_BUFFERING_ACTIVE)
+       {
+               elog(DEBUG1, "all tuples processed, emptying buffers");
+               gistEmptyAllBuffers(&buildstate);
+       }
+
+       /* okay, all heap tuples are indexed */
+       MemoryContextSwitchTo(oldcxt);
+       MemoryContextDelete(buildstate.tmpCtx);
+
+       freeGISTstate(&buildstate.giststate);
+
+       /*
+        * Return statistics
+        */
+       result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+       result->heap_tuples = reltuples;
+       result->index_tuples = (double) buildstate.indtuples;
+
+       PG_RETURN_POINTER(result);
+}
+
+/*
+ * Validator for "buffering" reloption on GiST indexes. Allows "on", "off"
+ * and "auto" values.
+ */
+void
+gistValidateBufferingOption(char *value)
+{
+       if (value == NULL ||
+               (strcmp(value, "on") != 0 &&
+                strcmp(value, "off") != 0 &&
+                strcmp(value, "auto") != 0))
+       {
+               ereport(ERROR,
+                               (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                errmsg("invalid value for \"buffering\" option"),
+                          errdetail("Valid values are \"on\", \"off\" and \"auto\".")));
+       }
+}
+
+/*
+ * Attempt to switch to buffering mode.
+ *
+ * If there is not enough memory for buffering build, sets bufferingMode
+ * to GIST_BUFFERING_DISABLED, so that we don't bother to try the switch
+ * anymore. Otherwise initializes the build buffers, and sets bufferingMode to
+ * GIST_BUFFERING_ACTIVE.
+ */
+static void
+gistInitBuffering(GISTBuildState *buildstate)
+{
+       Relation        index = buildstate->indexrel;
+       int                     pagesPerBuffer;
+       Size            pageFreeSpace;
+       Size            itupAvgSize,
+                               itupMinSize;
+       double          avgIndexTuplesPerPage,
+                               maxIndexTuplesPerPage;
+       int                     i;
+       int                     levelStep;
+
+       /* Calc space of index page which is available for index tuples */
+       pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+               - sizeof(ItemIdData)
+               - buildstate->freespace;
+
+       /*
+        * Calculate average size of already inserted index tuples using gathered
+        * statistics.
+        */
+       itupAvgSize = (double) buildstate->indtuplesSize /
+               (double) buildstate->indtuples;
+
+       /*
+        * Calculate minimal possible size of index tuple by index metadata.
+        * Minimal possible size of varlena is VARHDRSZ.
+        *
+        * XXX: that's not actually true, as a short varlen can be just 2 bytes.
+        * And we should take padding into account here.
+        */
+       itupMinSize = (Size) MAXALIGN(sizeof(IndexTupleData));
+       for (i = 0; i < index->rd_att->natts; i++)
+       {
+               if (index->rd_att->attrs[i]->attlen < 0)
+                       itupMinSize += VARHDRSZ;
+               else
+                       itupMinSize += index->rd_att->attrs[i]->attlen;
+       }
+
+       /* Calculate average and maximal number of index tuples which fit on a page */
+       avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+       maxIndexTuplesPerPage = pageFreeSpace / itupMinSize;
+
+       /*
+        * We need to calculate two parameters for the buffering algorithm:
+        * levelStep and pagesPerBuffer.
+        *
+        * levelStep determines the size of subtree that we operate on, while
+        * emptying a buffer. A higher value is better, as you need fewer buffer
+        * emptying steps to build the index. However, if you set it too high, the
+        * subtree doesn't fit in cache anymore, and you quickly lose the benefit
+        * of the buffers.
+        *
+        * In Arge et al's paper, levelStep is chosen as logB(M/4B), where B is
+        * the number of tuples on page (ie. fanout), and M is the amount of
+        * internal memory available. Curiously, they don't explain *why* that
+        * setting is optimal. We calculate it by taking the highest levelStep so
+        * that a subtree still fits in cache. For a small B, our way of
+        * calculating levelStep is very close to Arge et al's formula. For a
+        * large B, our formula gives a value that is 2x higher.
+        *
+        * The average size of a subtree of depth n can be calculated as a
+        * geometric series:
+        *
+        * B^0 + B^1 + B^2 + ... + B^n = (1 - B^(n + 1)) / (1 - B)
+        *
+        * where B is the average number of index tuples on page. The subtree is
+        * cached in the shared buffer cache and the OS cache, so we choose
+        * levelStep so that the subtree size is comfortably smaller than
+        * effective_cache_size, with a safety factor of 4.
+        *
+        * The estimate on the average number of index tuples on page is based on
+        * average tuple sizes observed before switching to buffered build, so the
+        * real subtree size can be somewhat larger. Also, it would be selfish to
+        * gobble the whole cache for our index build. The safety factor of 4
+        * should account for those effects.
+        *
+        * The other limiting factor for setting levelStep is that while
+        * processing a subtree, we need to hold one page for each buffer at the
+        * next lower buffered level. The max. number of buffers needed for that
+        * is maxIndexTuplesPerPage^levelStep. This is very conservative, but
+        * hopefully maintenance_work_mem is set high enough that you're
+        * constrained by effective_cache_size rather than maintenance_work_mem.
+        *
+        * XXX: the buffer hash table consumes a fair amount of memory too per
+        * buffer, but that is not currently taken into account. That scales on
+        * the total number of buffers used, ie. the index size and on levelStep.
+        * Note that a higher levelStep *reduces* the amount of memory needed for
+        * the hash table.
+        */
+       levelStep = 1;
+       while (
+       /* subtree must fit in cache (with safety factor of 4) */
+                  (1 - pow(avgIndexTuplesPerPage, (double) (levelStep + 1))) / (1 - avgIndexTuplesPerPage) < effective_cache_size / 4
+                  &&
+       /* each node in the lowest level of a subtree has one page in memory */
+                  (pow(maxIndexTuplesPerPage, (double) levelStep) < (maintenance_work_mem * 1024) / BLCKSZ)
+               )
+       {
+               levelStep++;
+       }
+
+       /*
+        * We just reached an unacceptable value of levelStep in previous loop.
+        * So, decrease levelStep to get last acceptable value.
+        */
+       levelStep--;
+
+       /*
+        * If there's not enough cache or maintenance_work_mem, fall back to plain
+        * inserts.
+        */
+       if (levelStep <= 0)
+       {
+               elog(DEBUG1, "failed to switch to buffered GiST build");
+               buildstate->bufferingMode = GIST_BUFFERING_DISABLED;
+               return;
+       }
+
+       /*
+        * The second parameter to set is pagesPerBuffer, which determines the
+        * size of each buffer. We adjust pagesPerBuffer also during the build,
+        * which is why this calculation is in a separate function.
+        */
+       pagesPerBuffer = calculatePagesPerBuffer(buildstate, levelStep);
+
+       /* Initialize GISTBuildBuffers with these parameters */
+       buildstate->gfbb = gistInitBuildBuffers(pagesPerBuffer, levelStep,
+                                                                                       gistGetMaxLevel(index));
+
+       buildstate->bufferingMode = GIST_BUFFERING_ACTIVE;
+
+       elog(DEBUG1, "switched to buffered GiST build; level step = %d, pagesPerBuffer = %d",
+                levelStep, pagesPerBuffer);
+}
+
+/*
+ * Calculate pagesPerBuffer parameter for the buffering algorithm.
+ *
+ * Buffer size is chosen so that assuming that tuples are distributed
+ * randomly, emptying half a buffer fills on average one page in every buffer
+ * at the next lower level.
+ */
+static int
+calculatePagesPerBuffer(GISTBuildState *buildstate, int levelStep)
+{
+       double          pagesPerBuffer;
+       double          avgIndexTuplesPerPage;
+       double          itupAvgSize;
+       Size            pageFreeSpace;
+
+       /* Calc space of index page which is available for index tuples */
+       pageFreeSpace = BLCKSZ - SizeOfPageHeaderData - sizeof(GISTPageOpaqueData)
+               - sizeof(ItemIdData)
+               - buildstate->freespace;
+
+       /*
+        * Calculate average size of already inserted index tuples using gathered
+        * statistics.
+        */
+       itupAvgSize = (double) buildstate->indtuplesSize /
+               (double) buildstate->indtuples;
+
+       avgIndexTuplesPerPage = pageFreeSpace / itupAvgSize;
+
+       /*
+        * Recalculate required size of buffers.
+        */
+       pagesPerBuffer = 2 * pow(avgIndexTuplesPerPage, levelStep);
+
+       return round(pagesPerBuffer);
+}
+
+/*
+ * Per-tuple callback from IndexBuildHeapScan.
+ */
+static void
+gistBuildCallback(Relation index,
+                                 HeapTuple htup,
+                                 Datum *values,
+                                 bool *isnull,
+                                 bool tupleIsAlive,
+                                 void *state)
+{
+       GISTBuildState *buildstate = (GISTBuildState *) state;
+       IndexTuple      itup;
+       MemoryContext oldCtx;
+
+       oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+       /* form an index tuple and point it at the heap tuple */
+       itup = gistFormTuple(&buildstate->giststate, index, values, isnull, true);
+       itup->t_tid = htup->t_self;
+
+       if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE)
+       {
+               /* We have buffers, so use them. */
+               gistBufferingBuildInsert(buildstate, itup);
+       }
+       else
+       {
+               /*
+                * There are no buffers (yet). Since we already have the index relation
+                * locked, we call gistdoinsert directly.
+                */
+               gistdoinsert(index, itup, buildstate->freespace,
+                                        &buildstate->giststate);
+       }
+
+       /* Update tuple count and total size. */
+       buildstate->indtuples += 1;
+       buildstate->indtuplesSize += IndexTupleSize(itup);
+
+       MemoryContextSwitchTo(oldCtx);
+       MemoryContextReset(buildstate->tmpCtx);
+
+       if (buildstate->bufferingMode == GIST_BUFFERING_ACTIVE &&
+               buildstate->indtuples % BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET == 0)
+       {
+               /* Adjust the target buffer size now */
+               buildstate->gfbb->pagesPerBuffer =
+                       calculatePagesPerBuffer(buildstate, buildstate->gfbb->levelStep);
+       }
+
+       /*
+        * In 'auto' mode, check if the index has grown too large to fit in cache,
+        * and switch to buffering mode if it has.
+        *
+        * To avoid excessive calls to smgrnblocks(), only check this every
+        * BUFFERING_MODE_SWITCH_CHECK_STEP index tuples
+        */
+       if ((buildstate->bufferingMode == GIST_BUFFERING_AUTO &&
+                buildstate->indtuples % BUFFERING_MODE_SWITCH_CHECK_STEP == 0 &&
+                effective_cache_size < smgrnblocks(index->rd_smgr, MAIN_FORKNUM)) ||
+               (buildstate->bufferingMode == GIST_BUFFERING_STATS &&
+                buildstate->indtuples >= BUFFERING_MODE_TUPLE_SIZE_STATS_TARGET))
+       {
+               /*
+                * Index doesn't fit in effective cache anymore. Try to switch to
+                * buffering build mode.
+                */
+               gistInitBuffering(buildstate);
+       }
+}
+
+/*
+ * Insert function for buffering index build.
+ */
+static void
+gistBufferingBuildInsert(GISTBuildState *buildstate, IndexTuple itup)
+{
+       /* Insert the tuple to buffers. */
+       gistProcessItup(buildstate, itup, NULL);
+
+       /* If we filled up (half of a) buffer, process buffer emptying. */
+       gistProcessEmptyingQueue(buildstate);
+}
+
+/*
+ * Process an index tuple. Runs the tuple down the tree until we reach a leaf
+ * page or node buffer, and inserts the tuple there. Returns true if we have
+ * to stop buffer emptying process (because one of child buffers can't take
+ * index tuples anymore).
+ */
+static bool
+gistProcessItup(GISTBuildState *buildstate, IndexTuple itup,
+                               GISTBufferingInsertStack *startparent)
+{
+       GISTSTATE  *giststate = &buildstate->giststate;
+       GISTBuildBuffers *gfbb = buildstate->gfbb;
+       Relation        indexrel = buildstate->indexrel;
+       GISTBufferingInsertStack *path;
+       BlockNumber childblkno;
+       Buffer          buffer;
+       bool            result = false;
+
+       /*
+        * NULL passed in startparent means that we start index tuple processing
+        * from the root.
+        */
+       if (!startparent)
+               path = gfbb->rootitem;
+       else
+               path = startparent;
+
+       /*
+        * Loop until we reach a leaf page (level == 0) or a level with buffers
+        * (not including the level we start at, because we would otherwise make
+        * no progress).
+        */
+       for (;;)
+       {
+               ItemId          iid;
+               IndexTuple      idxtuple,
+                                       newtup;
+               Page            page;
+               OffsetNumber childoffnum;
+               GISTBufferingInsertStack *parent;
+
+               /* Have we reached a level with buffers? */
+               if (LEVEL_HAS_BUFFERS(path->level, gfbb) && path != startparent)
+                       break;
+
+               /* Have we reached a leaf page? */
+               if (path->level == 0)
+                       break;
+
+               /*
+                * Nope. Descend down to the next level then. Choose a child to
+                * descend down to.
+                */
+               buffer = ReadBuffer(indexrel, path->blkno);
+               LockBuffer(buffer, GIST_EXCLUSIVE);
+
+               page = (Page) BufferGetPage(buffer);
+               childoffnum = gistchoose(indexrel, page, itup, giststate);
+               iid = PageGetItemId(page, childoffnum);
+               idxtuple = (IndexTuple) PageGetItem(page, iid);
+               childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+               /*
+                * Check that the key representing the target child node is consistent
+                * with the key we're inserting. Update it if it's not.
+                */
+               newtup = gistgetadjusted(indexrel, idxtuple, itup, giststate);
+               if (newtup)
+                       gistbufferinginserttuples(buildstate, buffer, &newtup, 1,
+                                                                         childoffnum, path);
+               UnlockReleaseBuffer(buffer);
+
+               /* Create new path item representing current page */
+               parent = path;
+               path = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
+                                                                                  sizeof(GISTBufferingInsertStack));
+               path->parent = parent;
+               path->level = parent->level - 1;
+               path->blkno = childblkno;
+               path->downlinkoffnum = childoffnum;
+               path->refCount = 0;             /* it's unreferenced for now */
+
+               /* Adjust reference count of parent */
+               if (parent)
+                       parent->refCount++;
+       }
+
+       if (LEVEL_HAS_BUFFERS(path->level, gfbb))
+       {
+               /*
+                * We've reached level with buffers. Place the index tuple to the
+                * buffer, and add the buffer to the emptying queue if it overflows.
+                */
+               GISTNodeBuffer *childNodeBuffer;
+
+               /* Find the buffer or create a new one */
+               childNodeBuffer = gistGetNodeBuffer(gfbb, giststate, path->blkno,
+                                                                                path->downlinkoffnum, path->parent);
+
+               /* Add index tuple to it */
+               gistPushItupToNodeBuffer(gfbb, childNodeBuffer, itup);
+
+               if (BUFFER_OVERFLOWED(childNodeBuffer, gfbb))
+                       result = true;
+       }
+       else
+       {
+               /*
+                * We've reached a leaf page. Place the tuple here.
+                */
+               buffer = ReadBuffer(indexrel, path->blkno);
+               LockBuffer(buffer, GIST_EXCLUSIVE);
+               gistbufferinginserttuples(buildstate, buffer, &itup, 1,
+                                                                 InvalidOffsetNumber, path);
+               UnlockReleaseBuffer(buffer);
+       }
+
+       /*
+        * Free unreferenced path items, if any. Path item may be referenced by
+        * node buffer.
+        */
+       gistFreeUnreferencedPath(path);
+
+       return result;
+}
+
+/*
+ * Insert tuples to a given page.
+ *
+ * This is analogous with gistinserttuples() in the regular insertion code.
+ */
+static void
+gistbufferinginserttuples(GISTBuildState *buildstate, Buffer buffer,
+                                                 IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+                                                 GISTBufferingInsertStack *path)
+{
+       GISTBuildBuffers *gfbb = buildstate->gfbb;
+       List       *splitinfo;
+       bool            is_split;
+
+       is_split = gistplacetopage(buildstate->indexrel,
+                                                          buildstate->freespace,
+                                                          &buildstate->giststate,
+                                                          buffer,
+                                                          itup, ntup, oldoffnum,
+                                                          InvalidBuffer,
+                                                          &splitinfo,
+                                                          false);
+
+       /*
+        * If this is a root split, update the root path item kept in memory. This
+        * ensures that all path stacks are always complete, including all parent
+        * nodes up to the root. That simplifies the algorithm to re-find correct
+        * parent.
+        */
+       if (is_split && BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO)
+       {
+               GISTBufferingInsertStack *oldroot = gfbb->rootitem;
+               Page            page = BufferGetPage(buffer);
+               ItemId          iid;
+               IndexTuple      idxtuple;
+               BlockNumber leftmostchild;
+
+               gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
+                                                       gfbb->context, sizeof(GISTBufferingInsertStack));
+               gfbb->rootitem->parent = NULL;
+               gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+               gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+               gfbb->rootitem->level = oldroot->level + 1;
+               gfbb->rootitem->refCount = 1;
+
+               /*
+                * All the downlinks on the old root page are now on one of the child
+                * pages. Change the block number of the old root entry in the stack
+                * to point to the leftmost child. The other child pages will be
+                * accessible from there by walking right.
+                */
+               iid = PageGetItemId(page, FirstOffsetNumber);
+               idxtuple = (IndexTuple) PageGetItem(page, iid);
+               leftmostchild = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+               oldroot->parent = gfbb->rootitem;
+               oldroot->blkno = leftmostchild;
+               oldroot->downlinkoffnum = InvalidOffsetNumber;
+       }
+
+       if (splitinfo)
+       {
+               /*
+                * Insert the downlinks to the parent. This is analogous with
+                * gistfinishsplit() in the regular insertion code, but the locking is
+                * simpler, and we have to maintain the buffers.
+                */
+               IndexTuple *downlinks;
+               int                     ndownlinks,
+                                       i;
+               Buffer          parentBuffer;
+               ListCell   *lc;
+
+               /* Parent may have changed since we memorized this path. */
+               gistBufferingFindCorrectParent(buildstate, path);
+
+               /*
+                * If there's a buffer associated with this page, that needs to be
+                * split too. gistRelocateBuildBuffersOnSplit() will also adjust the
+                * downlinks in 'splitinfo', to make sure they're consistent not only
+                * with the tuples already on the pages, but also the tuples in the
+                * buffers that will eventually be inserted to them.
+                */
+               gistRelocateBuildBuffersOnSplit(gfbb,
+                                                                               &buildstate->giststate,
+                                                                               buildstate->indexrel,
+                                                                               path, buffer, splitinfo);
+
+               /* Create an array of all the downlink tuples */
+               ndownlinks = list_length(splitinfo);
+               downlinks = (IndexTuple *) palloc(sizeof(IndexTuple) * ndownlinks);
+               i = 0;
+               foreach(lc, splitinfo)
+               {
+                       GISTPageSplitInfo *splitinfo = lfirst(lc);
+
+                       /*
+                        * Since there's no concurrent access, we can release the lower
+                        * level buffers immediately. Don't release the buffer for the
+                        * original page, though, because the caller will release that.
+                        */
+                       if (splitinfo->buf != buffer)
+                               UnlockReleaseBuffer(splitinfo->buf);
+                       downlinks[i++] = splitinfo->downlink;
+               }
+
+               /* Insert them into parent. */
+               parentBuffer = ReadBuffer(buildstate->indexrel, path->parent->blkno);
+               LockBuffer(parentBuffer, GIST_EXCLUSIVE);
+               gistbufferinginserttuples(buildstate, parentBuffer,
+                                                                 downlinks, ndownlinks,
+                                                                 path->downlinkoffnum, path->parent);
+               UnlockReleaseBuffer(parentBuffer);
+
+               list_free_deep(splitinfo);              /* we don't need this anymore */
+       }
+}
+
+/*
+ * Re-find the parent of 'child' in a buffering index build.
+ *
+ * The remembered downlink position (child->downlinkoffnum on page
+ * child->parent->blkno) can be stale if the parent page was split after it
+ * was recorded. Because there is no concurrent activity during the build,
+ * a downlink can only have moved to a page reachable by following
+ * rightlinks from the remembered parent, so scanning right is sufficient.
+ * On success, child->downlinkoffnum (and possibly child->parent) are
+ * updated to the downlink's current location.
+ */
+static void
+gistBufferingFindCorrectParent(GISTBuildState *buildstate,
+                                                          GISTBufferingInsertStack *child)
+{
+       GISTBuildBuffers *gfbb = buildstate->gfbb;
+       Relation        indexrel = buildstate->indexrel;
+       GISTBufferingInsertStack *parent = child->parent;
+       OffsetNumber i,
+                               maxoff;
+       ItemId          iid;
+       IndexTuple      idxtuple;
+       Buffer          buffer;
+       Page            page;
+       bool            copied = false;
+
+       buffer = ReadBuffer(indexrel, parent->blkno);
+       page = BufferGetPage(buffer);
+       LockBuffer(buffer, GIST_EXCLUSIVE);
+       gistcheckpage(indexrel, buffer);
+
+       /* Fast path: check if the downlink is still where we remembered it */
+       if (child->downlinkoffnum != InvalidOffsetNumber &&
+               child->downlinkoffnum <= PageGetMaxOffsetNumber(page))
+       {
+               iid = PageGetItemId(page, child->downlinkoffnum);
+               idxtuple = (IndexTuple) PageGetItem(page, iid);
+               if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+               {
+                       /* Still there */
+                       UnlockReleaseBuffer(buffer);
+                       return;
+               }
+       }
+
+       /*
+        * The downlink has moved. Walk right along the rightlink chain of the
+        * parent's level until we find it again.
+        */
+       while (true)
+       {
+               /* Search for relevant downlink in the current page */
+               maxoff = PageGetMaxOffsetNumber(page);
+               for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+               {
+                       iid = PageGetItemId(page, i);
+                       idxtuple = (IndexTuple) PageGetItem(page, iid);
+                       if (ItemPointerGetBlockNumber(&(idxtuple->t_tid)) == child->blkno)
+                       {
+                               /* Found it; remember the downlink's new position */
+                               child->downlinkoffnum = i;
+                               UnlockReleaseBuffer(buffer);
+                               return;
+                       }
+               }
+
+               /*
+                * We are about to change parent->blkno to move right. Other path
+                * items can still reference the current parent item and must keep
+                * seeing the old block number, so before the first modification,
+                * make a private copy for 'child' (moving child's reference from
+                * the shared item to the copy).
+                */
+               if (!copied)
+               {
+                       parent = (GISTBufferingInsertStack *) MemoryContextAlloc(gfbb->context,
+                                                                                  sizeof(GISTBufferingInsertStack));
+                       memcpy(parent, child->parent, sizeof(GISTBufferingInsertStack));
+                       if (parent->parent)
+                               parent->parent->refCount++;
+                       gistDecreasePathRefcount(child->parent);
+                       child->parent = parent;
+                       parent->refCount = 1;
+                       copied = true;
+               }
+
+               /*
+                * Not found in current page. Move towards rightlink.
+                */
+               parent->blkno = GistPageGetOpaque(page)->rightlink;
+               UnlockReleaseBuffer(buffer);
+
+               if (parent->blkno == InvalidBlockNumber)
+               {
+                       /*
+                        * End of chain and still didn't find parent. Should not happen
+                        * during index build.
+                        */
+                       break;
+               }
+
+               /* Get the next page */
+               buffer = ReadBuffer(indexrel, parent->blkno);
+               page = BufferGetPage(buffer);
+               LockBuffer(buffer, GIST_EXCLUSIVE);
+               gistcheckpage(indexrel, buffer);
+       }
+
+       elog(ERROR, "failed to re-find parent for block %u", child->blkno);
+}
+
+/*
+ * Process the buffer emptying queue. Emptying one buffer pushes tuples into
+ * lower-level buffers, which can overflow and get queued in turn, so this
+ * function iterates until the cascading process has finished, i.e. until
+ * the emptying queue is empty.
+ */
+static void
+gistProcessEmptyingQueue(GISTBuildState *buildstate)
+{
+       GISTBuildBuffers *gfbb = buildstate->gfbb;
+
+       /* Iterate while we have elements in buffers emptying stack. */
+       while (gfbb->bufferEmptyingQueue != NIL)
+       {
+               GISTNodeBuffer *emptyingNodeBuffer;
+
+               /* Get node buffer from emptying stack. */
+               emptyingNodeBuffer = (GISTNodeBuffer *) linitial(gfbb->bufferEmptyingQueue);
+               gfbb->bufferEmptyingQueue = list_delete_first(gfbb->bufferEmptyingQueue);
+               emptyingNodeBuffer->queuedForEmptying = false;
+
+               /*
+                * Emptying this buffer loads the last pages of the buffers it
+                * flushes into, so unload any previously loaded buffers first.
+                */
+               gistUnloadNodeBuffers(gfbb);
+
+               /*
+                * Pop tuples from the buffer and run them down to the buffers at
+                * lower level, or leaf pages. We continue until one of the lower
+                * level buffers fills up, or this buffer runs empty.
+                *
+                * In Arge et al's paper, the buffer emptying is stopped after
+                * processing 1/2 node buffer worth of tuples, to avoid overfilling
+                * any of the lower level buffers. However, it's more efficient to
+                * keep going until one of the lower level buffers actually fills up,
+                * so that's what we do. This doesn't need to be exact, if a buffer
+                * overfills by a few tuples, there's no harm done.
+                */
+               while (true)
+               {
+                       IndexTuple      itup;
+
+                       /* Get next index tuple from the buffer */
+                       if (!gistPopItupFromNodeBuffer(gfbb, emptyingNodeBuffer, &itup))
+                               break;
+
+                       /*
+                        * Run it down to the underlying node buffer or leaf page.
+                        *
+                        * Note: it's possible that the buffer we're emptying splits as a
+                        * result of this call. If that happens, our emptyingNodeBuffer
+                        * points to the left half of the split. After split, it's very
+                        * likely that the new left buffer is no longer over the half-full
+                        * threshold, but we might as well keep flushing tuples from it
+                        * until we fill a lower-level buffer.
+                        */
+                       if (gistProcessItup(buildstate, itup, emptyingNodeBuffer->path))
+                       {
+                               /*
+                                * A lower level buffer filled up. Stop emptying this buffer,
+                                * to avoid overflowing the lower level buffer.
+                                */
+                               break;
+                       }
+
+                       /*
+                        * Free all the memory allocated during index tuple processing.
+                        * NOTE: this resets the caller's CurrentMemoryContext, so the
+                        * caller must invoke us in a context holding only short-lived
+                        * per-tuple data (the build's tmpCtx).
+                        */
+                       MemoryContextReset(CurrentMemoryContext);
+               }
+       }
+}
+
+/*
+ * Empty all node buffers, from top to bottom. This is done at the end of
+ * index build to flush all remaining tuples to the index.
+ *
+ * Note: This destroys the buffersOnLevels lists, so the buffers should not
+ * be inserted to after this call.
+ */
+static void
+gistEmptyAllBuffers(GISTBuildState *buildstate)
+{
+       GISTBuildBuffers *gfbb = buildstate->gfbb;
+       MemoryContext oldCtx;
+       int                     i;
+
+       /* Per-tuple work below runs in the short-lived build context. */
+       oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
+
+       /*
+        * Iterate through the levels from top to bottom.
+        */
+       for (i = gfbb->buffersOnLevelsLen - 1; i >= 0; i--)
+       {
+               /*
+                * Empty all buffers on this level. Note that new buffers can pop up
+                * in the list during the processing, as a result of page splits, so a
+                * simple walk through the list won't work. We remove buffers from the
+                * list when we see them empty; a buffer can't become non-empty once
+                * it's been fully emptied.
+                */
+               while (gfbb->buffersOnLevels[i] != NIL)
+               {
+                       GISTNodeBuffer *nodeBuffer;
+
+                       nodeBuffer = (GISTNodeBuffer *) linitial(gfbb->buffersOnLevels[i]);
+
+                       if (nodeBuffer->blocksCount != 0)
+                       {
+                               /*
+                                * Add this buffer to the emptying queue, and proceed to empty
+                                * the queue. (The queue's list cell must live in the
+                                * persistent context, hence the context switch around lcons.)
+                                */
+                               MemoryContextSwitchTo(gfbb->context);
+                               gfbb->bufferEmptyingQueue =
+                                       lcons(nodeBuffer, gfbb->bufferEmptyingQueue);
+                               MemoryContextSwitchTo(buildstate->tmpCtx);
+                               gistProcessEmptyingQueue(buildstate);
+                       }
+                       else
+                               /* Buffer is fully emptied; drop it from this level's list. */
+                               gfbb->buffersOnLevels[i] =
+                                       list_delete_first(gfbb->buffersOnLevels[i]);
+               }
+       }
+       MemoryContextSwitchTo(oldCtx);
+}
+
+/*
+ * Free the unreferenced prefix of a path stack, walking towards the root.
+ */
+static void
+gistFreeUnreferencedPath(GISTBufferingInsertStack *path)
+{
+       /*
+        * Free each unreferenced item and drop its reference on its parent.
+        * Stop as soon as we reach an item that is still referenced, or run
+        * off the top of the stack.
+        */
+       while (path->refCount == 0)
+       {
+               GISTBufferingInsertStack *next = path->parent;
+
+               pfree(path);
+               if (next == NULL)
+                       return;
+
+               next->refCount--;
+               path = next;
+       }
+}
+
+/*
+ * Release one reference to a path stack item. If the item (and, in turn,
+ * any of its ancestors) becomes unreferenced, it is freed.
+ */
+void
+gistDecreasePathRefcount(GISTBufferingInsertStack *path)
+{
+       --path->refCount;
+       gistFreeUnreferencedPath(path);
+}
+
+/*
+ * Get the depth of the GiST index, i.e. the number of levels above the
+ * leaves. A single-page (leaf-only) index has depth 0.
+ */
+static int
+gistGetMaxLevel(Relation index)
+{
+       int                     maxLevel = 0;
+       BlockNumber blkno = GIST_ROOT_BLKNO;
+
+       /*
+        * Descend from the root to the leaf level, counting the steps. A GiST
+        * tree is balanced, so any descent path gives the same depth; we simply
+        * follow the first downlink on each page.
+        */
+       for (;;)
+       {
+               Buffer          buffer;
+               Page            page;
+               IndexTuple      itup;
+
+               buffer = ReadBuffer(index, blkno);
+
+               /*
+                * No concurrent access is possible during index build, so the lock
+                * is just pro forma.
+                */
+               LockBuffer(buffer, GIST_SHARE);
+               page = (Page) BufferGetPage(buffer);
+
+               if (GistPageIsLeaf(page))
+               {
+                       /* Reached the bottom; the count is complete. */
+                       UnlockReleaseBuffer(buffer);
+                       return maxLevel;
+               }
+
+               /* Follow the first downlink one level down. */
+               itup = (IndexTuple) PageGetItem(page,
+                                                                        PageGetItemId(page, FirstOffsetNumber));
+               blkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+               UnlockReleaseBuffer(buffer);
+
+               /* One more internal level passed. */
+               maxLevel++;
+       }
+}
diff --git a/src/backend/access/gist/gistbuildbuffers.c b/src/backend/access/gist/gistbuildbuffers.c
new file mode 100644 (file)
index 0000000..1c11fb3
--- /dev/null
@@ -0,0 +1,787 @@
+/*-------------------------------------------------------------------------
+ *
+ * gistbuildbuffers.c
+ *       node buffer management functions for GiST buffering build algorithm.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       src/backend/access/gist/gistbuildbuffers.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/genam.h"
+#include "access/gist_private.h"
+#include "catalog/index.h"
+#include "miscadmin.h"
+#include "storage/buffile.h"
+#include "storage/bufmgr.h"
+#include "utils/memutils.h"
+#include "utils/rel.h"
+
+static GISTNodeBufferPage *gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb);
+static void gistAddLoadedBuffer(GISTBuildBuffers *gfbb,
+                                       GISTNodeBuffer *nodeBuffer);
+static void gistLoadNodeBuffer(GISTBuildBuffers *gfbb,
+                                  GISTNodeBuffer *nodeBuffer);
+static void gistUnloadNodeBuffer(GISTBuildBuffers *gfbb,
+                                        GISTNodeBuffer *nodeBuffer);
+static void gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer,
+                                       IndexTuple item);
+static void gistGetItupFromPage(GISTNodeBufferPage *pageBuffer,
+                                       IndexTuple *item);
+static long gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb);
+static void gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum);
+
+static void ReadTempFileBlock(BufFile *file, long blknum, void *ptr);
+static void WriteTempFileBlock(BufFile *file, long blknum, void *ptr);
+
+
+/*
+ * Initialize GiST build buffers.
+ *
+ * 'pagesPerBuffer' is the size of each node buffer in pages, 'levelStep'
+ * determines which tree levels get buffers, and 'maxLevel' is the current
+ * depth of the index, used to set up the root path item.
+ *
+ * All in-memory data structures are allocated in (and remembered via
+ * gfbb->context as) the caller's current memory context, and persist for
+ * the whole buffering build.
+ */
+GISTBuildBuffers *
+gistInitBuildBuffers(int pagesPerBuffer, int levelStep, int maxLevel)
+{
+       GISTBuildBuffers *gfbb;
+       HASHCTL         hashCtl;
+
+       gfbb = palloc(sizeof(GISTBuildBuffers));
+       gfbb->pagesPerBuffer = pagesPerBuffer;
+       gfbb->levelStep = levelStep;
+
+       /*
+        * Create a temporary file to hold buffer pages that are swapped out of
+        * memory.
+        */
+       gfbb->pfile = BufFileCreateTemp(true);
+       gfbb->nFileBlocks = 0;
+
+       /* Initialize free page management. */
+       gfbb->nFreeBlocks = 0;
+       gfbb->freeBlocksLen = 32;
+       gfbb->freeBlocks = (long *) palloc(gfbb->freeBlocksLen * sizeof(long));
+
+       /*
+        * Current memory context will be used for all in-memory data structures
+        * of buffers which are persistent during buffering build.
+        */
+       gfbb->context = CurrentMemoryContext;
+
+       /*
+        * The nodeBuffersTab hash associates an index block number with its
+        * node buffer. Zero the HASHCTL first, so that the fields we don't set
+        * explicitly don't contain stack garbage.
+        */
+       memset(&hashCtl, 0, sizeof(hashCtl));
+       hashCtl.keysize = sizeof(BlockNumber);
+       hashCtl.entrysize = sizeof(GISTNodeBuffer);
+       hashCtl.hcxt = CurrentMemoryContext;
+       hashCtl.hash = tag_hash;
+       hashCtl.match = memcmp;
+       gfbb->nodeBuffersTab = hash_create("gistbuildbuffers",
+                                                                          1024,
+                                                                          &hashCtl,
+                                                                          HASH_ELEM | HASH_CONTEXT
+                                                                          | HASH_FUNCTION | HASH_COMPARE);
+
+       gfbb->bufferEmptyingQueue = NIL;
+
+       /*
+        * Per-level node buffers lists for final buffers emptying process. Node
+        * buffers are inserted here when they are created.
+        */
+       gfbb->buffersOnLevelsLen = 1;
+       gfbb->buffersOnLevels = (List **) palloc(sizeof(List *) *
+                                                                                        gfbb->buffersOnLevelsLen);
+       gfbb->buffersOnLevels[0] = NIL;
+
+       /*
+        * Block numbers of node buffers whose last pages are currently loaded
+        * into main memory.
+        */
+       gfbb->loadedBuffersLen = 32;
+       gfbb->loadedBuffers = (GISTNodeBuffer **) palloc(gfbb->loadedBuffersLen *
+                                                                                                  sizeof(GISTNodeBuffer *));
+       gfbb->loadedBuffersCount = 0;
+
+       /*
+        * Root path item of the tree. Updated on each root node split.
+        */
+       gfbb->rootitem = (GISTBufferingInsertStack *) MemoryContextAlloc(
+                                                       gfbb->context, sizeof(GISTBufferingInsertStack));
+       gfbb->rootitem->parent = NULL;
+       gfbb->rootitem->blkno = GIST_ROOT_BLKNO;
+       gfbb->rootitem->downlinkoffnum = InvalidOffsetNumber;
+       gfbb->rootitem->level = maxLevel;
+       gfbb->rootitem->refCount = 1;
+
+       return gfbb;
+}
+
+/*
+ * Returns the node buffer for the given index block. The buffer is created
+ * if it doesn't exist yet.
+ *
+ * 'parent' is the path item of the block's parent page; it is used to build
+ * the path stack of a newly created buffer, and to refresh a stale parent
+ * pointer on an existing one.
+ */
+GISTNodeBuffer *
+gistGetNodeBuffer(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+                                 BlockNumber nodeBlocknum,
+                                 OffsetNumber downlinkoffnum,
+                                 GISTBufferingInsertStack *parent)
+{
+       GISTNodeBuffer *nodeBuffer;
+       bool            found;
+
+       /* Find node buffer in hash table */
+       nodeBuffer = (GISTNodeBuffer *) hash_search(gfbb->nodeBuffersTab,
+                                                                                               (const void *) &nodeBlocknum,
+                                                                                               HASH_ENTER,
+                                                                                               &found);
+       if (!found)
+       {
+               /*
+                * Node buffer wasn't found. Initialize the new buffer as empty.
+                * hash_search only fills in the key, so initialize every field
+                * explicitly, including pageBlocknum (no on-disk page yet).
+                */
+               GISTBufferingInsertStack *path;
+               int                     level;
+               MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+               nodeBuffer->pageBuffer = NULL;
+               nodeBuffer->pageBlocknum = InvalidBlockNumber;
+               nodeBuffer->blocksCount = 0;
+               nodeBuffer->queuedForEmptying = false;
+
+               /*
+                * Create a path stack for the page.
+                */
+               if (nodeBlocknum != GIST_ROOT_BLKNO)
+               {
+                       path = (GISTBufferingInsertStack *) palloc(
+                                                                                  sizeof(GISTBufferingInsertStack));
+                       path->parent = parent;
+                       path->blkno = nodeBlocknum;
+                       path->downlinkoffnum = downlinkoffnum;
+                       path->level = parent->level - 1;
+                       path->refCount = 0; /* initially unreferenced */
+                       parent->refCount++; /* this path references its parent */
+                       Assert(path->level > 0);
+               }
+               else
+                       path = gfbb->rootitem;
+
+               nodeBuffer->path = path;
+               path->refCount++;
+
+               /*
+                * Add this buffer to the list of buffers on this level. Enlarge
+                * buffersOnLevels array if needed.
+                */
+               level = path->level;
+               if (level >= gfbb->buffersOnLevelsLen)
+               {
+                       int                     i;
+
+                       gfbb->buffersOnLevels =
+                               (List **) repalloc(gfbb->buffersOnLevels,
+                                                                  (level + 1) * sizeof(List *));
+
+                       /* initialize the enlarged portion */
+                       for (i = gfbb->buffersOnLevelsLen; i <= level; i++)
+                               gfbb->buffersOnLevels[i] = NIL;
+                       gfbb->buffersOnLevelsLen = level + 1;
+               }
+
+               /*
+                * Prepend the new buffer to the list of buffers on this level. It's
+                * not arbitrary that the new buffer is put to the beginning of the
+                * list: in the final emptying phase we loop through all buffers at
+                * each level, and flush them. If a page is split during the emptying,
+                * it's more efficient to flush the new splitted pages first, before
+                * moving on to pre-existing pages on the level. The buffers just
+                * created during the page split are likely still in cache, so
+                * flushing them immediately is more efficient than putting them to
+                * the end of the queue.
+                */
+               gfbb->buffersOnLevels[level] = lcons(nodeBuffer,
+                                                                                        gfbb->buffersOnLevels[level]);
+
+               MemoryContextSwitchTo(oldcxt);
+       }
+       else
+       {
+               if (parent != nodeBuffer->path->parent)
+               {
+                       /*
+                        * A different parent path item was provided than we've
+                        * remembered. We trust caller to provide more correct parent than
+                        * we have. Previous parent may be outdated by page split.
+                        */
+                       gistDecreasePathRefcount(nodeBuffer->path->parent);
+                       nodeBuffer->path->parent = parent;
+                       parent->refCount++;
+               }
+       }
+
+       return nodeBuffer;
+}
+
+/*
+ * Allocate a fresh, empty buffer page in the persistent build context.
+ */
+static GISTNodeBufferPage *
+gistAllocateNewPageBuffer(GISTBuildBuffers *gfbb)
+{
+       GISTNodeBufferPage *bufpage;
+
+       bufpage = (GISTNodeBufferPage *) MemoryContextAlloc(gfbb->context,
+                                                                                                               BLCKSZ);
+
+       /* An empty page: no predecessor, whole data area free. */
+       bufpage->prev = InvalidBlockNumber;
+       PAGE_FREE_SPACE(bufpage) = BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
+
+       return bufpage;
+}
+
+/*
+ * Register a node buffer in the loadedBuffers array, so that
+ * gistUnloadNodeBuffers() knows to swap its in-memory page out later.
+ */
+static void
+gistAddLoadedBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+       /* Double the array whenever it fills up. */
+       if (gfbb->loadedBuffersCount >= gfbb->loadedBuffersLen)
+       {
+               gfbb->loadedBuffersLen *= 2;
+               gfbb->loadedBuffers = (GISTNodeBuffer **)
+                       repalloc(gfbb->loadedBuffers,
+                                        gfbb->loadedBuffersLen * sizeof(GISTNodeBuffer *));
+       }
+
+       gfbb->loadedBuffers[gfbb->loadedBuffersCount++] = nodeBuffer;
+}
+
+/*
+ * Make sure the last page of a node buffer is present in main memory,
+ * reading it back from the temporary file if necessary.
+ */
+static void
+gistLoadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+       /* Nothing to do if already loaded, or if the buffer is empty. */
+       if (nodeBuffer->pageBuffer != NULL || nodeBuffer->blocksCount <= 0)
+               return;
+
+       /* Allocate memory for the page and fetch it from the temp file. */
+       nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+       ReadTempFileBlock(gfbb->pfile, nodeBuffer->pageBlocknum,
+                                         nodeBuffer->pageBuffer);
+
+       /* The on-disk copy is no longer needed; recycle its file block. */
+       gistBuffersReleaseBlock(gfbb, nodeBuffer->pageBlocknum);
+       nodeBuffer->pageBlocknum = InvalidBlockNumber;
+
+       /* Remember that this buffer now has its page loaded in memory. */
+       gistAddLoadedBuffer(gfbb, nodeBuffer);
+}
+
+/*
+ * Swap the in-memory last page of a node buffer out to the temporary file.
+ */
+static void
+gistUnloadNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer)
+{
+       BlockNumber blkno;
+
+       /* Nothing to do if no page is currently loaded. */
+       if (nodeBuffer->pageBuffer == NULL)
+               return;
+
+       /* Pick a free block in the temp file and write the page there. */
+       blkno = gistBuffersGetFreeBlock(gfbb);
+       WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
+
+       /* Release the in-memory copy, remembering where it went on disk. */
+       pfree(nodeBuffer->pageBuffer);
+       nodeBuffer->pageBuffer = NULL;
+       nodeBuffer->pageBlocknum = blkno;
+}
+
+/*
+ * Swap out the in-memory last pages of all node buffers to the temp file.
+ */
+void
+gistUnloadNodeBuffers(GISTBuildBuffers *gfbb)
+{
+       int                     i;
+
+       /* Unload every buffer that currently has a page loaded in memory... */
+       for (i = 0; i < gfbb->loadedBuffersCount; i++)
+               gistUnloadNodeBuffer(gfbb, gfbb->loadedBuffers[i]);
+
+       /* ... and empty the loaded-buffers list. */
+       gfbb->loadedBuffersCount = 0;
+}
+
+/*
+ * Append an index tuple to a buffer page.
+ */
+static void
+gistPlaceItupToPage(GISTNodeBufferPage *pageBuffer, IndexTuple itup)
+{
+       Size            itupsz = IndexTupleSize(itup);
+       char       *dst;
+
+       /* Caller must have verified that the tuple fits. */
+       Assert(PAGE_FREE_SPACE(pageBuffer) >= MAXALIGN(itupsz));
+
+       /*
+        * Tuples are stacked from the end of the page towards the start:
+        * claim space by shrinking the free-space counter, then copy the
+        * tuple into the spot just reserved (the new end of free space).
+        */
+       PAGE_FREE_SPACE(pageBuffer) -= MAXALIGN(itupsz);
+       dst = (char *) pageBuffer + BUFFER_PAGE_DATA_OFFSET
+               + PAGE_FREE_SPACE(pageBuffer);
+       memcpy(dst, itup, itupsz);
+}
+
+/*
+ * Remove the most recently added index tuple from a buffer page, returning
+ * a palloc'd copy of it in *itup.
+ */
+static void
+gistGetItupFromPage(GISTNodeBufferPage *pageBuffer, IndexTuple *itup)
+{
+       IndexTuple      src;
+       Size            itupsz;
+
+       Assert(!PAGE_IS_EMPTY(pageBuffer)); /* Page shouldn't be empty */
+
+       /* The last-pushed tuple sits right at the free-space boundary. */
+       src = (IndexTuple) ((char *) pageBuffer
+                                               + BUFFER_PAGE_DATA_OFFSET
+                                               + PAGE_FREE_SPACE(pageBuffer));
+       itupsz = IndexTupleSize(src);
+
+       /* Hand back a copy of the tuple... */
+       *itup = (IndexTuple) palloc(itupsz);
+       memcpy(*itup, src, itupsz);
+
+       /* ... and reclaim the space it occupied on the page. */
+       PAGE_FREE_SPACE(pageBuffer) += MAXALIGN(itupsz);
+}
+
+/*
+ * Push an index tuple to a node buffer.
+ *
+ * If the buffer becomes more than half full as a result, it is added to
+ * the emptying queue (unless it's queued already).
+ */
+void
+gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
+                                                IndexTuple itup)
+{
+       /*
+        * Most part of memory operations will be in buffering build persistent
+        * context. So, let's switch to it.
+        */
+       MemoryContext oldcxt = MemoryContextSwitchTo(gfbb->context);
+
+       /*
+        * If the buffer is currently empty, create the first page.
+        */
+       if (nodeBuffer->blocksCount == 0)
+       {
+               nodeBuffer->pageBuffer = gistAllocateNewPageBuffer(gfbb);
+               nodeBuffer->blocksCount = 1;
+               gistAddLoadedBuffer(gfbb, nodeBuffer);
+       }
+
+       /* Load last page of node buffer if it wasn't in memory already */
+       if (!nodeBuffer->pageBuffer)
+               gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+       /*
+        * Check if there is enough space on the last page for the tuple.
+        */
+       if (PAGE_NO_SPACE(nodeBuffer->pageBuffer, itup))
+       {
+               /*
+                * Nope. Swap previous block to disk and allocate a new one.
+                */
+               BlockNumber blkno;
+
+               /* Write filled page to the disk */
+               blkno = gistBuffersGetFreeBlock(gfbb);
+               WriteTempFileBlock(gfbb->pfile, blkno, nodeBuffer->pageBuffer);
+
+               /*
+                * Reset the in-memory page as empty, and link the previous block to
+                * the new page by storing its block number in the prev-link. (Use
+                * BUFFER_PAGE_DATA_OFFSET for the initial free space, the same way
+                * gistAllocateNewPageBuffer does.)
+                */
+               PAGE_FREE_SPACE(nodeBuffer->pageBuffer) =
+                       BLCKSZ - BUFFER_PAGE_DATA_OFFSET;
+               nodeBuffer->pageBuffer->prev = blkno;
+
+               /* We've just added one more page */
+               nodeBuffer->blocksCount++;
+       }
+
+       gistPlaceItupToPage(nodeBuffer->pageBuffer, itup);
+
+       /*
+        * If the buffer is now more than half full, schedule it for emptying
+        * (unless it's in the queue already).
+        */
+       if (BUFFER_HALF_FILLED(nodeBuffer, gfbb) && !nodeBuffer->queuedForEmptying)
+       {
+               gfbb->bufferEmptyingQueue = lcons(nodeBuffer,
+                                                                                 gfbb->bufferEmptyingQueue);
+               nodeBuffer->queuedForEmptying = true;
+       }
+
+       /* Restore memory context */
+       MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * Remove one index tuple from a node buffer, returning a palloc'd copy in
+ * *itup. Returns true on success, or false if the node buffer is empty.
+ */
+bool
+gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb, GISTNodeBuffer *nodeBuffer,
+                                                 IndexTuple *itup)
+{
+       /*
+        * If node buffer is empty then return false.
+        */
+       if (nodeBuffer->blocksCount <= 0)
+               return false;
+
+       /* Load last page of node buffer if needed */
+       if (!nodeBuffer->pageBuffer)
+               gistLoadNodeBuffer(gfbb, nodeBuffer);
+
+       /*
+        * Get index tuple from last non-empty page.
+        */
+       gistGetItupFromPage(nodeBuffer->pageBuffer, itup);
+
+       /*
+        * If we just removed the last tuple from the page, fetch previous page on
+        * this node buffer (if any). Pages form a singly-linked chain via their
+        * prev-links, ending with InvalidBlockNumber on the first page.
+        */
+       if (PAGE_IS_EMPTY(nodeBuffer->pageBuffer))
+       {
+               BlockNumber prevblkno;
+
+               /*
+                * blocksCount includes the page in pageBuffer, so decrease it now.
+                */
+               nodeBuffer->blocksCount--;
+
+               /*
+                * If there's more pages, fetch previous one.
+                */
+               prevblkno = nodeBuffer->pageBuffer->prev;
+               if (prevblkno != InvalidBlockNumber)
+               {
+                       /* There is a previous page. Fetch it. */
+                       Assert(nodeBuffer->blocksCount > 0);
+                       ReadTempFileBlock(gfbb->pfile, prevblkno, nodeBuffer->pageBuffer);
+
+                       /*
+                        * Now that we've read the block in memory, we can release its
+                        * on-disk block for reuse.
+                        */
+                       gistBuffersReleaseBlock(gfbb, prevblkno);
+               }
+               else
+               {
+                       /* No more pages. Free memory. */
+                       Assert(nodeBuffer->blocksCount == 0);
+                       pfree(nodeBuffer->pageBuffer);
+                       nodeBuffer->pageBuffer = NULL;
+               }
+       }
+       return true;
+}
+
+/*
+ * Select a currently unused temp-file block for writing to.
+ */
+static long
+gistBuffersGetFreeBlock(GISTBuildBuffers *gfbb)
+{
+       /*
+        * Prefer re-using a recently released block (the last entry of
+        * freeBlocks[]); if none is available, extend the file by assigning
+        * the next block number past its current end.
+        */
+       if (gfbb->nFreeBlocks > 0)
+               return gfbb->freeBlocks[--gfbb->nFreeBlocks];
+
+       return gfbb->nFileBlocks++;
+}
+
+/*
+ * Return a temp-file block number to the freelist for later reuse.
+ */
+static void
+gistBuffersReleaseBlock(GISTBuildBuffers *gfbb, long blocknum)
+{
+       /* Double the freelist array whenever it fills up. */
+       if (gfbb->nFreeBlocks >= gfbb->freeBlocksLen)
+       {
+               gfbb->freeBlocksLen *= 2;
+               gfbb->freeBlocks = (long *) repalloc(gfbb->freeBlocks,
+                                                                                        gfbb->freeBlocksLen *
+                                                                                        sizeof(long));
+       }
+
+       /* Append the released block number. */
+       gfbb->freeBlocks[gfbb->nFreeBlocks++] = blocknum;
+}
+
+/*
+ * Free buffering build data structure.
+ */
+void
+gistFreeBuildBuffers(GISTBuildBuffers *gfbb)
+{
+       /* Close buffers file. */
+       BufFileClose(gfbb->pfile);
+
+       /* All other things will be freed on memory context release */
+}
+
+/*
+ * Data structure representing information about a node buffer, used when
+ * relocating index tuples from the buffer of a split node.
+ */
+typedef struct
+{
+       GISTENTRY       entry[INDEX_MAX_KEYS];
+       bool            isnull[INDEX_MAX_KEYS];
+       GISTPageSplitInfo *splitinfo;
+       GISTNodeBuffer *nodeBuffer;
+}      RelocationBufferInfo;
+
+/*
+ * At page split, distribute tuples from the buffer of the split page to
+ * new buffers for the created page halves. This also adjusts the downlinks
+ * in 'splitinfo' to include the tuples in the buffers.
+ */
+void
+gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb, GISTSTATE *giststate,
+                                                               Relation r, GISTBufferingInsertStack *path,
+                                                               Buffer buffer, List *splitinfo)
+{
+       RelocationBufferInfo *relocationBuffersInfos;
+       bool            found;
+       GISTNodeBuffer *nodeBuffer;
+       BlockNumber blocknum;
+       IndexTuple      itup;
+       int                     splitPagesCount = 0,
+                               i;
+       GISTENTRY       entry[INDEX_MAX_KEYS];
+       bool            isnull[INDEX_MAX_KEYS];
+       GISTNodeBuffer nodebuf;
+       ListCell   *lc;
+
+       /* If the split page doesn't have buffers, we have nothing to do. */
+       if (!LEVEL_HAS_BUFFERS(path->level, gfbb))
+               return;
+
+       /*
+        * Get the node buffer of the split page.
+        */
+       blocknum = BufferGetBlockNumber(buffer);
+       nodeBuffer = hash_search(gfbb->nodeBuffersTab, &blocknum,
+                                                        HASH_FIND, &found);
+       if (!found)
+       {
+               /*
+                * Node buffer should exist at this point. If it didn't exist before,
+                * the insertion that caused the page to split should've created it.
+                */
+               elog(ERROR, "node buffer of page being split (%u) does not exist",
+                        blocknum);
+       }
+
+       /*
+        * Make a copy of the old buffer, as we're going to reuse it as the
+        * buffer for the new left page, which is on the same block as the old
+        * page. That's not true for the root page, but that's fine because we
+        * never have a buffer on the root page anyway. The original algorithm
+        * by Arge et al did have one, but it's of no use: you might as well
+        * read the tuples straight from the heap instead of the root buffer.
+        */
+       Assert(blocknum != GIST_ROOT_BLKNO);
+       memcpy(&nodebuf, nodeBuffer, sizeof(GISTNodeBuffer));
+
+       /* Reset the old buffer, used for the new left page from now on */
+       nodeBuffer->blocksCount = 0;
+       nodeBuffer->pageBuffer = NULL;
+       nodeBuffer->pageBlocknum = InvalidBlockNumber;
+
+       /* Reassign pointer to the saved copy. */
+       nodeBuffer = &nodebuf;
+
+       /*
+        * Allocate memory for information about relocation buffers.
+        */
+       splitPagesCount = list_length(splitinfo);
+       relocationBuffersInfos =
+               (RelocationBufferInfo *) palloc(sizeof(RelocationBufferInfo) *
+                                                                               splitPagesCount);
+
+       /*
+        * Fill relocation buffers information for node buffers of pages produced
+        * by split.
+        */
+       i = 0;
+       foreach(lc, splitinfo)
+       {
+               GISTPageSplitInfo *si = (GISTPageSplitInfo *) lfirst(lc);
+               GISTNodeBuffer *newNodeBuffer;
+
+               /* Decompress parent index tuple of node buffer page. */
+               gistDeCompressAtt(giststate, r,
+                                                 si->downlink, NULL, (OffsetNumber) 0,
+                                                 relocationBuffersInfos[i].entry,
+                                                 relocationBuffersInfos[i].isnull);
+
+               /*
+                * Create a node buffer for the page. The leftmost half is on the same
+                * block as the old page before split, so for the leftmost half this
+                * will return the original buffer, which was emptied earlier in this
+                * function.
+                */
+               newNodeBuffer = gistGetNodeBuffer(gfbb,
+                                                                                 giststate,
+                                                                                 BufferGetBlockNumber(si->buf),
+                                                                                 path->downlinkoffnum,
+                                                                                 path->parent);
+
+               relocationBuffersInfos[i].nodeBuffer = newNodeBuffer;
+               relocationBuffersInfos[i].splitinfo = si;
+
+               i++;
+       }
+
+       /*
+        * Loop through all index tuples in the buffer of the split page,
+        * moving them to buffers on the new pages.
+        */
+       while (gistPopItupFromNodeBuffer(gfbb, nodeBuffer, &itup))
+       {
+               float           sum_grow,
+                                       which_grow[INDEX_MAX_KEYS];
+               int                     i,
+                                       which;
+               IndexTuple      newtup;
+               RelocationBufferInfo *targetBufferInfo;
+
+               /*
+                * Choose which page this tuple should go to.
+                */
+               gistDeCompressAtt(giststate, r,
+                                                 itup, NULL, (OffsetNumber) 0, entry, isnull);
+
+               which = -1;
+               *which_grow = -1.0f;
+               sum_grow = 1.0f;
+
+               for (i = 0; i < splitPagesCount && sum_grow; i++)
+               {
+                       int                     j;
+                       RelocationBufferInfo *splitPageInfo = &relocationBuffersInfos[i];
+
+                       sum_grow = 0.0f;
+                       for (j = 0; j < r->rd_att->natts; j++)
+                       {
+                               float           usize;
+
+                               usize = gistpenalty(giststate, j,
+                                                                       &splitPageInfo->entry[j],
+                                                                       splitPageInfo->isnull[j],
+                                                                       &entry[j], isnull[j]);
+
+                               if (which_grow[j] < 0 || usize < which_grow[j])
+                               {
+                                       which = i;
+                                       which_grow[j] = usize;
+                                       if (j < r->rd_att->natts - 1 && i == 0)
+                                               which_grow[j + 1] = -1;
+                                       sum_grow += which_grow[j];
+                               }
+                               else if (which_grow[j] == usize)
+                                       sum_grow += usize;
+                               else
+                               {
+                                       sum_grow = 1;
+                                       break;
+                               }
+                       }
+               }
+               targetBufferInfo = &relocationBuffersInfos[which];
+
+               /* Push item to selected node buffer */
+               gistPushItupToNodeBuffer(gfbb, targetBufferInfo->nodeBuffer, itup);
+
+               /* Adjust the downlink for this page, if needed. */
+               newtup = gistgetadjusted(r, targetBufferInfo->splitinfo->downlink,
+                                                                itup, giststate);
+               if (newtup)
+               {
+                       gistDeCompressAtt(giststate, r,
+                                                         newtup, NULL, (OffsetNumber) 0,
+                                                         targetBufferInfo->entry,
+                                                         targetBufferInfo->isnull);
+
+                       targetBufferInfo->splitinfo->downlink = newtup;
+               }
+       }
+
+       pfree(relocationBuffersInfos);
+}
+
+
+/*
+ * Wrappers around BufFile operations. The main difference is that these
+ * wrappers report errors with ereport(), so that the callers don't need
+ * to check the return code.
+ */
+
+static void
+ReadTempFileBlock(BufFile *file, long blknum, void *ptr)
+{
+       if (BufFileSeekBlock(file, blknum) != 0)
+               elog(ERROR, "could not seek temporary file: %m");
+       if (BufFileRead(file, ptr, BLCKSZ) != BLCKSZ)
+               elog(ERROR, "could not read temporary file: %m");
+}
+
+static void
+WriteTempFileBlock(BufFile *file, long blknum, void *ptr)
+{
+       if (BufFileSeekBlock(file, blknum) != 0)
+               elog(ERROR, "could not seek temporary file: %m");
+       if (BufFileWrite(file, ptr, BLCKSZ) != BLCKSZ)
+       {
+               /*
+                * the other errors in Read/WriteTempFileBlock shouldn't happen, but
+                * an error at write can easily happen if you run out of disk space.
+                */
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not write block %ld of temporary file: %m",
+                                               blknum)));
+       }
+}
index 448d8bce05c2299d1503c8e7e1df858cbd35e426..d91025dbe7ff18ae0edd4c573eb3bc5676b77796 100644 (file)
@@ -667,13 +667,30 @@ gistoptions(PG_FUNCTION_ARGS)
 {
        Datum           reloptions = PG_GETARG_DATUM(0);
        bool            validate = PG_GETARG_BOOL(1);
-       bytea      *result;
+       relopt_value *options;
+       GiSTOptions *rdopts;
+       int                     numoptions;
+       static const relopt_parse_elt tab[] = {
+               {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
+               {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
+       };
 
-       result = default_reloptions(reloptions, validate, RELOPT_KIND_GIST);
+       options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
+                                                         &numoptions);
+
+       /* if none set, we're done */
+       if (numoptions == 0)
+               PG_RETURN_NULL();
+
+       rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
+
+       fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
+                                  validate, tab, lengthof(tab));
+
+       pfree(options);
+
+       PG_RETURN_BYTEA_P(rdopts);
 
-       if (result)
-               PG_RETURN_BYTEA_P(result);
-       PG_RETURN_NULL();
 }
 
 /*
index 09b1d489928793671200e93c3c3f6d3b00ced80c..8c326462b1de7c96db54016c23b49ae7ce4bd323 100644 (file)
@@ -263,7 +263,8 @@ gistRedoPageSplitRecord(XLogRecPtr lsn, XLogRecord *record)
                        else
                                GistPageGetOpaque(page)->rightlink = xldata->origrlink;
                        GistPageGetOpaque(page)->nsn = xldata->orignsn;
-                       if (i < xlrec.data->npage - 1 && !isrootsplit)
+                       if (i < xlrec.data->npage - 1 && !isrootsplit &&
+                               xldata->markfollowright)
                                GistMarkFollowRight(page);
                        else
                                GistClearFollowRight(page);
@@ -411,7 +412,7 @@ XLogRecPtr
 gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
                          SplitedPageLayout *dist,
                          BlockNumber origrlink, GistNSN orignsn,
-                         Buffer leftchildbuf)
+                         Buffer leftchildbuf, bool markfollowright)
 {
        XLogRecData *rdata;
        gistxlogPageSplit xlrec;
@@ -433,6 +434,7 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
        xlrec.npage = (uint16) npage;
        xlrec.leftchild =
                BufferIsValid(leftchildbuf) ? BufferGetBlockNumber(leftchildbuf) : InvalidBlockNumber;
+       xlrec.markfollowright = markfollowright;
 
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = sizeof(gistxlogPageSplit);
index 9fb20a6b6cdb9e3065fab9083772c6df00a2fda0..6ce2c7568de867e90d2a365228c104446a3544ed 100644 (file)
 #include "access/gist.h"
 #include "access/itup.h"
 #include "storage/bufmgr.h"
+#include "storage/buffile.h"
 #include "utils/rbtree.h"
+#include "utils/hsearch.h"
 
 /* Buffer lock modes */
 #define GIST_SHARE     BUFFER_LOCK_SHARE
 #define GIST_EXCLUSIVE BUFFER_LOCK_EXCLUSIVE
 #define GIST_UNLOCK BUFFER_LOCK_UNLOCK
 
+typedef struct
+{
+       BlockNumber prev;
+       uint32          freespace;
+       char            tupledata[1];
+} GISTNodeBufferPage;
+
+#define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
+/* Returns free space in node buffer page */
+#define PAGE_FREE_SPACE(nbp) (nbp->freespace)
+/* Checks if node buffer page is empty */
+#define PAGE_IS_EMPTY(nbp) (nbp->freespace == BLCKSZ - BUFFER_PAGE_DATA_OFFSET)
+/* Checks if a node buffer page doesn't have enough space for an index tuple */
+#define PAGE_NO_SPACE(nbp, itup) (PAGE_FREE_SPACE(nbp) < \
+                                                                               MAXALIGN(IndexTupleSize(itup)))
+
 /*
  * GISTSTATE: information needed for any GiST index operation
  *
@@ -170,6 +188,7 @@ typedef struct gistxlogPageSplit
 
        BlockNumber leftchild;          /* like in gistxlogPageUpdate */
        uint16          npage;                  /* # of pages in the split */
+       bool            markfollowright;        /* set F_FOLLOW_RIGHT flags */
 
        /*
         * follow: 1. gistxlogPage and array of IndexTupleData per page
@@ -279,13 +298,149 @@ typedef struct
 #define  GistTupleIsInvalid(itup)      ( ItemPointerGetOffsetNumber( &((itup)->t_tid) ) == TUPLE_IS_INVALID )
 #define  GistTupleSetValid(itup)       ItemPointerSetOffsetNumber( &((itup)->t_tid), TUPLE_IS_VALID )
 
+
+
+
+/*
+ * A buffer attached to an internal node, used when building an index in
+ * buffering mode.
+ */
+typedef struct
+{
+       BlockNumber nodeBlocknum;       /* index block # this buffer is for */
+       int32           blocksCount;    /* current # of blocks occupied by buffer */
+
+       BlockNumber pageBlocknum;       /* temporary file block # */
+       GISTNodeBufferPage *pageBuffer;         /* in-memory buffer page */
+
+       /* is this buffer queued for emptying? */
+       bool            queuedForEmptying;
+
+       struct GISTBufferingInsertStack *path;
+} GISTNodeBuffer;
+
+/*
+ * Does specified level have buffers? (Beware of multiple evaluation of
+ * arguments.)
+ */
+#define LEVEL_HAS_BUFFERS(nlevel, gfbb) \
+       ((nlevel) != 0 && (nlevel) % (gfbb)->levelStep == 0 && \
+        (nlevel) != (gfbb)->rootitem->level)
+
+/* Is specified buffer at least half-filled (should be queued for emptying)? */
+#define BUFFER_HALF_FILLED(nodeBuffer, gfbb) \
+       ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer / 2)
+
+/*
+ * Is specified buffer full? Our buffers can actually grow indefinitely,
+ * beyond the "maximum" size, so this just means whether the buffer has grown
+ * beyond the nominal maximum size.
+ */
+#define BUFFER_OVERFLOWED(nodeBuffer, gfbb) \
+       ((nodeBuffer)->blocksCount > (gfbb)->pagesPerBuffer)
+
+/*
+ * Extended GISTInsertStack for buffering GiST index build.
+ */
+typedef struct GISTBufferingInsertStack
+{
+       /* current page */
+       BlockNumber blkno;
+
+       /* offset of the downlink in the parent page, that points to this page */
+       OffsetNumber downlinkoffnum;
+
+       /* pointer to parent */
+       struct GISTBufferingInsertStack *parent;
+
+       int                     refCount;
+
+       /* level number */
+       int                     level;
+} GISTBufferingInsertStack;
+
+/*
+ * Data structure with general information about build buffers.
+ */
+typedef struct GISTBuildBuffers
+{
+       /* Persistent memory context for the buffers and metadata. */
+       MemoryContext context;
+
+       BufFile    *pfile;                      /* Temporary file to store buffers in */
+       long            nFileBlocks;    /* Current size of the temporary file */
+
+       /*
+        * resizable array of free blocks.
+        */
+       long       *freeBlocks;
+       int                     nFreeBlocks;    /* # of currently free blocks in the array */
+       int                     freeBlocksLen;  /* current allocated length of the array */
+
+       /* Hash for buffers by block number */
+       HTAB       *nodeBuffersTab;
+
+       /* List of buffers scheduled for emptying */
+       List       *bufferEmptyingQueue;
+
+       /*
+        * Parameters to the buffering build algorithm. levelStep determines which
+        * levels in the tree have buffers, and pagesPerBuffer determines how
+        * large each buffer is.
+        */
+       int                     levelStep;
+       int                     pagesPerBuffer;
+
+       /* Array of lists of buffers on each level, for final emptying */
+       List      **buffersOnLevels;
+       int                     buffersOnLevelsLen;
+
+       /*
+        * Dynamically-sized array of buffers that currently have their last page
+        * loaded in main memory.
+        */
+       GISTNodeBuffer **loadedBuffers;
+       int                     loadedBuffersCount;             /* # of entries in loadedBuffers */
+       int                     loadedBuffersLen;               /* allocated size of loadedBuffers */
+
+       /* A path item that points to the current root node */
+       GISTBufferingInsertStack *rootitem;
+} GISTBuildBuffers;
+
+/*
+ * Storage type for GiST's reloptions
+ */
+typedef struct GiSTOptions
+{
+       int32           vl_len_;                /* varlena header (do not touch directly!) */
+       int                     fillfactor;             /* page fill factor in percent (0..100) */
+       int                     bufferingModeOffset;    /* use buffering build? */
+}      GiSTOptions;
+
 /* gist.c */
-extern Datum gistbuild(PG_FUNCTION_ARGS);
 extern Datum gistbuildempty(PG_FUNCTION_ARGS);
 extern Datum gistinsert(PG_FUNCTION_ARGS);
 extern MemoryContext createTempGistContext(void);
 extern void initGISTstate(GISTSTATE *giststate, Relation index);
 extern void freeGISTstate(GISTSTATE *giststate);
+extern void gistdoinsert(Relation r,
+                        IndexTuple itup,
+                        Size freespace,
+                        GISTSTATE *GISTstate);
+
+/* A List of these is returned from gistplacetopage() in *splitinfo */
+typedef struct
+{
+       Buffer          buf;                    /* the split page "half" */
+       IndexTuple      downlink;               /* downlink for this half. */
+} GISTPageSplitInfo;
+
+extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
+                               Buffer buffer,
+                               IndexTuple *itup, int ntup, OffsetNumber oldoffnum,
+                               Buffer leftchildbuf,
+                               List **splitinfo,
+                               bool markleftchild);
 
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
                  int len, GISTSTATE *giststate);
@@ -305,7 +460,7 @@ extern XLogRecPtr gistXLogSplit(RelFileNode node,
                          BlockNumber blkno, bool page_is_leaf,
                          SplitedPageLayout *dist,
                          BlockNumber origrlink, GistNSN oldnsn,
-                         Buffer leftchild);
+                         Buffer leftchild, bool markfollowright);
 
 /* gistget.c */
 extern Datum gistgettuple(PG_FUNCTION_ARGS);
@@ -380,4 +535,27 @@ extern void gistSplitByKey(Relation r, Page page, IndexTuple *itup,
                           GistSplitVector *v, GistEntryVector *entryvec,
                           int attno);
 
+/* gistbuild.c */
+extern Datum gistbuild(PG_FUNCTION_ARGS);
+extern void gistValidateBufferingOption(char *value);
+extern void gistDecreasePathRefcount(GISTBufferingInsertStack *path);
+
+/* gistbuildbuffers.c */
+extern GISTBuildBuffers *gistInitBuildBuffers(int pagesPerBuffer, int levelStep,
+                                        int maxLevel);
+extern GISTNodeBuffer *gistGetNodeBuffer(GISTBuildBuffers *gfbb,
+                                 GISTSTATE *giststate,
+                                 BlockNumber blkno, OffsetNumber downlinkoffnum,
+                                 GISTBufferingInsertStack *parent);
+extern void gistPushItupToNodeBuffer(GISTBuildBuffers *gfbb,
+                                                GISTNodeBuffer *nodeBuffer, IndexTuple item);
+extern bool gistPopItupFromNodeBuffer(GISTBuildBuffers *gfbb,
+                                                 GISTNodeBuffer *nodeBuffer, IndexTuple *item);
+extern void gistFreeBuildBuffers(GISTBuildBuffers *gfbb);
+extern void gistRelocateBuildBuffersOnSplit(GISTBuildBuffers *gfbb,
+                                                               GISTSTATE *giststate, Relation r,
+                                                               GISTBufferingInsertStack *path, Buffer buffer,
+                                                               List *splitinfo);
+extern void gistUnloadNodeBuffers(GISTBuildBuffers *gfbb);
+
 #endif   /* GIST_PRIVATE_H */