Revise the TIDBitmap API to support multiple concurrent iterations over a
author: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 10 Jan 2009 21:08:36 +0000 (21:08 +0000)
committer: Tom Lane <tgl@sss.pgh.pa.us>
Sat, 10 Jan 2009 21:08:36 +0000 (21:08 +0000)
bitmap.  This is extracted from Greg Stark's posix_fadvise patch; it seems
worth committing separately, since it's potentially useful independently of
posix_fadvise.

src/backend/access/gin/ginget.c
src/backend/access/gin/ginscan.c
src/backend/executor/nodeBitmapHeapscan.c
src/backend/nodes/tidbitmap.c
src/include/access/gin.h
src/include/nodes/execnodes.h
src/include/nodes/tidbitmap.h

index d4559815340774e607479635c0298c5a02a02694..182981498c10ddb1c437934d4da5a4b0d4f5849d 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *         $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.21 2009/01/01 17:23:34 momjian Exp $
+ *         $PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.22 2009/01/10 21:08:36 tgl Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -290,6 +290,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
    entry->list = NULL;
    entry->nlist = 0;
    entry->partialMatch = NULL;
+   entry->partialMatchIterator = NULL;
    entry->partialMatchResult = NULL;
    entry->reduceResult = FALSE;
    entry->predictNumberResult = 0;
@@ -311,6 +312,9 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
             */
            if ( entry->partialMatch )
            {
+               if (entry->partialMatchIterator)
+                   tbm_end_iterate(entry->partialMatchIterator);
+               entry->partialMatchIterator = NULL;
                tbm_free( entry->partialMatch );
                entry->partialMatch = NULL;
            }
@@ -323,7 +327,7 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
 
        if ( entry->partialMatch && !tbm_is_empty(entry->partialMatch) )
        {
-           tbm_begin_iterate(entry->partialMatch);
+           entry->partialMatchIterator = tbm_begin_iterate(entry->partialMatch);
            entry->isFinished = FALSE;
        }
    }
@@ -534,11 +538,13 @@ entryGetItem(Relation index, GinScanEntry entry)
        {
            if ( entry->partialMatchResult == NULL || entry->offset >= entry->partialMatchResult->ntuples )
            {
-               entry->partialMatchResult = tbm_iterate( entry->partialMatch );
+               entry->partialMatchResult = tbm_iterate( entry->partialMatchIterator );
 
                if ( entry->partialMatchResult == NULL )
                {
                    ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
+                   tbm_end_iterate(entry->partialMatchIterator);
+                   entry->partialMatchIterator = NULL;
                    entry->isFinished = TRUE;
                    break;
                }
index 9c122cb526f06e49d77b2d746afa6e32e753cbf7..ba3774192229fd3ae3f1f638f30c9bc90dd115af 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *         $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.20 2009/01/01 17:23:34 momjian Exp $
+ *         $PostgreSQL: pgsql/src/backend/access/gin/ginscan.c,v 1.21 2009/01/10 21:08:36 tgl Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -61,6 +61,8 @@ fillScanKey(GinState *ginstate, GinScanKey key, OffsetNumber attnum, Datum query
        key->scanEntry[i].offset = InvalidOffsetNumber;
        key->scanEntry[i].buffer = InvalidBuffer;
        key->scanEntry[i].partialMatch = NULL;
+       key->scanEntry[i].partialMatchIterator = NULL;
+       key->scanEntry[i].partialMatchResult = NULL;
        key->scanEntry[i].strategy = strategy;
        key->scanEntry[i].list = NULL;
        key->scanEntry[i].nlist = 0;
@@ -107,6 +109,7 @@ resetScanKeys(GinScanKey keys, uint32 nkeys)
            key->scanEntry[j].list = NULL;
            key->scanEntry[j].nlist = 0;
            key->scanEntry[j].partialMatch = NULL;
+           key->scanEntry[j].partialMatchIterator = NULL;
            key->scanEntry[j].partialMatchResult = NULL;
        }
    }
@@ -132,6 +135,8 @@ freeScanKeys(GinScanKey keys, uint32 nkeys)
                ReleaseBuffer(key->scanEntry[j].buffer);
            if (key->scanEntry[j].list)
                pfree(key->scanEntry[j].list);
+           if (key->scanEntry[j].partialMatchIterator)
+               tbm_end_iterate(key->scanEntry[j].partialMatchIterator);
            if (key->scanEntry[j].partialMatch)
                tbm_free(key->scanEntry[j].partialMatch);
        }
index a74efe686cf7f9becb69876f078a075a23d72801..880b9c9590eae67d2b1e5fa55cf2acf6c7e7a783 100644 (file)
@@ -21,7 +21,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.31 2009/01/01 17:23:41 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.32 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,6 +65,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
    HeapScanDesc scan;
    Index       scanrelid;
    TIDBitmap  *tbm;
+   TBMIterator *tbmiterator;
    TBMIterateResult *tbmres;
    OffsetNumber targoffset;
    TupleTableSlot *slot;
@@ -78,6 +79,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
    scan = node->ss.ss_currentScanDesc;
    scanrelid = ((BitmapHeapScan *) node->ss.ps.plan)->scan.scanrelid;
    tbm = node->tbm;
+   tbmiterator = node->tbmiterator;
    tbmres = node->tbmres;
 
    /*
@@ -111,7 +113,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
    /*
     * If we haven't yet performed the underlying index scan, do it, and
-    * prepare the bitmap to be iterated over.
+    * begin the iteration over the bitmap.
     */
    if (tbm == NULL)
    {
@@ -121,9 +123,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
            elog(ERROR, "unrecognized result from subplan");
 
        node->tbm = tbm;
+       node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
        node->tbmres = tbmres = NULL;
-
-       tbm_begin_iterate(tbm);
    }
 
    for (;;)
@@ -136,7 +137,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
         */
        if (tbmres == NULL)
        {
-           node->tbmres = tbmres = tbm_iterate(tbm);
+           node->tbmres = tbmres = tbm_iterate(tbmiterator);
            if (tbmres == NULL)
            {
                /* no more entries in the bitmap */
@@ -376,9 +377,12 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt)
    /* rescan to release any page pin */
    heap_rescan(node->ss.ss_currentScanDesc, NULL);
 
+   if (node->tbmiterator)
+       tbm_end_iterate(node->tbmiterator);
    if (node->tbm)
        tbm_free(node->tbm);
    node->tbm = NULL;
+   node->tbmiterator = NULL;
    node->tbmres = NULL;
 
    /*
@@ -423,6 +427,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
    /*
     * release bitmap if any
     */
+   if (node->tbmiterator)
+       tbm_end_iterate(node->tbmiterator);
    if (node->tbm)
        tbm_free(node->tbm);
 
@@ -466,6 +472,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
    scanstate->ss.ps.state = estate;
 
    scanstate->tbm = NULL;
+   scanstate->tbmiterator = NULL;
    scanstate->tbmres = NULL;
 
    /*
index 54acf18fbf2e7ebc36b87ca4c98a571204584438..e214bbb763403b7d65ab5286871c99686b08191c 100644 (file)
@@ -32,7 +32,7 @@
  * Copyright (c) 2003-2009, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.16 2009/01/01 17:23:43 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.17 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -136,9 +136,20 @@ struct TIDBitmap
    int         nchunks;        /* number of lossy entries in pagetable */
    bool        iterating;      /* tbm_begin_iterate called? */
    PagetableEntry entry1;      /* used when status == TBM_ONE_PAGE */
-   /* the remaining fields are used while producing sorted output: */
+   /* these are valid when iterating is true: */
    PagetableEntry **spages;    /* sorted exact-page list, or NULL */
    PagetableEntry **schunks;   /* sorted lossy-chunk list, or NULL */
+};
+
+/*
+ * When iterating over a bitmap in sorted order, a TBMIterator is used to
+ * track our progress.  There can be several iterators scanning the same
+ * bitmap concurrently.  Note that the bitmap becomes read-only as soon as
+ * any iterator is created.
+ */
+struct TBMIterator
+{
+   TIDBitmap  *tbm;            /* TIDBitmap we're iterating over */
    int         spageptr;       /* next spages index */
    int         schunkptr;      /* next schunks index */
    int         schunkbit;      /* next bit to check in current schunk */
@@ -172,16 +183,9 @@ tbm_create(long maxbytes)
    TIDBitmap  *tbm;
    long        nbuckets;
 
-   /*
-    * Create the TIDBitmap struct, with enough trailing space to serve the
-    * needs of the TBMIterateResult sub-struct.
-    */
-   tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
-                              MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
-   /* Zero all the fixed fields */
-   MemSetAligned(tbm, 0, sizeof(TIDBitmap));
+   /* Create the TIDBitmap struct and zero all its fields */
+   tbm = makeNode(TIDBitmap);
 
-   tbm->type = T_TIDBitmap;    /* Set NodeTag */
    tbm->mcxt = CurrentMemoryContext;
    tbm->status = TBM_EMPTY;
 
@@ -533,60 +537,80 @@ tbm_is_empty(const TIDBitmap *tbm)
 /*
  * tbm_begin_iterate - prepare to iterate through a TIDBitmap
  *
+ * The TBMIterator struct is created in the caller's memory context.
+ * For a clean shutdown of the iteration, call tbm_end_iterate; but it's
+ * okay to just allow the memory context to be released, too.  It is caller's
+ * responsibility not to touch the TBMIterator anymore once the TIDBitmap
+ * is freed.
+ *
  * NB: after this is called, it is no longer allowed to modify the contents
  * of the bitmap.  However, you can call this multiple times to scan the
- * contents repeatedly.
+ * contents repeatedly, including parallel scans.
  */
-void
+TBMIterator *
 tbm_begin_iterate(TIDBitmap *tbm)
 {
-   HASH_SEQ_STATUS status;
-   PagetableEntry *page;
-   int         npages;
-   int         nchunks;
-
-   tbm->iterating = true;
+   TBMIterator *iterator;
 
    /*
-    * Reset iteration pointers.
+    * Create the TBMIterator struct, with enough trailing space to serve the
+    * needs of the TBMIterateResult sub-struct.
     */
-   tbm->spageptr = 0;
-   tbm->schunkptr = 0;
-   tbm->schunkbit = 0;
+   iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
+                                     MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+   iterator->tbm = tbm;
 
    /*
-    * Nothing else to do if no entries, nor if we don't have a hashtable.
+    * Initialize iteration pointers.
     */
-   if (tbm->nentries == 0 || tbm->status != TBM_HASH)
-       return;
+   iterator->spageptr = 0;
+   iterator->schunkptr = 0;
+   iterator->schunkbit = 0;
 
    /*
-    * Create and fill the sorted page lists if we didn't already.
+    * If we have a hashtable, create and fill the sorted page lists,
+    * unless we already did that for a previous iterator.  Note that the
+    * lists are attached to the bitmap not the iterator, so they can be
+    * used by more than one iterator.
     */
-   if (!tbm->spages && tbm->npages > 0)
-       tbm->spages = (PagetableEntry **)
-           MemoryContextAlloc(tbm->mcxt,
-                              tbm->npages * sizeof(PagetableEntry *));
-   if (!tbm->schunks && tbm->nchunks > 0)
-       tbm->schunks = (PagetableEntry **)
-           MemoryContextAlloc(tbm->mcxt,
-                              tbm->nchunks * sizeof(PagetableEntry *));
-
-   hash_seq_init(&status, tbm->pagetable);
-   npages = nchunks = 0;
-   while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+   if (tbm->status == TBM_HASH && !tbm->iterating)
    {
-       if (page->ischunk)
-           tbm->schunks[nchunks++] = page;
-       else
-           tbm->spages[npages++] = page;
+       HASH_SEQ_STATUS status;
+       PagetableEntry *page;
+       int         npages;
+       int         nchunks;
+
+       if (!tbm->spages && tbm->npages > 0)
+           tbm->spages = (PagetableEntry **)
+               MemoryContextAlloc(tbm->mcxt,
+                                  tbm->npages * sizeof(PagetableEntry *));
+       if (!tbm->schunks && tbm->nchunks > 0)
+           tbm->schunks = (PagetableEntry **)
+               MemoryContextAlloc(tbm->mcxt,
+                                  tbm->nchunks * sizeof(PagetableEntry *));
+
+       hash_seq_init(&status, tbm->pagetable);
+       npages = nchunks = 0;
+       while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+       {
+           if (page->ischunk)
+               tbm->schunks[nchunks++] = page;
+           else
+               tbm->spages[npages++] = page;
+       }
+       Assert(npages == tbm->npages);
+       Assert(nchunks == tbm->nchunks);
+       if (npages > 1)
+           qsort(tbm->spages, npages, sizeof(PagetableEntry *),
+                 tbm_comparator);
+       if (nchunks > 1)
+           qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *),
+                 tbm_comparator);
    }
-   Assert(npages == tbm->npages);
-   Assert(nchunks == tbm->nchunks);
-   if (npages > 1)
-       qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
-   if (nchunks > 1)
-       qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
+
+   tbm->iterating = true;
+
+   return iterator;
 }
 
 /*
@@ -602,9 +626,10 @@ tbm_begin_iterate(TIDBitmap *tbm)
  * testing, recheck is always set true when ntuples < 0.)
  */
 TBMIterateResult *
-tbm_iterate(TIDBitmap *tbm)
+tbm_iterate(TBMIterator *iterator)
 {
-   TBMIterateResult *output = &(tbm->output);
+   TIDBitmap *tbm = iterator->tbm;
+   TBMIterateResult *output = &(iterator->output);
 
    Assert(tbm->iterating);
 
@@ -612,10 +637,10 @@ tbm_iterate(TIDBitmap *tbm)
     * If lossy chunk pages remain, make sure we've advanced schunkptr/
     * schunkbit to the next set bit.
     */
-   while (tbm->schunkptr < tbm->nchunks)
+   while (iterator->schunkptr < tbm->nchunks)
    {
-       PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
-       int         schunkbit = tbm->schunkbit;
+       PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
+       int         schunkbit = iterator->schunkbit;
 
        while (schunkbit < PAGES_PER_CHUNK)
        {
@@ -628,37 +653,37 @@ tbm_iterate(TIDBitmap *tbm)
        }
        if (schunkbit < PAGES_PER_CHUNK)
        {
-           tbm->schunkbit = schunkbit;
+           iterator->schunkbit = schunkbit;
            break;
        }
        /* advance to next chunk */
-       tbm->schunkptr++;
-       tbm->schunkbit = 0;
+       iterator->schunkptr++;
+       iterator->schunkbit = 0;
    }
 
    /*
     * If both chunk and per-page data remain, must output the numerically
     * earlier page.
     */
-   if (tbm->schunkptr < tbm->nchunks)
+   if (iterator->schunkptr < tbm->nchunks)
    {
-       PagetableEntry *chunk = tbm->schunks[tbm->schunkptr];
+       PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
        BlockNumber chunk_blockno;
 
-       chunk_blockno = chunk->blockno + tbm->schunkbit;
-       if (tbm->spageptr >= tbm->npages ||
-           chunk_blockno < tbm->spages[tbm->spageptr]->blockno)
+       chunk_blockno = chunk->blockno + iterator->schunkbit;
+       if (iterator->spageptr >= tbm->npages ||
+           chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
        {
            /* Return a lossy page indicator from the chunk */
            output->blockno = chunk_blockno;
            output->ntuples = -1;
            output->recheck = true;
-           tbm->schunkbit++;
+           iterator->schunkbit++;
            return output;
        }
    }
 
-   if (tbm->spageptr < tbm->npages)
+   if (iterator->spageptr < tbm->npages)
    {
        PagetableEntry *page;
        int         ntuples;
@@ -668,7 +693,7 @@ tbm_iterate(TIDBitmap *tbm)
        if (tbm->status == TBM_ONE_PAGE)
            page = &tbm->entry1;
        else
-           page = tbm->spages[tbm->spageptr];
+           page = tbm->spages[iterator->spageptr];
 
        /* scan bitmap to extract individual offset numbers */
        ntuples = 0;
@@ -692,7 +717,7 @@ tbm_iterate(TIDBitmap *tbm)
        output->blockno = page->blockno;
        output->ntuples = ntuples;
        output->recheck = page->recheck;
-       tbm->spageptr++;
+       iterator->spageptr++;
        return output;
    }
 
@@ -700,6 +725,19 @@ tbm_iterate(TIDBitmap *tbm)
    return NULL;
 }
 
+/*
+ * tbm_end_iterate - finish an iteration over a TIDBitmap
+ *
+ * Currently this is just a pfree, but it might do more someday.  (For
+ * instance, it could be useful to count open iterators and allow the
+ * bitmap to return to read/write status when there are no more iterators.)
+ */
+void
+tbm_end_iterate(TBMIterator *iterator)
+{
+   pfree(iterator);
+}
+
 /*
  * tbm_find_pageentry - find a PagetableEntry for the pageno
  *
index 78269a415a0153dd4105329c06555656775f5b0a..1425333221d487ab4a0b58f60c173dcc4dde53ee 100644 (file)
@@ -4,7 +4,7 @@
  *
  * Copyright (c) 2006-2009, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.27 2009/01/01 17:23:55 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/gin.h,v 1.28 2009/01/10 21:08:36 tgl Exp $
  *--------------------------------------------------------------------------
  */
 
@@ -380,6 +380,7 @@ typedef struct GinScanEntryData
    /* partial match support */
    bool        isPartialMatch;
    TIDBitmap  *partialMatch;
+   TBMIterator *partialMatchIterator;
    TBMIterateResult *partialMatchResult;
    StrategyNumber strategy;
 
index 4b2b64c300ee0540ba561903b1428c4d94d8ffac..506605df0014676ded4f42770ea11469d4087148 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.199 2009/01/01 17:23:59 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.200 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1152,6 +1152,7 @@ typedef struct BitmapIndexScanState
  *
  *     bitmapqualorig     execution state for bitmapqualorig expressions
  *     tbm                bitmap obtained from child index scan(s)
+ *     tbmiterator        iterator for scanning current pages
  *     tbmres             current-page data
  * ----------------
  */
@@ -1160,6 +1161,7 @@ typedef struct BitmapHeapScanState
    ScanState   ss;             /* its first field is NodeTag */
    List       *bitmapqualorig;
    TIDBitmap  *tbm;
+   TBMIterator *tbmiterator;
    TBMIterateResult *tbmres;
 } BitmapHeapScanState;
 
index e6ce0db892d433dd881d856c58c325df1fe5142f..93658543e4287b315bcd3e166b17326dc7476b4c 100644 (file)
@@ -15,7 +15,7 @@
  *
  * Copyright (c) 2003-2009, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.8 2009/01/01 17:24:00 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/tidbitmap.h,v 1.9 2009/01/10 21:08:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,6 +31,9 @@
  */
 typedef struct TIDBitmap TIDBitmap;
 
+/* Likewise, TBMIterator is private */
+typedef struct TBMIterator TBMIterator;
+
 /* Result structure for tbm_iterate */
 typedef struct
 {
@@ -55,7 +58,8 @@ extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b);
 
 extern bool tbm_is_empty(const TIDBitmap *tbm);
 
-extern void tbm_begin_iterate(TIDBitmap *tbm);
-extern TBMIterateResult *tbm_iterate(TIDBitmap *tbm);
+extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
+extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
+extern void tbm_end_iterate(TBMIterator *iterator);
 
 #endif   /* TIDBITMAP_H */