Enhance nbtree ScalarArrayOp execution.

author Peter Geoghegan <pg@bowt.ie>
Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
committer Peter Geoghegan <pg@bowt.ie>
Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml

index b68daa55aeb726239809f2a39b93d8da7f0ad3f0..76ac0fcddd78d6da0ae656e68e807947e2196865 100644 (file)
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -809,7 +809,8 @@ amrestrpos (IndexScanDesc scan);
    <para>
  <programlisting>
  Size
-amestimateparallelscan (void);
+amestimateparallelscan (int nkeys,
+                        int norderbys);
  </programlisting>
     Estimate and return the number of bytes of dynamic shared memory which
     the access method will need to perform a parallel scan.  (This number
@@ -817,6 +818,13 @@ amestimateparallelscan (void);
     AM-independent data in <structname>ParallelIndexScanDescData</structname>.)
    </para>
  
+  <para>
+   The <literal>nkeys</literal> and <literal>norderbys</literal>
+   parameters indicate the number of quals and ordering operators that will be
+   used in the scan; the same values will be passed to <function>amrescan</function>.
+   Note that the actual values of the scan keys aren't provided yet.
+  </para>
+
    <para>
     It is not necessary to implement this function for access methods which
     do not support parallel scans or for which the number of additional bytes
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml

index e1e96ba7c45fef482e8e4522c83e95201703cc3f..053da8d6e40a0e65b104e762669efaf5477b629c 100644 (file)
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -4064,6 +4064,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage
     </para>
    </note>
  
+  <note>
+   <para>
+    Queries that use certain <acronym>SQL</acronym> constructs to search for
+    rows matching any value out of a list or array of multiple scalar values
+    (see <xref linkend="functions-comparisons"/>) perform multiple
+    <quote>primitive</quote> index scans (up to one primitive scan per scalar
+    value) during query execution.  Each internal primitive index scan
+    increments <structname>pg_stat_all_indexes</structname>.<structfield>idx_scan</structfield>,
+    so it's possible for the count of index scans to significantly exceed the
+    total number of index scan executor node executions.
+   </para>
+  </note>
+
   </sect2>
  
   <sect2 id="monitoring-pg-statio-all-tables-view">
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c

index 78ac3b1abb3eca279a09db4a1d16ffe2cde949f2..7510159fc8d46921e96baa5a55301db0844c450f 100644 (file)
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -449,13 +449,10 @@ index_restrpos(IndexScanDesc scan)
  
  /*
   * index_parallelscan_estimate - estimate shared memory for parallel scan
- *
- * Currently, we don't pass any information to the AM-specific estimator,
- * so it can probably only return a constant.  In the future, we might need
- * to pass more information.
   */
  Size
-index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
+index_parallelscan_estimate(Relation indexRelation, int nkeys, int norderbys,
+                                                       Snapshot snapshot)
  {
         Size            nbytes;
  
@@ -474,7 +471,8 @@ index_parallelscan_estimate(Relation indexRelation, Snapshot snapshot)
          */
         if (indexRelation->rd_indam->amestimateparallelscan != NULL)
                 nbytes = add_size(nbytes,
-                                                 indexRelation->rd_indam->amestimateparallelscan());
+                                                 indexRelation->rd_indam->amestimateparallelscan(nkeys,
+                                                                                                                                                 norderbys));
  
         return nbytes;
  }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c

index 41df1027d2d979ff11e7a987914970b57fbcfd3f..686a3206f726bdf07852ee96b23350c1261ea337 100644 (file)
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -40,6 +40,9 @@
  /*
   * BTPARALLEL_NOT_INITIALIZED indicates that the scan has not started.
   *
+ * BTPARALLEL_NEED_PRIMSCAN indicates that some process must now seize the
+ * scan to advance it via another call to _bt_first.
+ *
   * BTPARALLEL_ADVANCING indicates that some process is advancing the scan to
   * a new page; others must wait.
   *
@@ -47,11 +50,11 @@
   * to a new page; some process can start doing that.
   *
   * BTPARALLEL_DONE indicates that the scan is complete (including error exit).
- * We reach this state once for every distinct combination of array keys.
   */
  typedef enum
  {
         BTPARALLEL_NOT_INITIALIZED,
+       BTPARALLEL_NEED_PRIMSCAN,
         BTPARALLEL_ADVANCING,
         BTPARALLEL_IDLE,
         BTPARALLEL_DONE,
@@ -67,10 +70,14 @@ typedef struct BTParallelScanDescData
         BTPS_State      btps_pageStatus;        /* indicates whether next page is
                                                                          * available for scan. see above for
                                                                          * possible states of parallel scan. */
-       int                     btps_arrayKeyCount; /* count indicating number of array scan
-                                                                        * keys processed by parallel scan */
-       slock_t         btps_mutex;             /* protects above variables */
+       slock_t         btps_mutex;             /* protects above variables, btps_arrElems */
         ConditionVariable btps_cv;      /* used to synchronize parallel scan */
+
+       /*
+        * btps_arrElems is used when scans need to schedule another primitive
+        * index scan.  Holds BTArrayKeyInfo.cur_elem offsets for scan keys.
+        */
+       int                     btps_arrElems[FLEXIBLE_ARRAY_MEMBER];
  }                      BTParallelScanDescData;
  
  typedef struct BTParallelScanDescData *BTParallelScanDesc;
@@ -204,21 +211,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
         /* btree indexes are never lossy */
         scan->xs_recheck = false;
  
-       /*
-        * If we have any array keys, initialize them during first call for a
-        * scan.  We can't do this in btrescan because we don't know the scan
-        * direction at that time.
-        */
-       if (so->numArrayKeys && !BTScanPosIsValid(so->currPos))
-       {
-               /* punt if we have any unsatisfiable array keys */
-               if (so->numArrayKeys < 0)
-                       return false;
-
-               _bt_start_array_keys(scan, dir);
-       }
-
-       /* This loop handles advancing to the next array elements, if any */
+       /* Each loop iteration performs another primitive index scan */
         do
         {
                 /*
@@ -260,8 +253,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
                 /* If we have a tuple, return it ... */
                 if (res)
                         break;
-               /* ... otherwise see if we have more array keys to deal with */
-       } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir));
+               /* ... otherwise see if we need another primitive index scan */
+       } while (so->numArrayKeys && _bt_start_prim_scan(scan, dir));
  
         return res;
  }
@@ -276,19 +269,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
         int64           ntids = 0;
         ItemPointer heapTid;
  
-       /*
-        * If we have any array keys, initialize them.
-        */
-       if (so->numArrayKeys)
-       {
-               /* punt if we have any unsatisfiable array keys */
-               if (so->numArrayKeys < 0)
-                       return ntids;
-
-               _bt_start_array_keys(scan, ForwardScanDirection);
-       }
-
-       /* This loop handles advancing to the next array elements, if any */
+       /* Each loop iteration performs another primitive index scan */
         do
         {
                 /* Fetch the first page & tuple */
@@ -318,8 +299,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
                                 ntids++;
                         }
                 }
-               /* Now see if we have more array keys to deal with */
-       } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection));
+               /* Now see if we need another primitive index scan */
+       } while (so->numArrayKeys && _bt_start_prim_scan(scan, ForwardScanDirection));
  
         return ntids;
  }
@@ -348,10 +329,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
         else
                 so->keyData = NULL;
  
-       so->arrayKeyData = NULL;        /* assume no array keys for now */
-       so->arraysStarted = false;
-       so->numArrayKeys = 0;
+       so->needPrimScan = false;
+       so->scanBehind = false;
         so->arrayKeys = NULL;
+       so->orderProcs = NULL;
         so->arrayContext = NULL;
  
         so->killedItems = NULL;         /* until needed */
@@ -391,7 +372,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
         }
  
         so->markItemIndex = -1;
-       so->arrayKeyCount = 0;
+       so->needPrimScan = false;
+       so->scanBehind = false;
         BTScanPosUnpinIfPinned(so->markPos);
         BTScanPosInvalidate(so->markPos);
  
@@ -425,9 +407,7 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
                                 scankey,
                                 scan->numberOfKeys * sizeof(ScanKeyData));
         so->numberOfKeys = 0;           /* until _bt_preprocess_keys sets it */
-
-       /* If any keys are SK_SEARCHARRAY type, set up array-key info */
-       _bt_preprocess_array_keys(scan);
+       so->numArrayKeys = 0;           /* ditto */
  }
  
  /*
@@ -455,7 +435,7 @@ btendscan(IndexScanDesc scan)
         /* Release storage */
         if (so->keyData != NULL)
                 pfree(so->keyData);
-       /* so->arrayKeyData and so->arrayKeys are in arrayContext */
+       /* so->arrayKeys and so->orderProcs are in arrayContext */
         if (so->arrayContext != NULL)
                 MemoryContextDelete(so->arrayContext);
         if (so->killedItems != NULL)
@@ -490,10 +470,6 @@ btmarkpos(IndexScanDesc scan)
                 BTScanPosInvalidate(so->markPos);
                 so->markItemIndex = -1;
         }
-
-       /* Also record the current positions of any array keys */
-       if (so->numArrayKeys)
-               _bt_mark_array_keys(scan);
  }
  
  /*
@@ -504,10 +480,6 @@ btrestrpos(IndexScanDesc scan)
  {
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
  
-       /* Restore the marked positions of any array keys */
-       if (so->numArrayKeys)
-               _bt_restore_array_keys(scan);
-
         if (so->markItemIndex >= 0)
         {
                 /*
@@ -546,6 +518,12 @@ btrestrpos(IndexScanDesc scan)
                         if (so->currTuples)
                                 memcpy(so->currTuples, so->markTuples,
                                            so->markPos.nextTupleOffset);
+                       /* Reset the scan's array keys (see _bt_steppage for why) */
+                       if (so->numArrayKeys)
+                       {
+                               _bt_start_array_keys(scan, so->currPos.dir);
+                               so->needPrimScan = false;
+                       }
                 }
                 else
                         BTScanPosInvalidate(so->currPos);
@@ -556,9 +534,10 @@ btrestrpos(IndexScanDesc scan)
   * btestimateparallelscan -- estimate storage for BTParallelScanDescData
   */
  Size
-btestimateparallelscan(void)
+btestimateparallelscan(int nkeys, int norderbys)
  {
-       return sizeof(BTParallelScanDescData);
+       /* Pessimistically assume all input scankeys will be output with arrays */
+       return offsetof(BTParallelScanDescData, btps_arrElems) + sizeof(int) * nkeys;
  }
  
  /*
@@ -572,7 +551,6 @@ btinitparallelscan(void *target)
         SpinLockInit(&bt_target->btps_mutex);
         bt_target->btps_scanPage = InvalidBlockNumber;
         bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-       bt_target->btps_arrayKeyCount = 0;
         ConditionVariableInit(&bt_target->btps_cv);
  }
  
@@ -598,7 +576,6 @@ btparallelrescan(IndexScanDesc scan)
         SpinLockAcquire(&btscan->btps_mutex);
         btscan->btps_scanPage = InvalidBlockNumber;
         btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-       btscan->btps_arrayKeyCount = 0;
         SpinLockRelease(&btscan->btps_mutex);
  }
  
@@ -608,23 +585,26 @@ btparallelrescan(IndexScanDesc scan)
   *             or _bt_parallel_done().
   *
   * The return value is true if we successfully seized the scan and false
- * if we did not.  The latter case occurs if no pages remain for the current
- * set of scankeys.
+ * if we did not.  The latter case occurs if no pages remain.
   *
   * If the return value is true, *pageno returns the next or current page
   * of the scan (depending on the scan direction).  An invalid block number
- * means the scan hasn't yet started, and P_NONE means we've reached the end.
+ * means the scan hasn't yet started, or that caller needs to start the next
+ * primitive index scan (if it's the latter case we'll set so->needPrimScan).
   * The first time a participating process reaches the last page, it will return
   * true and set *pageno to P_NONE; after that, further attempts to seize the
   * scan will return false.
   *
   * Callers should ignore the value of pageno if the return value is false.
+ *
+ * Callers that are in a position to start a new primitive index scan must
+ * pass first=true (all other callers pass first=false).  We just return false
+ * for first=false callers that require another primitive index scan.
   */
  bool
-_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
+_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
  {
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
-       BTPS_State      pageStatus;
         bool            exit_loop = false;
         bool            status = true;
         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
@@ -632,28 +612,69 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
  
         *pageno = P_NONE;
  
+       if (first)
+       {
+               /*
+                * Initialize array related state when called from _bt_first, assuming
+                * that this will either be the first primitive index scan for the
+                * scan, or a primitive scan that was explicitly scheduled earlier.
+                *
+                * Note: so->needPrimScan is only set when a scheduled primitive index
+                * scan is set to be performed in caller's worker process.  It should
+                * not be set here by us for the first primitive scan, nor should we
+                * ever set it for a parallel scan that has no array keys.
+                */
+               so->needPrimScan = false;
+               so->scanBehind = false;
+       }
+       else
+       {
+               /*
+                * Don't attempt to seize the scan when backend requires another
+                * primitive index scan unless we're in a position to start it now
+                */
+               if (so->needPrimScan)
+                       return false;
+       }
+
         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                                                                   parallel_scan->ps_offset);
  
         while (1)
         {
                 SpinLockAcquire(&btscan->btps_mutex);
-               pageStatus = btscan->btps_pageStatus;
  
-               if (so->arrayKeyCount < btscan->btps_arrayKeyCount)
+               if (btscan->btps_pageStatus == BTPARALLEL_DONE)
                 {
-                       /* Parallel scan has already advanced to a new set of scankeys. */
+                       /* We're done with this parallel index scan */
                         status = false;
                 }
-               else if (pageStatus == BTPARALLEL_DONE)
+               else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN)
                 {
+                       Assert(so->numArrayKeys);
+
                         /*
-                        * We're done with this set of scankeys.  This may be the end, or
-                        * there could be more sets to try.
+                        * If we can start another primitive scan right away, do so.
+                        * Otherwise just wait.
                          */
-                       status = false;
+                       if (first)
+                       {
+                               btscan->btps_pageStatus = BTPARALLEL_ADVANCING;
+                               for (int i = 0; i < so->numArrayKeys; i++)
+                               {
+                                       BTArrayKeyInfo *array = &so->arrayKeys[i];
+                                       ScanKey         skey = &so->keyData[array->scan_key];
+
+                                       array->cur_elem = btscan->btps_arrElems[i];
+                                       skey->sk_argument = array->elem_values[array->cur_elem];
+                               }
+                               so->needPrimScan = true;
+                               so->scanBehind = false;
+                               *pageno = InvalidBlockNumber;
+                               exit_loop = true;
+                       }
                 }
-               else if (pageStatus != BTPARALLEL_ADVANCING)
+               else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
                 {
                         /*
                          * We have successfully seized control of the scan for the purpose
@@ -677,6 +698,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno)
   * _bt_parallel_release() -- Complete the process of advancing the scan to a
   *             new page.  We now have the new value btps_scanPage; some other backend
   *             can now begin advancing the scan.
+ *
+ * Callers whose scan uses array keys must save their scan_page argument so
+ * that it can be passed to _bt_parallel_primscan_schedule, should caller
+ * determine that another primitive index scan is required.  If that happens,
+ * scan_page won't be scanned by any backend (unless the next primitive index
+ * scan lands on scan_page).
   */
  void
  _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
@@ -704,7 +731,6 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page)
  void
  _bt_parallel_done(IndexScanDesc scan)
  {
-       BTScanOpaque so = (BTScanOpaque) scan->opaque;
         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
         BTParallelScanDesc btscan;
         bool            status_changed = false;
@@ -717,13 +743,11 @@ _bt_parallel_done(IndexScanDesc scan)
                                                                                                   parallel_scan->ps_offset);
  
         /*
-        * Mark the parallel scan as done for this combination of scan keys,
-        * unless some other process already did so.  See also
-        * _bt_advance_array_keys.
+        * Mark the parallel scan as done, unless some other process did so
+        * already
          */
         SpinLockAcquire(&btscan->btps_mutex);
-       if (so->arrayKeyCount >= btscan->btps_arrayKeyCount &&
-               btscan->btps_pageStatus != BTPARALLEL_DONE)
+       if (btscan->btps_pageStatus != BTPARALLEL_DONE)
         {
                 btscan->btps_pageStatus = BTPARALLEL_DONE;
                 status_changed = true;
@@ -736,29 +760,39 @@ _bt_parallel_done(IndexScanDesc scan)
  }
  
  /*
- * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array
- *                     keys.
+ * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan.
   *
- * Updates the count of array keys processed for both local and parallel
- * scans.
+ * Caller passes the block number most recently passed to _bt_parallel_release
+ * by its backend.  Caller successfully schedules the next primitive index scan
+ * if the shared parallel state hasn't been seized since caller's backend last
+ * advanced the scan.
   */
  void
-_bt_parallel_advance_array_keys(IndexScanDesc scan)
+_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page)
  {
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
         ParallelIndexScanDesc parallel_scan = scan->parallel_scan;
         BTParallelScanDesc btscan;
  
+       Assert(so->numArrayKeys);
+
         btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan,
                                                                                                   parallel_scan->ps_offset);
  
-       so->arrayKeyCount++;
         SpinLockAcquire(&btscan->btps_mutex);
-       if (btscan->btps_pageStatus == BTPARALLEL_DONE)
+       if (btscan->btps_scanPage == prev_scan_page &&
+               btscan->btps_pageStatus == BTPARALLEL_IDLE)
         {
                 btscan->btps_scanPage = InvalidBlockNumber;
-               btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED;
-               btscan->btps_arrayKeyCount++;
+               btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN;
+
+               /* Serialize scan's current array keys */
+               for (int i = 0; i < so->numArrayKeys; i++)
+               {
+                       BTArrayKeyInfo *array = &so->arrayKeys[i];
+
+                       btscan->btps_arrElems[i] = array->cur_elem;
+               }
         }
         SpinLockRelease(&btscan->btps_mutex);
  }
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c

index e3fff90d8e421a119c2b3e7290bdc89a381c29d0..d241e8ea1dcce5b6c5ee1a5a932ebc4b19c6a68c 100644 (file)
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -907,7 +907,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
          */
         if (!so->qual_ok)
         {
-               /* Notify any other workers that we're done with this scan key. */
                 _bt_parallel_done(scan);
                 return false;
         }
@@ -917,10 +916,22 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
          * scan has not started, proceed to find out first leaf page in the usual
          * way while keeping other participating processes waiting.  If the scan
          * has already begun, use the page number from the shared structure.
+        *
+        * When a parallel scan has another primitive index scan scheduled, a
+        * parallel worker will seize the scan for that purpose now.  This is
+        * similar to the case where the top-level scan hasn't started.
          */
         if (scan->parallel_scan != NULL)
         {
-               status = _bt_parallel_seize(scan, &blkno);
+               status = _bt_parallel_seize(scan, &blkno, true);
+
+               /*
+                * Initialize arrays (when _bt_parallel_seize didn't already set up
+                * the next primitive index scan)
+                */
+               if (so->numArrayKeys && !so->needPrimScan)
+                       _bt_start_array_keys(scan, dir);
+
                 if (!status)
                         return false;
                 else if (blkno == P_NONE)
@@ -935,6 +946,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
                         goto readcomplete;
                 }
         }
+       else if (so->numArrayKeys && !so->needPrimScan)
+       {
+               /*
+                * First _bt_first call (for current btrescan) without parallelism.
+                *
+                * Initialize arrays, and the corresponding scan keys that were just
+                * output by _bt_preprocess_keys.
+                */
+               _bt_start_array_keys(scan, dir);
+       }
  
         /*----------
          * Examine the scan keys to discover where we need to start the scan.
@@ -980,6 +1001,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
          *
          * The selected scan keys (at most one per index column) are remembered by
          * storing their addresses into the local startKeys[] array.
+        *
+        * _bt_checkkeys/_bt_advance_array_keys decide whether and when to start
+        * the next primitive index scan (for scans with array keys) based in part
+        * on an understanding of how it'll enable us to reposition the scan.
+        * They're directly aware of how we'll sometimes cons up an explicit
+        * SK_SEARCHNOTNULL key.  They'll even end primitive scans by applying a
+        * symmetric "deduce NOT NULL" rule of their own.  This allows top-level
+        * scans to skip large groups of NULLs through repeated deductions about
+        * key strictness (for a required inequality key) and whether NULLs in the
+        * key's index column are stored last or first (relative to non-NULLs).
+        * If you update anything here, _bt_checkkeys/_bt_advance_array_keys might
+        * need to be kept in sync.
          *----------
          */
         strat_total = BTEqualStrategyNumber;
@@ -1502,7 +1535,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
   * We scan the current page starting at offnum and moving in the indicated
   * direction.  All items matching the scan keys are loaded into currPos.items.
   * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports
- * that there can be no more matching tuples in the current scan direction.
+ * that there can be no more matching tuples in the current scan direction
+ * (could just be for the current primitive index scan when scan has arrays).
   *
   * _bt_first caller passes us an offnum returned by _bt_binsrch, which might
   * be an out of bounds offnum such as "maxoff + 1" in certain corner cases.
@@ -1527,11 +1561,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
         BTPageOpaque opaque;
         OffsetNumber minoff;
         OffsetNumber maxoff;
-       int                     itemIndex;
-       bool            continuescan;
-       int                     indnatts;
-       bool            continuescanPrechecked;
-       bool            haveFirstMatch = false;
+       BTReadPageState pstate;
+       bool            arrayKeys;
+       int                     itemIndex,
+                               indnatts;
  
         /*
          * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1546,16 +1579,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
         if (scan->parallel_scan)
         {
                 if (ScanDirectionIsForward(dir))
-                       _bt_parallel_release(scan, opaque->btpo_next);
+                       pstate.prev_scan_page = opaque->btpo_next;
                 else
-                       _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
+                       pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf);
+
+               _bt_parallel_release(scan, pstate.prev_scan_page);
         }
  
-       continuescan = true;            /* default assumption */
         indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation);
+       arrayKeys = so->numArrayKeys != 0;
         minoff = P_FIRSTDATAKEY(opaque);
         maxoff = PageGetMaxOffsetNumber(page);
  
+       /* initialize page-level state that we'll pass to _bt_checkkeys */
+       pstate.dir = dir;
+       pstate.minoff = minoff;
+       pstate.maxoff = maxoff;
+       pstate.finaltup = NULL;
+       pstate.page = page;
+       pstate.offnum = InvalidOffsetNumber;
+       pstate.skip = InvalidOffsetNumber;
+       pstate.continuescan = true; /* default assumption */
+       pstate.prechecked = false;
+       pstate.firstmatch = false;
+       pstate.rechecks = 0;
+       pstate.targetdistance = 0;
+
         /*
          * We note the buffer's block number so that we can release the pin later.
          * This allows us to re-read the buffer if it is needed again for hinting.
@@ -1598,10 +1647,34 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
          * corresponding value from the last item on the page.  So checking with
          * the last item on the page would give a more precise answer.
          *
-        * We skip this for the first page in the scan to evade the possible
-        * slowdown of the point queries.
+        * We skip this for the first page read by each (primitive) scan, to avoid
+        * slowing down point queries.  They typically don't stand to gain much
+        * when the optimization can be applied, and are more likely to notice the
+        * overhead of the precheck.
+        *
+        * The optimization is unsafe and must be avoided whenever _bt_checkkeys
+        * just set a low-order required array's key to the best available match
+        * for a truncated -inf attribute value from the prior page's high key
+        * (array element 0 is always the best available match in this scenario).
+        * It's quite likely that matches for array element 0 begin on this page,
+        * but the start of matches won't necessarily align with page boundaries.
+        * When the start of matches is somewhere in the middle of this page, it
+        * would be wrong to treat page's final non-pivot tuple as representative.
+        * Doing so might lead us to treat some of the page's earlier tuples as
+        * being part of a group of tuples thought to satisfy the required keys.
+        *
+        * Note: Conversely, in the case where the scan's arrays just advanced
+        * using the prior page's HIKEY _without_ advancement setting scanBehind,
+        * the start of matches must be aligned with page boundaries, which makes
+        * it safe to attempt the optimization here now.  It's also safe when the
+        * prior page's HIKEY simply didn't need to advance any required array. In
+        * both cases we can safely assume that the _first_ tuple from this page
+        * must be >= the current set of array keys/equality constraints. And so
+        * if the final tuple is == those same keys (and also satisfies any
+        * required < or <= strategy scan keys) during the precheck, we can safely
+        * assume that this must also be true of all earlier tuples from the page.
          */
-       if (!firstPage && minoff < maxoff)
+       if (!firstPage && !so->scanBehind && minoff < maxoff)
         {
                 ItemId          iid;
                 IndexTuple      itup;
@@ -1609,22 +1682,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                 iid = PageGetItemId(page, ScanDirectionIsForward(dir) ? maxoff : minoff);
                 itup = (IndexTuple) PageGetItem(page, iid);
  
-               /*
-                * Do the precheck.  Note that we pass the pointer to the
-                * 'continuescanPrechecked' to the 'continuescan' argument. That will
-                * set flag to true if all required keys are satisfied and false
-                * otherwise.
-                */
-               (void) _bt_checkkeys(scan, itup, indnatts, dir,
-                                                        &continuescanPrechecked, false, false);
-       }
-       else
-       {
-               continuescanPrechecked = false;
+               /* Call with arrayKeys=false to avoid undesirable side-effects */
+               _bt_checkkeys(scan, &pstate, false, itup, indnatts);
+               pstate.prechecked = pstate.continuescan;
+               pstate.continuescan = true; /* reset */
         }
  
         if (ScanDirectionIsForward(dir))
         {
+               /* SK_SEARCHARRAY forward scans must provide high key up front */
+               if (arrayKeys && !P_RIGHTMOST(opaque))
+               {
+                       ItemId          iid = PageGetItemId(page, P_HIKEY);
+
+                       pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+               }
+
                 /* load items[] in ascending order */
                 itemIndex = 0;
  
@@ -1649,23 +1722,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                         itup = (IndexTuple) PageGetItem(page, iid);
                         Assert(!BTreeTupleIsPivot(itup));
  
-                       passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-                                                                                &continuescan,
-                                                                                continuescanPrechecked,
-                                                                                haveFirstMatch);
+                       pstate.offnum = offnum;
+                       passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+                                                                                itup, indnatts);
  
                         /*
-                        * If the result of prechecking required keys was true, then in
-                        * assert-enabled builds we also recheck that the _bt_checkkeys()
-                        * result is the same.
+                        * Check if we need to skip ahead to a later tuple (only possible
+                        * when the scan uses array keys)
                          */
-                       Assert((!continuescanPrechecked && haveFirstMatch) ||
-                                  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
-                                                                                                &continuescan, false, false));
+                       if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+                       {
+                               Assert(!passes_quals && pstate.continuescan);
+                               Assert(offnum < pstate.skip);
+
+                               offnum = pstate.skip;
+                               pstate.skip = InvalidOffsetNumber;
+                               continue;
+                       }
+
                         if (passes_quals)
                         {
                                 /* tuple passes all scan key conditions */
-                               haveFirstMatch = true;
+                               pstate.firstmatch = true;
                                 if (!BTreeTupleIsPosting(itup))
                                 {
                                         /* Remember it */
@@ -1696,7 +1774,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                                 }
                         }
                         /* When !continuescan, there can't be any more matches, so stop */
-                       if (!continuescan)
+                       if (!pstate.continuescan)
                                 break;
  
                         offnum = OffsetNumberNext(offnum);
@@ -1713,17 +1791,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                  * only appear on non-pivot tuples on the right sibling page are
                  * common.
                  */
-               if (continuescan && !P_RIGHTMOST(opaque))
+               if (pstate.continuescan && !P_RIGHTMOST(opaque))
                 {
                         ItemId          iid = PageGetItemId(page, P_HIKEY);
                         IndexTuple      itup = (IndexTuple) PageGetItem(page, iid);
                         int                     truncatt;
  
                         truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation);
-                       _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false, false);
+                       pstate.prechecked = false;      /* precheck didn't cover HIKEY */
+                       _bt_checkkeys(scan, &pstate, arrayKeys, itup, truncatt);
                 }
  
-               if (!continuescan)
+               if (!pstate.continuescan)
                         so->currPos.moreRight = false;
  
                 Assert(itemIndex <= MaxTIDsPerBTreePage);
@@ -1733,6 +1812,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
         }
         else
         {
+               /* SK_SEARCHARRAY backward scans must provide final tuple up front */
+               if (arrayKeys && minoff <= maxoff && !P_LEFTMOST(opaque))
+               {
+                       ItemId          iid = PageGetItemId(page, minoff);
+
+                       pstate.finaltup = (IndexTuple) PageGetItem(page, iid);
+               }
+
                 /* load items[] in descending order */
                 itemIndex = MaxTIDsPerBTreePage;
  
@@ -1772,23 +1859,28 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                         itup = (IndexTuple) PageGetItem(page, iid);
                         Assert(!BTreeTupleIsPivot(itup));
  
-                       passes_quals = _bt_checkkeys(scan, itup, indnatts, dir,
-                                                                                &continuescan,
-                                                                                continuescanPrechecked,
-                                                                                haveFirstMatch);
+                       pstate.offnum = offnum;
+                       passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys,
+                                                                                itup, indnatts);
  
                         /*
-                        * If the result of prechecking required keys was true, then in
-                        * assert-enabled builds we also recheck that the _bt_checkkeys()
-                        * result is the same.
+                        * Check if we need to skip ahead to a later tuple (only possible
+                        * when the scan uses array keys)
                          */
-                       Assert((!continuescanPrechecked && !haveFirstMatch) ||
-                                  passes_quals == _bt_checkkeys(scan, itup, indnatts, dir,
-                                                                                                &continuescan, false, false));
+                       if (arrayKeys && OffsetNumberIsValid(pstate.skip))
+                       {
+                               Assert(!passes_quals && pstate.continuescan);
+                               Assert(offnum > pstate.skip);
+
+                               offnum = pstate.skip;
+                               pstate.skip = InvalidOffsetNumber;
+                               continue;
+                       }
+
                         if (passes_quals && tuple_alive)
                         {
                                 /* tuple passes all scan key conditions */
-                               haveFirstMatch = true;
+                               pstate.firstmatch = true;
                                 if (!BTreeTupleIsPosting(itup))
                                 {
                                         /* Remember it */
@@ -1824,7 +1916,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
                                         }
                                 }
                         }
-                       if (!continuescan)
+                       if (!pstate.continuescan)
                         {
                                 /* there can't be any more matches, so stop */
                                 so->currPos.moreLeft = false;
@@ -1970,6 +2062,31 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
                                    so->currPos.nextTupleOffset);
                 so->markPos.itemIndex = so->markItemIndex;
                 so->markItemIndex = -1;
+
+               /*
+                * If we're just about to start the next primitive index scan
+                * (possible with a scan that has array keys, and needs to skip to
+                * continue in the current scan direction), moreLeft/moreRight only
+                * indicate the end of the current primitive index scan.  They must
+                * never be taken to indicate that the top-level index scan has ended
+                * (that would be wrong).
+                *
+                * We could handle this case by treating the current array keys as
+                * markPos state.  But depending on the current array state like this
+                * would add complexity.  Instead, we just unset markPos's copy of
+                * moreRight or moreLeft (whichever might be affected), while making
+                * btrestrpos reset the scan's arrays to their initial scan positions.
+                * In effect, btrestrpos leaves advancing the arrays up to the first
+                * _bt_readpage call (that takes place after it has restored markPos).
+                */
+               Assert(so->markPos.dir == dir);
+               if (so->needPrimScan)
+               {
+                       if (ScanDirectionIsForward(dir))
+                               so->markPos.moreRight = true;
+                       else
+                               so->markPos.moreLeft = true;
+               }
         }
  
         if (ScanDirectionIsForward(dir))
@@ -1981,7 +2098,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
                          * Seize the scan to get the next block number; if the scan has
                          * ended already, bail out.
                          */
-                       status = _bt_parallel_seize(scan, &blkno);
+                       status = _bt_parallel_seize(scan, &blkno, false);
                         if (!status)
                         {
                                 /* release the previous buffer, if pinned */
@@ -2013,7 +2130,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
                          * Seize the scan to get the current block number; if the scan has
                          * ended already, bail out.
                          */
-                       status = _bt_parallel_seize(scan, &blkno);
+                       status = _bt_parallel_seize(scan, &blkno, false);
                         BTScanPosUnpinIfPinned(so->currPos);
                         if (!status)
                         {
@@ -2097,7 +2214,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
                         if (scan->parallel_scan != NULL)
                         {
                                 _bt_relbuf(rel, so->currPos.buf);
-                               status = _bt_parallel_seize(scan, &blkno);
+                               status = _bt_parallel_seize(scan, &blkno, false);
                                 if (!status)
                                 {
                                         BTScanPosInvalidate(so->currPos);
@@ -2193,7 +2310,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
                         if (scan->parallel_scan != NULL)
                         {
                                 _bt_relbuf(rel, so->currPos.buf);
-                               status = _bt_parallel_seize(scan, &blkno);
+                               status = _bt_parallel_seize(scan, &blkno, false);
                                 if (!status)
                                 {
                                         BTScanPosInvalidate(so->currPos);
@@ -2218,6 +2335,8 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
  {
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
  
+       Assert(!so->needPrimScan);
+
         _bt_initialize_more_data(so, dir);
  
         if (!_bt_readnextpage(scan, blkno, dir))
@@ -2524,14 +2643,22 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
  }
  
  /*
- * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately
- * for scan direction
+ * _bt_initialize_more_data() -- initialize currPos's moreLeft, moreRight
+ * and scan dir fields
   */
  static inline void
  _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir)
  {
-       /* initialize moreLeft/moreRight appropriately for scan direction */
-       if (ScanDirectionIsForward(dir))
+       so->currPos.dir = dir;
+       if (so->needPrimScan)
+       {
+               Assert(so->numArrayKeys);
+
+               so->currPos.moreLeft = true;
+               so->currPos.moreRight = true;
+               so->needPrimScan = false;
+       }
+       else if (ScanDirectionIsForward(dir))
         {
                 so->currPos.moreLeft = false;
                 so->currPos.moreRight = true;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c

index d50317096da347933562ec6d10109dcb3fcbd083..e963de78a7bea106752e82d2bdf135fbae57af35 100644 (file)
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -29,29 +29,77 @@
  #include "utils/memutils.h"
  #include "utils/rel.h"
  
+#define LOOK_AHEAD_REQUIRED_RECHECKS   3
+#define LOOK_AHEAD_DEFAULT_DISTANCE    5
  
  typedef struct BTSortArrayContext
  {
-       FmgrInfo        flinfo;
+       FmgrInfo   *sortproc;
         Oid                     collation;
         bool            reverse;
  } BTSortArrayContext;
  
+typedef struct BTScanKeyPreproc
+{
+       ScanKey         skey;
+       int                     ikey;
+       int                     arrayidx;
+} BTScanKeyPreproc;
+
+static void _bt_setup_array_cmp(IndexScanDesc scan, ScanKey skey, Oid elemtype,
+                                                               FmgrInfo *orderproc, FmgrInfo **sortprocp);
  static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey,
-                                                                         StrategyNumber strat,
+                                                                         Oid elemtype, StrategyNumber strat,
                                                                           Datum *elems, int nelems);
-static int     _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
-                                                                       bool reverse,
-                                                                       Datum *elems, int nelems);
+static int     _bt_sort_array_elements(ScanKey skey, FmgrInfo *sortproc,
+                                                                       bool reverse, Datum *elems, int nelems);
+static bool _bt_merge_arrays(IndexScanDesc scan, ScanKey skey,
+                                                        FmgrInfo *sortproc, bool reverse,
+                                                        Oid origelemtype, Oid nextelemtype,
+                                                        Datum *elems_orig, int *nelems_orig,
+                                                        Datum *elems_next, int nelems_next);
+static bool _bt_compare_array_scankey_args(IndexScanDesc scan,
+                                                                                  ScanKey arraysk, ScanKey skey,
+                                                                                  FmgrInfo *orderproc, BTArrayKeyInfo *array,
+                                                                                  bool *qual_ok);
+static ScanKey _bt_preprocess_array_keys(IndexScanDesc scan);
+static void _bt_preprocess_array_keys_final(IndexScanDesc scan, int *keyDataMap);
  static int     _bt_compare_array_elements(const void *a, const void *b, void *arg);
+static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc,
+                                                                                  Datum tupdatum, bool tupnull,
+                                                                                  Datum arrdatum, ScanKey cur);
+static int     _bt_binsrch_array_skey(FmgrInfo *orderproc,
+                                                                  bool cur_elem_trig, ScanDirection dir,
+                                                                  Datum tupdatum, bool tupnull,
+                                                                  BTArrayKeyInfo *array, ScanKey cur,
+                                                                  int32 *set_elem_result);
+static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir);
+static void _bt_rewind_nonrequired_arrays(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, ScanDirection dir,
+                                                                                IndexTuple tuple, TupleDesc tupdesc, int tupnatts,
+                                                                                bool readpagetup, int sktrig, bool *scanBehind);
+static bool _bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate,
+                                                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                                                                  int sktrig, bool sktrig_required);
+#ifdef USE_ASSERT_CHECKING
+static bool _bt_verify_arrays_bt_first(IndexScanDesc scan, ScanDirection dir);
+static bool _bt_verify_keys_with_arraykeys(IndexScanDesc scan);
+#endif
  static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
                                                                          ScanKey leftarg, ScanKey rightarg,
+                                                                        BTArrayKeyInfo *array, FmgrInfo *orderproc,
                                                                          bool *result);
  static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
  static void _bt_mark_scankey_required(ScanKey skey);
+static bool _bt_check_compare(IndexScanDesc scan, ScanDirection dir,
+                                                         IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
+                                                         bool advancenonrequired, bool prechecked, bool firstmatch,
+                                                         bool *continuescan, int *ikey);
  static bool _bt_check_rowcompare(ScanKey skey,
                                                                  IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
                                                                  ScanDirection dir, bool *continuescan);
+static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate,
+                                                                        int tupnatts, TupleDesc tupdesc);
  static int     _bt_keep_natts(Relation rel, IndexTuple lastleft,
                                                    IndexTuple firstright, BTScanInsert itup_key);
  
@@ -188,29 +236,55 @@ _bt_freestack(BTStack stack)
   *
   * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and
   * set up BTArrayKeyInfo info for each one that is an equality-type key.
- * Prepare modified scan keys in so->arrayKeyData, which will hold the current
- * array elements during each primitive indexscan operation.  For inequality
- * array keys, it's sufficient to find the extreme element value and replace
- * the whole array with that scalar value.
- *
- * Note: the reason we need so->arrayKeyData, rather than just scribbling
- * on scan->keyData, is that callers are permitted to call btrescan without
- * supplying a new set of scankey data.
+ * Returns modified scan keys as input for further, standard preprocessing.
+ *
+ * Currently we perform two kinds of preprocessing to deal with redundancies.
+ * For inequality array keys, it's sufficient to find the extreme element
+ * value and replace the whole array with that scalar value.  This eliminates
+ * all but one array element as redundant.  Similarly, we are capable of
+ * "merging together" multiple equality array keys (from two or more input
+ * scan keys) into a single output scan key containing only the intersecting
+ * array elements.  This can eliminate many redundant array elements, as well
+ * as eliminating whole array scan keys as redundant.  It can also allow us to
+ * detect contradictory quals.
+ *
+ * It is convenient for _bt_preprocess_keys caller to have to deal with no
+ * more than one equality strategy array scan key per index attribute.  We'll
+ * always be able to set things up that way when complete opfamilies are used.
+ * Eliminated array scan keys can be recognized as those that have had their
+ * sk_strategy field set to InvalidStrategy here by us.  Caller should avoid
+ * including these in the scan's so->keyData[] output array.
+ *
+ * We set the scan key references from the scan's BTArrayKeyInfo info array to
+ * offsets into the temp modified input array returned to caller.  Scans that
+ * have array keys should call _bt_preprocess_array_keys_final when standard
+ * preprocessing steps are complete.  This will convert the scan key offset
+ * references into references to the scan's so->keyData[] output scan keys.
+ *
+ * Note: the reason we need to return a temp scan key array, rather than just
+ * scribbling on scan->keyData, is that callers are permitted to call btrescan
+ * without supplying a new set of scankey data.
   */
-void
+static ScanKey
  _bt_preprocess_array_keys(IndexScanDesc scan)
  {
         BTScanOpaque so = (BTScanOpaque) scan->opaque;
+       Relation        rel = scan->indexRelation;
         int                     numberOfKeys = scan->numberOfKeys;
-       int16      *indoption = scan->indexRelation->rd_indoption;
+       int16      *indoption = rel->rd_indoption;
         int                     numArrayKeys;
+       int                     origarrayatt = InvalidAttrNumber,
+                               origarraykey = -1;
+       Oid                     origelemtype = InvalidOid;
         ScanKey         cur;
-       int                     i;
         MemoryContext oldContext;
+       ScanKey         arrayKeyData;   /* modified copy of scan->keyData */
+
+       Assert(numberOfKeys);
  
         /* Quick check to see if there are any array keys */
         numArrayKeys = 0;
-       for (i = 0; i < numberOfKeys; i++)
+       for (int i = 0; i < numberOfKeys; i++)
         {
                 cur = &scan->keyData[i];
                 if (cur->sk_flags & SK_SEARCHARRAY)
@@ -220,20 +294,15 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                         /* If any arrays are null as a whole, we can quit right now. */
                         if (cur->sk_flags & SK_ISNULL)
                         {
-                               so->numArrayKeys = -1;
-                               so->arrayKeyData = NULL;
-                               return;
+                               so->qual_ok = false;
+                               return NULL;
                         }
                 }
         }
  
         /* Quit if nothing to do. */
         if (numArrayKeys == 0)
-       {
-               so->numArrayKeys = 0;
-               so->arrayKeyData = NULL;
-               return;
-       }
+               return NULL;
  
         /*
          * Make a scan-lifespan context to hold array-associated data, or reset it
@@ -249,18 +318,23 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
         oldContext = MemoryContextSwitchTo(so->arrayContext);
  
         /* Create modifiable copy of scan->keyData in the workspace context */
-       so->arrayKeyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData));
-       memcpy(so->arrayKeyData,
-                  scan->keyData,
-                  scan->numberOfKeys * sizeof(ScanKeyData));
+       arrayKeyData = (ScanKey) palloc(numberOfKeys * sizeof(ScanKeyData));
+       memcpy(arrayKeyData, scan->keyData, numberOfKeys * sizeof(ScanKeyData));
  
         /* Allocate space for per-array data in the workspace context */
-       so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo));
+       so->arrayKeys = (BTArrayKeyInfo *) palloc(numArrayKeys * sizeof(BTArrayKeyInfo));
+
+       /* Allocate space for ORDER procs used to help _bt_checkkeys */
+       so->orderProcs = (FmgrInfo *) palloc(numberOfKeys * sizeof(FmgrInfo));
  
         /* Now process each array key */
         numArrayKeys = 0;
-       for (i = 0; i < numberOfKeys; i++)
+       for (int i = 0; i < numberOfKeys; i++)
         {
+               FmgrInfo        sortproc;
+               FmgrInfo   *sortprocp = &sortproc;
+               Oid                     elemtype;
+               bool            reverse;
                 ArrayType  *arrayval;
                 int16           elmlen;
                 bool            elmbyval;
@@ -271,7 +345,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                 int                     num_nonnulls;
                 int                     j;
  
-               cur = &so->arrayKeyData[i];
+               cur = &arrayKeyData[i];
                 if (!(cur->sk_flags & SK_SEARCHARRAY))
                         continue;
  
@@ -305,10 +379,21 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                 /* If there's no non-nulls, the scan qual is unsatisfiable */
                 if (num_nonnulls == 0)
                 {
-                       numArrayKeys = -1;
+                       so->qual_ok = false;
                         break;
                 }
  
+               /*
+                * Determine the nominal datatype of the array elements.  We have to
+                * support the convention that sk_subtype == InvalidOid means the
+                * opclass input type; this is a hack to simplify life for
+                * ScanKeyInit().
+                */
+               elemtype = cur->sk_subtype;
+               if (elemtype == InvalidOid)
+                       elemtype = rel->rd_opcintype[cur->sk_attno - 1];
+               Assert(elemtype == ARR_ELEMTYPE(arrayval));
+
                 /*
                  * If the comparison operator is not equality, then the array qual
                  * degenerates to a simple comparison against the smallest or largest
@@ -319,7 +404,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                         case BTLessStrategyNumber:
                         case BTLessEqualStrategyNumber:
                                 cur->sk_argument =
-                                       _bt_find_extreme_element(scan, cur,
+                                       _bt_find_extreme_element(scan, cur, elemtype,
                                                                                          BTGreaterStrategyNumber,
                                                                                          elem_values, num_nonnulls);
                                 continue;
@@ -329,7 +414,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                         case BTGreaterEqualStrategyNumber:
                         case BTGreaterStrategyNumber:
                                 cur->sk_argument =
-                                       _bt_find_extreme_element(scan, cur,
+                                       _bt_find_extreme_element(scan, cur, elemtype,
                                                                                          BTLessStrategyNumber,
                                                                                          elem_values, num_nonnulls);
                                 continue;
@@ -339,17 +424,93 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
                                 break;
                 }
  
+               /*
+                * We'll need a 3-way ORDER proc to perform binary searches for the
+                * next matching array element.  Set that up now.
+                *
+                * Array scan keys with cross-type equality operators will require a
+                * separate same-type ORDER proc for sorting their array.  Otherwise,
+                * sortproc just points to the same proc used during binary searches.
+                */
+               _bt_setup_array_cmp(scan, cur, elemtype,
+                                                       &so->orderProcs[i], &sortprocp);
+
                 /*
                  * Sort the non-null elements and eliminate any duplicates.  We must
                  * sort in the same ordering used by the index column, so that the
-                * successive primitive indexscans produce data in index order.
+                * arrays can be advanced in lockstep with the scan's progress through
+                * the index's key space.
                  */
-               num_elems = _bt_sort_array_elements(scan, cur,
-                                                                                       (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0,
+               reverse = (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0;
+               num_elems = _bt_sort_array_elements(cur, sortprocp, reverse,
                                                                                         elem_values, num_nonnulls);
  
+               if (origarrayatt == cur->sk_attno)
+               {
+                       BTArrayKeyInfo *orig = &so->arrayKeys[origarraykey];
+
+                       /*
+                        * This array scan key is redundant with a previous equality
+                        * operator array scan key.  Merge the two arrays together to
+                        * eliminate contradictory non-intersecting elements (or try to).
+                        *
+                        * We merge this next array back into attribute's original array.
+                        */
+                       Assert(arrayKeyData[orig->scan_key].sk_attno == cur->sk_attno);
+                       Assert(arrayKeyData[orig->scan_key].sk_collation ==
+                                  cur->sk_collation);
+                       if (_bt_merge_arrays(scan, cur, sortprocp, reverse,
+                                                                origelemtype, elemtype,
+                                                                orig->elem_values, &orig->num_elems,
+                                                                elem_values, num_elems))
+                       {
+                               /* Successfully eliminated this array */
+                               pfree(elem_values);
+
+
author	Peter Geoghegan <pg@bowt.ie>
	Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
committer	Peter Geoghegan <pg@bowt.ie>
	Sat, 6 Apr 2024 15:47:10 +0000 (11:47 -0400)
doc/src/sgml/indexam.sgml		patch \| blob \| blame \| history
doc/src/sgml/monitoring.sgml		patch \| blob \| blame \| history
src/backend/access/index/indexam.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtree.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtsearch.c		patch \| blob \| blame \| history
src/backend/access/nbtree/nbtutils.c		patch \| blob \| blame \| history
src/backend/executor/nodeIndexonlyscan.c		patch \| blob \| blame \| history
src/backend/executor/nodeIndexscan.c		patch \| blob \| blame \| history
src/backend/optimizer/path/indxpath.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/access/amapi.h		patch \| blob \| blame \| history
src/include/access/genam.h		patch \| blob \| blame \| history
src/include/access/nbtree.h		patch \| blob \| blame \| history
src/include/utils/selfuncs.h		patch \| blob \| blame \| history
src/test/regress/expected/btree_index.out		patch \| blob \| blame \| history
src/test/regress/expected/create_index.out		patch \| blob \| blame \| history
src/test/regress/expected/join.out		patch \| blob \| blame \| history
src/test/regress/expected/select_parallel.out		patch \| blob \| blame \| history
src/test/regress/sql/btree_index.sql		patch \| blob \| blame \| history
src/test/regress/sql/create_index.sql		patch \| blob \| blame \| history
src/test/regress/sql/select_parallel.sql		patch \| blob \| blame \| history
src/tools/pgindent/typedefs.list		patch \| blob \| blame \| history